12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808 |
- from dl.common.Utils import findAllIndex
- from dl.interface.Entitys import PREM
- import re
- import copy
- import math
def getTheRole(entity,role_list):
    '''
    @summary: find the position of the first role that contains the entity
    @param:
        entity: entity name to look for
        role_list: list of roles; every element only needs to support ``in``
    @return: index of the first element of role_list that contains entity,
             or None when no element contains it
    '''
    hits = (position for position, role in enumerate(role_list) if entity in role)
    return next(hits, None)
# Maps the string form of an entity label to the role name used in the results.
dict_role_id = dict(zip(
    ["0", "1", "2", "3", "4"],
    ["tenderee", "agency", "win_tenderer", "second_tenderer", "third_tenderer"],
))
def getPackage(packageList,sentence_index,begin_index):
    '''
    @summary: decide which package a company entity belongs to
    @param:
        packageList: package mentions of the article; every item is indexed as
                     [0] package number, [1] sentence index, [2] offset inside the
                     sentence (compared with begin_index), [4] flag telling whether
                     the mention sits next to a colon
        sentence_index: index of the sentence that contains the entity
        begin_index: start offset of the entity inside its sentence
    @return: package number the entity belongs to, or None when neither the
             entity's sentence nor any earlier sentence mentions a package
    '''
    if len(packageList)==0:
        return None
    before_index = None   # last mention located in an earlier sentence
    after_index = None    # first mention located in a later sentence
    equal_index = None    # first mention located in the entity's own sentence
    for pack_index, package in enumerate(packageList):
        pack_sentence = package[1]
        if pack_sentence>sentence_index and after_index is None:
            after_index = pack_index
        if pack_sentence<sentence_index:
            before_index = pack_index
        if pack_sentence==sentence_index and equal_index is None:
            equal_index = pack_index
    # no package in the current sentence nor in any sentence before it
    if before_index is None and equal_index is None:
        return None
    end_index = len(packageList) if after_index is None else after_index
    start_index = max(-1 if before_index is None else before_index,
                      -1 if equal_index is None else equal_index)
    # exactly one candidate mention: nothing to disambiguate
    if end_index-start_index==1:
        return packageList[end_index-1][0]
    for i in range(start_index,end_index):
        # first mention that starts after the entity
        if packageList[i][2]>int(begin_index):
            if i==0:
                # There is no previous mention; indexing i-1 here would silently
                # wrap around to the LAST package of the article.
                return packageList[i][0]
            # A colon-style mention on either side means the entity still belongs
            # to the previous package; otherwise it belongs to the one that follows.
            if packageList[i-1][4] or packageList[i][4]:
                return packageList[i-1][0]
            return packageList[i][0]
    # the entity comes after every candidate mention: take the last one
    return packageList[end_index-1][0]
# Build the legal (conflict-free) role combinations
def get_legal_comba(list_entity,dict_role_combination):
    '''
    @summary: enumerate the conflict-free role assignments of every package and
              combine them across packages
    @param:
        list_entity: entities of the article; only forwarded to getSumExpectation
                     when the number of combinations has to be reduced
        dict_role_combination: {package name: {role label: set of candidate entity
                               texts}}; the empty string "" stands for "role not filled"
    @return: list of dicts keyed "<package>$$<role label>" with the entity text as
             value. A combination in which one entity appears both in the "Project"
             package and in another package is returned as two variants, each
             keeping that entity on one side only.
    '''
    import itertools  # function-scope import so the block does not depend on a file-level change

    def package_solutions(role_candidates):
        # Every assignment for one package. A candidate is left out of the
        # assignment when it clashes with a role that was filled earlier.
        if len(role_candidates)==0:
            return []
        role_keys = list(role_candidates.keys())
        solutions = []
        for picks in itertools.product(*(role_candidates[role_key] for role_key in role_keys)):
            solution = {}
            for role_key, candidate in zip(role_keys, picks):
                if candidate=="":
                    continue
                if str(role_key) in ["0","1"]:
                    # tenderee and agency may be the same organisation, so only
                    # roles other than "0"/"1" can clash with them
                    clash = any(filled_role not in ["0","1"] and filled_text==candidate
                                for filled_role, filled_text in solution.items())
                else:
                    clash = any(filled_text==candidate for filled_text in solution.values())
                if not clash:
                    solution[role_key] = candidate
            solutions.append(solution)
        return solutions

    def drop_proper_subsets(solutions):
        # Remove assignments that are strictly contained in another assignment.
        pair_sets = [set(solution.items()) for solution in solutions]
        if len(pair_sets)>1000:
            # too many candidates for the quadratic comparison: keep them all
            return solutions
        return [solution for solution, pairs in zip(solutions, pair_sets)
                if not any(pairs<other for other in pair_sets)]

    def cross_packages(per_package):
        # Cartesian product of the per-package assignments, with prefixed keys.
        merged = []
        for pack_key, solutions in per_package.items():
            prefixed = [dict((pack_key+"$$"+role_key, text) for role_key, text in solution.items())
                        for solution in solutions]
            if len(merged)==0:
                merged = prefixed
            else:
                merged = [dict(left, **right) for left in merged for right in prefixed]
        return merged

    # resolve every package on its own
    _dict_legal_combination = {}
    for packageName in dict_role_combination.keys():
        _dict_legal_combination[packageName] = drop_proper_subsets(
            package_solutions(dict_role_combination[packageName]))

    # number of cross-package combinations
    _comba_count = 1
    for solutions in _dict_legal_combination.values():
        _comba_count *= len(solutions)
    # when it is too large keep only the most probable assignment of every package
    if _comba_count>250:
        new_dict_legal_combination = dict()
        for _key_pack, solutions in _dict_legal_combination.items():
            # -inf instead of a fixed sentinel: a fixed lower bound could leave the
            # selection empty and break the cross product below
            best_prob = float("-inf")
            best_solution = None
            for solution in solutions:
                prefixed = dict((str(_key_pack)+"$$"+str(_key), text) for _key, text in solution.items())
                _prob = getSumExpectation(list_entity, prefixed)
                if _prob>best_prob:
                    best_prob = _prob
                    best_solution = [solution]
            new_dict_legal_combination[_key_pack] = best_solution
        _dict_legal_combination = new_dict_legal_combination

    _list_final_comba = cross_packages(_dict_legal_combination)

    # Only the "Project" package (tenderee / agency) can conflict with other packages:
    # when one entity shows up on both sides, emit one variant per side.
    _list_real_comba = []
    for dict_item in _list_final_comba:
        set_project = set()
        set_other = set()
        for _key, text in dict_item.items():
            if _key.split("$$")[0]=="Project":
                set_project.add(text)
            else:
                set_other.add(text)
        set_common = set_project&set_other
        if len(set_common)==0:
            _list_real_comba.append(dict_item)
            continue
        dict_project = {}
        dict_not_project = {}
        for _key, text in dict_item.items():
            if text not in set_common:
                dict_project[_key] = text
                dict_not_project[_key] = text
            elif str(_key.split("$$")[0])=="Project":
                dict_project[_key] = text
            else:
                dict_not_project[_key] = text
        _list_real_comba.append(dict_project)
        _list_real_comba.append(dict_not_project)
    return _list_real_comba
# Compute the summed expectation of one combination
def getSumExpectation(list_entity,combination,on_value=0.5):
    '''
    @summary: score a role combination against the model predictions. For every
              distinct "<package>$$<label><entity text>" only the largest probability
              counts; its square is added when the combination assigns that entity to
              that package/role and subtracted otherwise.
    @param:
        list_entity: entities of the article; only entity_type 'org' / 'company' are
                     scored. They must expose values, label, entity_text and, once they
                     pass the threshold, a string packageName
        combination: dict keyed "<package>$$<label>" with the entity text as value
        on_value: probability threshold; entities below it and entities labelled "5"
                  are ignored
    @return: the summed score (0 when nothing is scored)
    '''
    dict_entity_prob = {}
    for entity in list_entity:
        if entity.entity_type not in ['org','company']:
            continue
        label = str(entity.label)
        role_prob = float(entity.values[int(entity.label)])
        if not role_prob>=on_value or label=="5":
            continue
        # Built only after the filter: entities that never passed the threshold
        # may not carry a usable packageName.
        _key = entity.packageName+"$$"+label
        _key_prob = _key+entity.entity_text
        if _key in combination and combination[_key]==entity.entity_text:
            # matched: remember the largest probability
            if _key_prob not in dict_entity_prob or dict_entity_prob[_key_prob]<role_prob:
                dict_entity_prob[_key_prob] = role_prob
        else:
            # not matched: remember the most negative value
            if _key_prob not in dict_entity_prob or dict_entity_prob[_key_prob]>-role_prob:
                dict_entity_prob[_key_prob] = -role_prob
    expect = 0
    for signed_prob in dict_entity_prob.values():
        symbol = 1 if signed_prob>0 else -1
        expect += symbol*math.pow(signed_prob,2)
    return expect
def getRoleList(list_sentence,list_entity,on_value = 0.5):
    '''
    @summary: collect the candidate entities of every package/role, build all
              non-contradicting role combinations and return the one with the
              largest summed expectation
    @param:
        list_sentence: all sentences of the article
        list_entity: all entities of the article; every 'org' / 'company' entity that
                     passes the threshold gets packageName, packageCode and roleName
                     assigned as a side effect
        on_value: probability threshold
    @return: None when getPackagesFromArticle returns None, otherwise the tuple
             (RoleList, RoleSet, PackageList, PackageSet) where RoleList holds one PREM
             per assigned role and RoleSet the entity texts of those roles
    '''
    pack = getPackagesFromArticle(list_sentence)
    if pack is None:
        return None
    PackageList,PackageSet,dict_PackageCode = pack

    # {package name: {role label: set of candidate entity texts}}
    dict_role_combination = {}
    for entity in list_entity:
        if entity.entity_type not in ['org','company']:
            continue
        label = str(entity.label)
        role_prob = float(entity.values[int(entity.label)])
        if not role_prob>=on_value or label=="5":
            continue
        # tenderee / agency always live in the article-level "Project" package; other
        # roles are mapped to a package only when the article has more than one
        if label in ["0","1"] or len(PackageSet)<=1:
            packageName = "Project"
        else:
            packageName = getPackage(PackageList,entity.sentence_index,entity.end_index)
            if packageName is None:
                packageName = "Project"
        entity.packageName = packageName
        entity.packageCode = dict_PackageCode.get(packageName, "")
        entity.roleName = dict_role_id.get(label)

        role_candidates = dict_role_combination.get(packageName)
        if role_candidates is None:
            # first entity of this package: every role starts with the "not filled" option
            role_candidates = dict((str(_roleId), set([""])) for _roleId in [0,1,2,3,4])
            dict_role_combination[packageName] = role_candidates
            role_candidates[label].add(entity.entity_text)
        elif label in role_candidates:
            role_candidates[label].add(entity.entity_text)
        else:
            role_candidates[label] = set([entity.entity_text])

    list_real_comba = get_legal_comba(list_entity,dict_role_combination)

    # Pick the combination with the largest expectation (first one wins ties).
    # Starting from -inf keeps the true maximum even when every score is very low.
    max_index = 0
    max_expect = float("-inf")
    for _index, item_combination in enumerate(list_real_comba):
        expect = getSumExpectation(list_entity, item_combination)
        if expect>max_expect:
            max_index = _index
            max_expect = expect

    RoleList = []
    RoleSet = set()
    if len(list_real_comba)>0:
        for _key, entity_text in list_real_comba[max_index].items():
            key_parts = _key.split("$$")
            packageName = key_parts[0]
            role_name = dict_role_id.get(str(key_parts[1]))
            packagecode = dict_PackageCode.get(packageName, "")
            RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
            RoleSet.add(entity_text)
    return RoleList,RoleSet,PackageList,PackageSet
def getPackagesFromArticle(list_sentence):
    '''
    @summary: locate every package / bid-section mention of an article
    @param:
        list_sentence: sentences of the article; each one must expose sentence_text
                       and tokens
    @return: None for an empty sentence list, otherwise the tuple
             (PackageList, PackageSet, dict_packageCode):
             PackageList: one item per mention, laid out as [package number,
                          sentence index, token offset, character offset,
                          flag: a colon is the nearest punctuation on the left or right];
                          ordered by sentence, then by token offset
             PackageSet: set of all package numbers
             dict_packageCode: {package number: code found next to a mention}
    '''
    if len(list_sentence)==0:
        return None
    PackageList = []
    PackageSet = set()
    dict_packageCode = dict()
    # Raw strings keep the regex escapes (\(, \s, \d) away from Python's own
    # string-escape processing; the pattern text itself is unchanged.
    package_name_pattern = re.compile(r"((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
    package_N_name_pattern = re.compile(r"(分?包|标段|标|包|包组|项目)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十]){1,2},{1}")
    package_number_pattern = re.compile(r"((包|标[段号的包]|分?包|包组|项目)编?号?[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
    other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # fallback pattern for bid sections
    number_pattern = re.compile(r"[0-9A-Za-z一二三四五六七八九十]{1,4}")
    package_code_pattern = re.compile(r"(?:编号[::]?\s*)([-\dA-Za-z]+)")
    pattern_punctuation = r"[::()\(\),,。;;]"

    def changeIndexFromWordToWords(tokens,word_index):
        '''
        @summary: convert a character offset into the index of the token covering it
        '''
        before_index = 0
        after_index = 0
        for i in range(len(tokens)):
            after_index = after_index+len(tokens[i])
            if before_index<=word_index and after_index>=word_index:
                return i
            before_index = after_index
        # offset beyond the tokenised text: fall back to the last token instead of
        # returning None, which the callers cannot compare or sort
        return max(len(tokens)-1,0)

    def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
        '''
        @summary: extract the code closest to a package mention
        @param:
            tokens: tokens of the sentence that contains the mention
            word_index: character offset of the mention
            size: number of tokens taken on each side
            pattern: regex whose first group is the code
        @return: type:string, the closest code, or None when there is none
        '''
        index = changeIndexFromWordToWords(tokens,word_index)
        # clamp the window on both sides; near the sentence start the left context
        # must be kept, not dropped
        begin = max(index-size,0)
        end = min(index+size,len(tokens))
        # phrase made of the surrounding tokens
        text = "".join(tokens[begin:end])
        # character offset of the mention inside that phrase
        new_word_index = word_index-len("".join(tokens[:begin]))
        min_distance = len(text)
        packageCode = None
        for the_iter in re.finditer(pattern,text):
            # keep the match whose nearer edge is closest to the mention
            distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
            if distance<min_distance:
                min_distance = distance
                packageCode = the_iter.group(1)
        return packageCode

    def recordPackage(items,tokens,sentence_idx,package_number,char_index):
        # register one mention and, when available, the code found next to it
        items.append([package_number,sentence_idx,changeIndexFromWordToWords(tokens,char_index),char_index])
        code = extractPackageCode(tokens, char_index)
        if code is not None:
            dict_packageCode[package_number] = code
        PackageSet.add(package_number)

    # collect (package name, package number) pairs from sentences that introduce
    # exactly one name and exactly one numbered label
    package_names = []
    for sentence in list_sentence:
        content = sentence.sentence_text
        names = re.findall(package_name_pattern,content)
        name_group = -1   # the name is the last group of package_name_pattern
        if names == []:
            names = re.findall(other_package_pattern, content)
            name_group = 1   # here the last group is the terminator, the name is group 2
        N_names = re.findall(package_N_name_pattern,content)
        if len(names)==1 and len(N_names)==1:
            package_name = names[0][name_group]
            if package_name:   # the fallback pattern may capture an empty name
                package_names.append([package_name,N_names[0][-1]])

    for i, sentence in enumerate(list_sentence):
        PackageList_item = []
        content = sentence.sentence_text
        tokens = sentence.tokens
        # mentions by package name
        for name in package_names:
            for index in findAllIndex(name[0],content):
                temp_package_number = re.findall(number_pattern,name[1])[0]
                recordPackage(PackageList_item,tokens,i,temp_package_number,index)
        # mentions by package number
        for number_match in re.finditer(package_number_pattern,content):
            temp_package_number = re.findall(number_pattern,number_match.group())[0]
            recordPackage(PackageList_item,tokens,i,temp_package_number,number_match.start())
        # only when nothing was found so far: fall back to the name-style pattern
        if PackageList_item == []:
            for name_match in re.finditer(other_package_pattern,content):
                recordPackage(PackageList_item,tokens,i,name_match.group(2),name_match.start())
        PackageList_item.sort(key=lambda x:x[2])
        PackageList.extend(PackageList_item)

    # flag the mentions whose nearest punctuation within 30 characters is a colon
    for package in PackageList:
        sentence_text = list_sentence[package[1]].sentence_text
        char_index = package[3]
        # max(..., 0): a negative start would wrap around to the end of the sentence
        left_str = sentence_text[max(char_index-30,0):char_index]
        right_str = sentence_text[char_index:char_index+30]
        _left_find = re.findall(pattern_punctuation,left_str)
        _right_find = re.findall(pattern_punctuation,right_str)
        _flag = False
        if len(_left_find)>0 and _left_find[-1] in [":",":"]:
            _flag = True
        if len(_right_find)>0 and _right_find[0] in [":",":"]:
            _flag = True
        package.append(_flag)
    return PackageList,PackageSet,dict_packageCode
def findAttributeAfterEntity(roleList,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4):
    '''
    @summary: attach money and contact-person attributes to the roles of one article
    @param:
        roleList: roles of the article
        roleSet: company names of all roles of the article
        PackageList: package information of the article
        PackageSet: names of all packages of the article
        list_entity: all entities of the article after model processing
        on_value: threshold of the money model
        on_value_person: threshold of the contact-person model
        sentence_len: maximum sentence distance between a company and its attribute
    @return: list of the truthy results of getString(roleList) called on every role
    '''
    # NOTE(review): the block nesting of this function was reconstructed from a listing
    # without indentation; the places where more than one nesting is syntactically
    # possible are marked below and should be confirmed against the repository.

    # set money on the roles matching package name and role id, when the probability is higher
    def addMoneyByRoleid(RoleList,packageName,roleid,money,money_prob):
        for i in range(len(RoleList)):
            if RoleList[i].packageName==packageName and RoleList[i].role_name==dict_role_id.get(str(roleid)):
                if money_prob>RoleList[i].money_prob:
                    RoleList[i].money = money
                    RoleList[i].money_prob = money_prob
        return RoleList
    # set money on the roles matching package name and entity text, when the probability is higher
    def addMoneyByEntity(RoleList,packageName,entity,money,money_prob):
        for i in range(len(RoleList)):
            if RoleList[i].packageName==packageName and RoleList[i].entity_text==entity:
                if money_prob>RoleList[i].money_prob:
                    RoleList[i].money = money
                    RoleList[i].money_prob = money_prob
        return RoleList
    # role name of the first role with this entity text (None when there is none)
    def getRoleWithText(roleList,entity_text):
        for i in range(len(roleList)):
            if roleList[i].entity_text==entity_text:
                return roleList[i].role_name
    # True when the entity or one of its linked entities is in RoleSet (None otherwise)
    def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
        _list_entitys = [entity]+entity.linked_entitys
        for _entity in _list_entitys:
            if _entity.entity_text in RoleSet:
                return True
    p_entity = 0
    set_tenderer_role = set()
    set_tenderer_money = set()
    # walk through all entities
    while(p_entity<len(list_entity)):
        entity = list_entity[p_entity]
        if entity.entity_type=="money":
            if entity.values[entity.label]>=on_value:
                # label "1" money is collected for the single-winner shortcut at the end
                if str(entity.label)=="1":
                    set_tenderer_money.add(float(entity.entity_text))
                # label "0" money goes to the role with id "0" of the "Project" package
                if str(entity.label)=="0":
                    packageName = "Project"
                    addMoneyByRoleid(roleList, packageName, "0", entity.entity_text, entity.values[entity.label])
        if entity.entity_type=="person":
            if entity.values[entity.label]>=on_value_person:
                # person label "1" is linked to every tenderee, label "2" to every agency
                if str(entity.label)=="1":
                    for i in range(len(roleList)):
                        if roleList[i].role_name=="tenderee":
                            roleList[i].linklist.append((entity.entity_text,entity.person_phone))
                elif str(entity.label)=="2":
                    for i in range(len(roleList)):
                        if roleList[i].role_name=="agency":
                            roleList[i].linklist.append((entity.entity_text,entity.person_phone))
        # if the entity belongs to the role set, look for its attributes in the following entities
        if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
            p_entity += 1
            # scan forward for matching attributes
            while(p_entity<len(list_entity)):
                entity_after = list_entity[p_entity]
                # too far away: step back so the outer loop revisits this entity
                if entity_after.sentence_index-entity.sentence_index>=sentence_len:
                    p_entity -= 1
                    break
                # stop at the next company entity
                if entity_after.entity_type in ['org','company']:
                    p_entity -= 1
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="money":
                        if entity_after.values[entity_after.label]>=on_value:
                            if str(entity_after.label)=="0":
                                packageName = "Project"
                                addMoneyByRoleid(roleList, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
                            elif str(entity_after.label)=="1":
                                _list_entitys = [entity]+entity.linked_entitys
                                for _entity in _list_entitys:
                                    if getRoleWithText(roleList, _entity.entity_text) in ['tenderee','agency']:
                                        packageName_entity = "Project"
                                    else:
                                        # NOTE(review): the else below is assumed to pair with
                                        # `if len(PackageSet)>1` (mirroring getRoleList) — confirm
                                        if len(PackageSet)>1:
                                            packageName_entity = getPackage(PackageList,_entity.sentence_index,_entity.begin_index)
                                            if packageName_entity is None:
                                                continue
                                        else:
                                            packageName_entity = "Project"
                                    # only entities labelled "2", "3" or "4" receive this money
                                    if str(_entity.label) in ["2","3","4"]:
                                        addMoneyByEntity(roleList, packageName_entity, _entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
                    '''
                    if entity_after.entity_type=="person":
                        if entity_after.values[entity_after.label]>=on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="tenderee":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="2":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="agency":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="3":
                                _list_entitys = [entity]+entity.linked_entitys
                                for _entity in _list_entitys:
                                    for i in range(len(roleList)):
                                        if roleList[i].entity_text==_entity.entity_text:
                                            if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                                break
                                            roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                    '''
                p_entity += 1
        p_entity += 1
    ''''''
    # Contact linking, as described by the original author:
    # 1) if the first company entity is the tenderee, check whether the next entity is the agency;
    #    if so the contacts are linked with an offset. 2) look for contacts later in the same
    #    sentence. 3) when nothing could be linked, look for contacts in the whole article.
    temp_ent_list = [] # temporary list: (text, label) of label 0/1 role entities and (text, phone) of label "3" persons
    other_person = [] # texts of label "3" persons above the threshold
    link_person = [] # texts of the persons that were appended to some role's linklist
    other_ent = [] # texts of role entities whose label is 0 or 1
    link_ent = [] # texts of the tenderee / agency roles that received a contact
    found_person = False # not used in this function
    ent_list = []
    for entity in list_entity:
        if entity.entity_type in ['org','company','person']:
            ent_list.append(entity)
    #for list_index in range(len(ent_list)):
        #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
            #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
            #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
    for index in range(len(ent_list)):
        entity = ent_list[index]
        if entity.entity_type=="person":
            # strictly greater here, whereas the first pass above uses >=
            if entity.values[entity.label]>on_value_person:
                if str(entity.label)=="1":
                    for i in range(len(roleList)):
                        if roleList[i].role_name=="tenderee":
                            roleList[i].linklist.append((entity.entity_text,entity.person_phone))
                            link_person.append(entity.entity_text)
                            link_ent.append(roleList[i].entity_text)
                elif str(entity.label)=="2":
                    for i in range(len(roleList)):
                        if roleList[i].role_name=="agency":
                            roleList[i].linklist.append((entity.entity_text,entity.person_phone))
                            link_person.append(entity.entity_text)
                            link_ent.append(roleList[i].entity_text)
                elif str(entity.label)=="3":
                    #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
                    other_person.append(entity.entity_text)
                    temp_ent_list.append((entity.entity_text,entity.person_phone))
        #if entity.entity_text in roleSet:
        if entity.entity_text in set([ent.entity_text for ent in roleList]):
            if entity.label in [0,1]:
                other_ent.append(entity.entity_text)
                temp_ent_list.append((entity.entity_text, entity.label))
            for behind_index in range(index+1, len(ent_list)):
                entity_after = ent_list[behind_index]
                if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # only look for contacts inside the same sentence
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="person":
                        if entity_after.values[entity_after.label]>on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="tenderee":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
                                        link_ent.append(roleList[i].entity_text)
                            elif str(entity_after.label)=="2":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="agency":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
                                        link_ent.append(roleList[i].entity_text)
                            elif str(entity_after.label)=="3":
                                # label "3" persons are linked to the role of the entity they follow
                                for i in range(len(roleList)):
                                    if roleList[i].entity_text==entity.entity_text:
                                        #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                            #break
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
    # label "3" persons and label 0/1 entities that are still unlinked
    not_link_person = [person for person in other_person if person not in link_person]
    not_link_ent = [ent for ent in other_ent if ent not in link_ent]
    if len(not_link_person) > 0 and len(not_link_ent) > 0 :
        item = temp_ent_list
        # pattern "label 0 entity, label 1 entity, person, person": swap the second and third
        # element so that each entity is directly followed by one person
        for i in range(len(item)):
            if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
                if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
                    item[i+1], item[i+2] = item[i+2], item[i+1]
        # walk backwards and give every still-unlinked entity the person that follows it
        for i in range(len(item)-1, -1, -1):
            if item[i][0] in not_link_ent:
                for role in roleList:
                    if role.entity_text == item[i][0] and len(role.linklist) < 1:
                        for j in range(i+1, len(item)):
                            # NOTE(review): the else below is assumed to pair with this `if`, so only
                            # the element right after the entity is examined; it could also be a
                            # for-else — confirm against the repository
                            if item[j][0] in not_link_person:
                                role.linklist.append(item[j])
                                break
                            else:
                                break
    for i in range(len(roleList)):
        if roleList[i].role_name=="win_tenderer":
            set_tenderer_role.add(roleList[i])
    # exactly one winner and exactly one label "1" amount: assign that amount to the winner
    if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
        list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
    # Original author's notes: remove organisations that have several roles; remove duplicate
    # persons; probabilities are not returned. Presumably done by getString — TODO confirm.
    final_roleList = []
    for i in range(len(roleList)):
        item = roleList[i].getString(roleList)
        if item:
            final_roleList.append(item)
    return final_roleList
def getPackageRoleMoney(list_sentence,list_entity):
    '''
    @summary: run role extraction and attribute linking for one article
    @param:
        list_sentence: sentences of the article
        list_entity: entities of the article
    @return: the result of findAttributeAfterEntity for the roles found by
             getRoleList, or [] when getRoleList returns nothing
    '''
    role_info = getRoleList(list_sentence,list_entity)
    if role_info:
        roles, role_texts, packages, package_names = role_info
        return findAttributeAfterEntity(roles, role_texts, packages, package_names, list_entity)
    return []
def getPREMs(list_sentences,list_entitys,list_articles):
    '''
    @summary: run getPackageRoleMoney on every article
    @param:
        list_sentences: sentence lists, one per article
        list_entitys: entity lists, one per article
        list_articles: articles, one per sentence/entity list; each must expose ``id``
    @return: list of [article id, {"prem": result of getPackageRoleMoney}] in input
             order; zip() stops at the shortest of the three inputs
    '''
    prems = []
    for sentences, entities, article in zip(list_sentences,list_entitys,list_articles):
        prem = {"prem": getPackageRoleMoney(sentences, entities)}
        prems.append([article.id, prem])
    return prems
if __name__=="__main__":
    # The whole body consists of string literals that hold a disabled manual
    # experiment (database query + HTML dump); running the module executes nothing.
    '''
    conn = getConnection()
    cursor = conn.cursor()
    #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
    sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "

    result = []

    cursor.execute(sql)
    rows = cursor.fetchall()
    count = 0
    for row in rows:

    count += 1
    print(count)
    doc_id = row[0]

    roleList = getPackageRoleMoney(doc_id)
    result.append([doc_id,str(roleList),row[1]])
    ''''''
    with codecs.open("getAttribute.html","w",encoding="utf8") as f:
    f.write('<html><head>\
    <meta http-equiv="Content-Type"\
    content="text/html; charset=UTF-8">\
    </head>\
    <body bgcolor="#FFFFFF">\
    <table border="1">\
    <tr>\
    <td>doc_id</td>\
    <td>角色</td>\
    </tr>')
    for item in result:
    f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
    f.write("</table></body>")
    '''
|