from dl.common.Utils import findAllIndex from dl.interface.Entitys import PREM import re import copy import math def getTheRole(entity,role_list): ''' @summary:根据实体名称拿到index @param: entity:实体名称 role_list:角色list @return:该实体所在下标 ''' for role_index in range(len(role_list)): if entity in role_list[role_index]: return role_index return None dict_role_id = {"0":"tenderee", "1":"agency", "2":"win_tenderer", "3":"second_tenderer", "4":"third_tenderer"} def getPackage(packageList,sentence_index,begin_index): ''' @param: packageList:文章的包的信息 sentence_index:实体所在的句子 begin_index:实体所在句子的起始位置 @return:公司实体所属的包 ''' if len(packageList)==0: return None before_index = None after_index = None equal_index = None equal_count = 0 for pack_index in range(len(packageList)): if packageList[pack_index][1]>sentence_index and after_index is None: after_index = pack_index if packageList[pack_index][1]int(begin_index): if packageList[i-1][4]: return packageList[i-1][0] else: if packageList[i][4]: return packageList[i-1][0] else: return packageList[i][0] return packageList[end_index-1][0] return None #生成合法的组合 def get_legal_comba(list_entity,dict_role_combination): #拿到一个包中所有合法的组合 def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution): last_layer = False #若是空组合则放回空 if len(_dict_legal_combination.keys())==0: return [] #递归到最后一层则修改状态 if len(_dict_legal_combination.keys())==1: last_layer = True #取一个角色开始进行遍历 _key_role = list(_dict_legal_combination.keys())[0] for item in _dict_legal_combination[_key_role]: copy_dict_one_selution = copy.copy(dict_one_selution) copy_dict_legal_combination = {} copy_set_legal_entity = copy.copy(set_legal_entity) #复制余下的所有角色,进行下一轮递归 for _key in _dict_legal_combination.keys(): if _key!=_key_role: copy_dict_legal_combination[_key] = _dict_legal_combination[_key] #修改为招标人和代理人可以为同一个 if item !="": _flag = True if str(_key_role) in ["0","1"]: for _key_flag in copy_dict_one_selution: if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item: _flag = False else: for _key_flag in copy_dict_one_selution: if copy_dict_one_selution[_key_flag]==item: _flag = False if _flag: copy_dict_one_selution[_key_role] = item ''' if item not in copy_set_legal_entity: if item !="": copy_dict_one_selution[_key_role] = item ''' copy_set_legal_entity.add(item) if last_layer: list_all_selution.append(copy_dict_one_selution) else: recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution) #递归匹配各个包的结果 def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution): last_layer = False if len(_dict_legal_combination.keys())==0: return [] if len(_dict_legal_combination.keys())==1: last_layer = True _key_pack = list(_dict_legal_combination.keys())[0] for item in _dict_legal_combination[_key_pack]: copy_dict_one_selution = copy.copy(dict_one_selution) copy_dict_legal_combination = {} for _key in _dict_legal_combination.keys(): if _key!=_key_pack: copy_dict_legal_combination[_key] = _dict_legal_combination[_key] for _key_role in item.keys(): copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role] if last_layer: list_all_selution.append(copy_dict_one_selution) else: recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution) return list_all_selution #循环获取所有包组合 def circle_pageages(_dict_legal_combination): list_all_selution = [] for _key_pack in _dict_legal_combination.keys(): list_key_selution = [] for item in _dict_legal_combination[_key_pack]: _dict = dict() for _key_role in item.keys(): _dict[_key_pack+"$$"+_key_role] = item[_key_role] list_key_selution.append(_dict) if len(list_all_selution)==0: list_all_selution = list_key_selution else: _list_all_selution = [] for item_1 in list_all_selution: for item_2 in list_key_selution: _list_all_selution.append(dict(item_1,**item_2)) list_all_selution = _list_all_selution return list_all_selution #拿到各个包解析之后的结果 _dict_legal_combination = {} for packageName in dict_role_combination.keys(): _list_all_selution = [] recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution) ''' print("===1") print(packageName) for item in _list_all_selution: print(item) print("===2") ''' #去除包含子集 list_all_selution_simple = [] _list_set_all_selution = [] for item_selution in _list_all_selution: item_set_selution = set() for _key in item_selution.keys(): item_set_selution.add((_key,item_selution[_key])) _list_set_all_selution.append(item_set_selution) if len(_list_set_all_selution)>1000: _dict_legal_combination[packageName] = _list_all_selution continue for i in range(len(_list_set_all_selution)): be_included = False for j in range(len(_list_set_all_selution)): if i!=j: if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]): be_included = True if not be_included: list_all_selution_simple.append(_list_all_selution[i]) _dict_legal_combination[packageName] = list_all_selution_simple _list_final_comba = [] #对各个包的结果进行排列组合 _comba_count = 1 for _key in _dict_legal_combination.keys(): _comba_count *= len(_dict_legal_combination[_key]) #如果过大,则每个包只取概率最大的那个 if _comba_count>250: new_dict_legal_combination = dict() for _key_pack in _dict_legal_combination.keys(): MAX_PROB = -1000 _MAX_PROB_COMBA = None for item in _dict_legal_combination[_key_pack]: _dict = dict() for _key in item.keys(): _dict[str(_key_pack)+"$$"+str(_key)] = item[_key] _prob = getSumExpectation(list_entity, _dict) if _prob>MAX_PROB: MAX_PROB = _prob _MAX_PROB_COMBA = [item] new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA _dict_legal_combination = new_dict_legal_combination #recursive_packages(_dict_legal_combination, {}, _list_final_comba) _list_final_comba = circle_pageages(_dict_legal_combination) #除了Project包(招标人和代理人),其他包是不会有冲突的 #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪 _list_real_comba = [] for dict_item in _list_final_comba: set_project = set() set_other = set() for _key in list(dict_item.keys()): if _key.split("$$")[0]=="Project": set_project.add(dict_item[_key]) else: set_other.add(dict_item[_key]) set_common = set_project&set_other if len(set_common)>0: dict_project = {} dict_not_project = {} for _key in list(dict_item.keys()): if dict_item[_key] in set_common: if str(_key.split("$$")[0])=="Project": dict_project[_key] = dict_item[_key] else: dict_not_project[_key] = dict_item[_key] else: dict_project[_key] = dict_item[_key] dict_not_project[_key] = dict_item[_key] _list_real_comba.append(dict_project) _list_real_comba.append(dict_not_project) else: _list_real_comba.append(dict_item) return _list_real_comba #计算合计期望 def getSumExpectation(list_entity,combination,on_value=0.5): ''' expect = 0 for entity in list_entity: if entity.entity_type in ['org','company']: values = entity.values role_prob = float(values[int(entity.label)]) _key = entity.packageName+"$$"+str(entity.label) if role_prob>on_value and str(entity.label)!="5": if _key in combination.keys() and combination[_key]==entity.entity_text: expect += math.pow(role_prob,4) else: expect -= math.pow(role_prob,4) ''' #修改为同一个实体只取对应包-角色的最大的概率值 expect = 0 dict_entity_prob = {} for entity in list_entity: if entity.entity_type in ['org','company']: values = entity.values role_prob = float(values[int(entity.label)]) _key = entity.packageName+"$$"+str(entity.label) if role_prob>=on_value and str(entity.label)!="5": if _key in combination.keys() and combination[_key]==entity.entity_text: _key_prob = _key+entity.entity_text if _key_prob in dict_entity_prob.keys(): if dict_entity_prob[_key_prob]-role_prob: dict_entity_prob[_key_prob] = -role_prob else: dict_entity_prob[_key_prob] = -role_prob for _key in dict_entity_prob.keys(): symbol = 1 if dict_entity_prob[_key]>0 else -1 expect += symbol*math.pow(dict_entity_prob[_key],2) return expect def getRoleList(list_sentence,list_entity,on_value = 0.5): ''' @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回 @param: list_sentence:文章所有的sentence list_entity:文章所有的实体 on_value:概率阈值 @return:文章的角色list ''' pack = getPackagesFromArticle(list_sentence) if pack is None: return None PackageList,PackageSet,dict_PackageCode = pack #拿到所有可能的情况 dict_role_combination = {} #拿到各个实体的packageName,packageCode for entity in list_entity: if entity.entity_type in ['org','company']: values = entity.values role_prob = float(values[int(entity.label)]) if role_prob>=on_value and str(entity.label)!="5": if str(entity.label) in ["0","1"]: packageName = "Project" else: if len(PackageSet)>1: packageName = getPackage(PackageList,entity.sentence_index,entity.end_index) if packageName is None: #continue packageName = "Project" else: packageName = "Project" find_flag = False role_name = dict_role_id.get(str(entity.label)) if packageName in dict_PackageCode.keys(): packageCode = dict_PackageCode[packageName] else: packageCode = "" entity.packageName = packageName entity.packageCode = packageCode entity.roleName = role_name if entity.packageName in dict_role_combination.keys(): if str(entity.label) in dict_role_combination[entity.packageName].keys(): dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text) else: dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text]) else: dict_role_combination[entity.packageName] = {} #初始化空值 roleIds = [0,1,2,3,4] for _roleId in roleIds: dict_role_combination[entity.packageName][str(_roleId)] = set([""]) dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text) list_real_comba = get_legal_comba(list_entity,dict_role_combination) #拿到最大期望值的组合 max_index = 0 max_expect = -100 _index = 0 for item_combination in list_real_comba: expect = getSumExpectation(list_entity, item_combination) if expect>max_expect: max_index = _index max_expect = expect _index += 1 RoleList = [] RoleSet = set() if len(list_real_comba)>0: for _key in list_real_comba[max_index].keys(): packageName = _key.split("$$")[0] label = _key.split("$$")[1] role_name = dict_role_id.get(str(label)) entity_text = list_real_comba[max_index][_key] if packageName in dict_PackageCode.keys(): packagecode = dict_PackageCode.get(packageName) else: packagecode = "" RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[])) RoleSet.add(entity_text) return RoleList,RoleSet,PackageList,PackageSet def getPackagesFromArticle(list_sentence): ''' @param: list_sentence:文章的句子list @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息 ''' if len(list_sentence)==0: return None PackageList = [] PackageSet = set() dict_packageCode = dict() package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}") package_N_name_pattern = re.compile("(分?包|标段|标|包|包组|项目)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十]){1,2},{1}") package_number_pattern = re.compile("((包|标[段号的包]|分?包|包组|项目)编?号?[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))") other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段 number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十]{1,4}") package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z]+)") def changeIndexFromWordToWords(tokens,word_index): ''' @summary:转换某个字的字偏移为词偏移 ''' before_index = 0 after_index = 0 for i in range(len(tokens)): after_index = after_index+len(tokens[i]) if before_index<=word_index and after_index>=word_index: return i before_index = after_index package_names = [] def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern): ''' @summary:抽取包附近的标段号 @param: tokens:包所在句子的分词 word_index:包所在字偏移 size:左右各取多少个词 pattern:提取标段号的正则 @return: type:string,meaning:标段号 ''' index = changeIndexFromWordToWords(tokens,word_index) if indexlen(tokens): end = len(tokens) else: end = index+size #拿到左右两边的词语组成短语 text = "".join(tokens[begin:end]) #在短语中的字偏移 new_word_index = word_index-len("".join(tokens[:begin])) min_distance = len(text) packageCode = None for the_iter in re.finditer(pattern,text): #算出最小距离 distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])]) if distance0 and _left_find[-1] in [":",":"]: _flag = True if len(_right_find)>0 and _right_find[0] in [":",":"]: _flag = True PackageList[j].append(_flag) return PackageList,PackageSet,dict_packageCode def findAttributeAfterEntity(roleList,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4): ''' @param: roleList:文章角色list roleSet:文章所有角色的公司名称 PackageList:文章的包信息 PackageSet:文章所有包的名称 list_entity:文章所有经过模型处理的实体 on_value:金额模型的阈值 on_value_person:联系人模型的阈值 sentence_len:公司和属性间隔句子的最大长度 @return:添加了属性信息的角色list ''' #根据roleid添加金额到rolelist中 def addMoneyByRoleid(RoleList,packageName,roleid,money,money_prob): for i in range(len(RoleList)): if RoleList[i].packageName==packageName and RoleList[i].role_name==dict_role_id.get(str(roleid)): if money_prob>RoleList[i].money_prob: RoleList[i].money = money RoleList[i].money_prob = money_prob return RoleList #根据实体名称添加金额到rolelist中 def addMoneyByEntity(RoleList,packageName,entity,money,money_prob): for i in range(len(RoleList)): if RoleList[i].packageName==packageName and RoleList[i].entity_text==entity: if money_prob>RoleList[i].money_prob: RoleList[i].money = money RoleList[i].money_prob = money_prob return RoleList #根据实体名称得到角色 def getRoleWithText(roleList,entity_text): for i in range(len(roleList)): if roleList[i].entity_text==entity_text: return roleList[i].role_name def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet): _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: if _entity.entity_text in RoleSet: return True p_entity = 0 set_tenderer_role = set() set_tenderer_money = set() #遍历所有实体 while(p_entity=on_value: if str(entity.label)=="1": set_tenderer_money.add(float(entity.entity_text)) if str(entity.label)=="0": packageName = "Project" addMoneyByRoleid(roleList, packageName, "0", entity.entity_text, entity.values[entity.label]) if entity.entity_type=="person": if entity.values[entity.label]>=on_value_person: if str(entity.label)=="1": for i in range(len(roleList)): if roleList[i].role_name=="tenderee": roleList[i].linklist.append((entity.entity_text,entity.person_phone)) elif str(entity.label)=="2": for i in range(len(roleList)): if roleList[i].role_name=="agency": roleList[i].linklist.append((entity.entity_text,entity.person_phone)) #如果实体属于角色集合,则往后找属性 if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet): p_entity += 1 #循环查找符合的属性 while(p_entity=sentence_len: p_entity -= 1 break #若是遇到公司实体,则跳出循环 if entity_after.entity_type in ['org','company']: p_entity -= 1 break if entity_after.values is not None: if entity_after.entity_type=="money": if entity_after.values[entity_after.label]>=on_value: if str(entity_after.label)=="0": packageName = "Project" addMoneyByRoleid(roleList, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label]) elif str(entity_after.label)=="1": _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: if getRoleWithText(roleList, _entity.entity_text) in ['tenderee','agency']: packageName_entity = "Project" else: if len(PackageSet)>1: packageName_entity = getPackage(PackageList,_entity.sentence_index,_entity.begin_index) if packageName_entity is None: continue else: packageName_entity = "Project" if str(_entity.label) in ["2","3","4"]: addMoneyByEntity(roleList, packageName_entity, _entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label]) ''' if entity_after.entity_type=="person": if entity_after.values[entity_after.label]>=on_value_person: if str(entity_after.label)=="1": for i in range(len(roleList)): if roleList[i].role_name=="tenderee": roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) elif str(entity_after.label)=="2": for i in range(len(roleList)): if roleList[i].role_name=="agency": roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) elif str(entity_after.label)=="3": _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: for i in range(len(roleList)): if roleList[i].entity_text==_entity.entity_text: if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0: break roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) ''' p_entity += 1 p_entity += 1 '''''' #删除一个机构有多个角色的数据 # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。 temp_ent_list = [] # 临时列表,记录0,1角色及3联系人 other_person = [] # 阈值以上的联系人列表 link_person = [] # 有电话没联系上角色的person列表 other_ent = [] link_ent = [] found_person = False ent_list = [] for entity in list_entity: if entity.entity_type in ['org','company','person']: ent_list.append(entity) #for list_index in range(len(ent_list)): #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2on_value_person: if str(entity.label)=="1": for i in range(len(roleList)): if roleList[i].role_name=="tenderee": roleList[i].linklist.append((entity.entity_text,entity.person_phone)) link_person.append(entity.entity_text) link_ent.append(roleList[i].entity_text) elif str(entity.label)=="2": for i in range(len(roleList)): if roleList[i].role_name=="agency": roleList[i].linklist.append((entity.entity_text,entity.person_phone)) link_person.append(entity.entity_text) link_ent.append(roleList[i].entity_text) elif str(entity.label)=="3": #not_link_person.append((entity_after.entity_text,entity_after.person_phone)) other_person.append(entity.entity_text) temp_ent_list.append((entity.entity_text,entity.person_phone)) #if entity.entity_text in roleSet: if entity.entity_text in set([ent.entity_text for ent in roleList]): if entity.label in [0,1]: other_ent.append(entity.entity_text) temp_ent_list.append((entity.entity_text, entity.label)) for behind_index in range(index+1, len(ent_list)): entity_after = ent_list[behind_index] if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人 break if entity_after.values is not None: if entity_after.entity_type=="person": if entity_after.values[entity_after.label]>on_value_person: if str(entity_after.label)=="1": for i in range(len(roleList)): if roleList[i].role_name=="tenderee": roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) link_person.append(entity_after.entity_text) link_ent.append(roleList[i].entity_text) elif str(entity_after.label)=="2": for i in range(len(roleList)): if roleList[i].role_name=="agency": roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) link_person.append(entity_after.entity_text) link_ent.append(roleList[i].entity_text) elif str(entity_after.label)=="3": for i in range(len(roleList)): if roleList[i].entity_text==entity.entity_text: #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0: #break roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) link_person.append(entity_after.entity_text) not_link_person = [person for person in other_person if person not in link_person] not_link_ent = [ent for ent in other_ent if ent not in link_ent] if len(not_link_person) > 0 and len(not_link_ent) > 0 : item = temp_ent_list for i in range(len(item)): if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item): if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person: item[i+1], item[i+2] = item[i+2], item[i+1] for i in range(len(item)-1, -1, -1): if item[i][0] in not_link_ent: for role in roleList: if role.entity_text == item[i][0] and len(role.linklist) < 1: for j in range(i+1, len(item)): if item[j][0] in not_link_person: role.linklist.append(item[j]) break else: break for i in range(len(roleList)): if roleList[i].role_name=="win_tenderer": set_tenderer_role.add(roleList[i]) if len(set_tenderer_money)==1 and len(set_tenderer_role)==1: list(set_tenderer_role)[0].money = list(set_tenderer_money)[0] #删除一个机构有多个角色的数据 #删除重复人、概率不回传 final_roleList = [] for i in range(len(roleList)): item = roleList[i].getString(roleList) if item: final_roleList.append(item) return final_roleList def getPackageRoleMoney(list_sentence,list_entity): ''' @param: list_sentence:文章的句子list list_entity:文章的实体list @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话 ''' theRole = getRoleList(list_sentence,list_entity) if not theRole: return [] RoleList,RoleSet,PackageList,PackageSet = theRole RoleList = findAttributeAfterEntity(RoleList, RoleSet, PackageList, PackageSet, list_entity) return RoleList def getPREMs(list_sentences,list_entitys,list_articles): ''' @param: list_sentence:所有文章的句子list list_entity:所有文章的实体list @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话 ''' result = [] for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles): RoleList = getPackageRoleMoney(list_sentence,list_entity) result.append([list_article.id,{"prem":RoleList}]) return result if __name__=="__main__": ''' conn = getConnection() cursor = conn.cursor() #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200" sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id " result = [] cursor.execute(sql) rows = cursor.fetchall() count = 0 for row in rows: count += 1 print(count) doc_id = row[0] roleList = getPackageRoleMoney(doc_id) result.append([doc_id,str(roleList),row[1]]) '''''' with codecs.open("getAttribute.html","w",encoding="utf8") as f: f.write('\ \ \ \ \ \ \ \ ') for item in result: f.write(""+""+""+""+"") f.write("
doc_id角色
"+item[0]+""+item[1]+""+item[2]+"
") '''