from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat from BiddingKG.dl.interface.Entitys import PREM,Role,Entity import re import copy import math import pandas as pd import os def getTheRole(entity,role_list): ''' @summary:根据实体名称拿到index @param: entity:实体名称 role_list:角色list @return:该实体所在下标 ''' for role_index in range(len(role_list)): if entity in role_list[role_index]: return role_index return None dict_role_id = {"0":"tenderee", "1":"agency", "2":"win_tenderer", "3":"second_tenderer", "4":"third_tenderer"} def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None): ''' @param: packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合 sentence_index:实体所在的句子 begin_index:实体所在句子的起始位置 @return:公司实体所属的包 @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None ''' ''' if len(packageList)==0: return None before_index = None after_index = None equal_index = None equal_count = 0 for pack_index in range(len(packageList)): if packageList[pack_index][1]>sentence_index and after_index is None: after_index = pack_index if packageList[pack_index][1]int(begin_index): if packageList[i-1][4]: return packageList[i-1][0] else: if packageList[i][4]: return packageList[i-1][0] else: return packageList[i][0] return packageList[end_index-1][0] ''' if len(packageList)==0: return None,False list_legalPack = [] for pack_index in range(len(packageList)): if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)): continue if DIRECT=="R" and (packageList[pack_index]["sentence_index"]sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)): if MAX_DIS is not None: if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS: list_legalPack.append(pack_index) else: list_legalPack.append(pack_index) _flag = True for _index in list_legalPack: if roleid in packageList[_index]["hit"]: continue else: _flag = False packageList[_index]["hit"].add(roleid) return packageList[_index]["pointer"],_flag if len(list_legalPack)>0: return packageList[0]["pointer"],_flag return None,False #生成合法的组合 def get_legal_comba(list_entity,dict_role_combination): #拿到一个包中所有合法的组合 def circle_package(_dict_legal_combination): list_dict_role_first = [] for _role in _dict_legal_combination: if len(list_dict_role_first)==0: for _entity in _dict_legal_combination[_role]: if _entity !="": list_dict_role_first.append({_role:_entity}) else: list_dict_role_after = [] _find_count = 0 for _entity in _dict_legal_combination[_role]: if _entity !="": for _dict in list_dict_role_first: _flag = True for _key1 in _dict: if _entity==_dict[_key1]: #修改为招标人和代理人可以为同一个 if str(_key1) in ["0","1"] and str(_role) in ["0","1"]: _flag = True else: _flag = False if _flag: _find_count += 1 _new_dict = copy.copy(_dict) _new_dict[_role] = _entity if len(list_dict_role_after)>100000: break list_dict_role_after.append(_new_dict) if len(list_dict_role_after)==0: pass else: list_dict_role_first.extend(list_dict_role_after) return list_dict_role_first def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution): last_layer = False #若是空组合则放回空 if len(_dict_legal_combination.keys())==0: return [] #递归到最后一层则修改状态 if len(_dict_legal_combination.keys())==1: last_layer = True #取一个角色开始进行遍历 _key_role = list(_dict_legal_combination.keys())[0] for item in _dict_legal_combination[_key_role]: copy_dict_one_selution = copy.copy(dict_one_selution) copy_dict_legal_combination = {} copy_set_legal_entity = copy.copy(set_legal_entity) #复制余下的所有角色,进行下一轮递归 for _key in _dict_legal_combination.keys(): if _key!=_key_role: copy_dict_legal_combination[_key] = _dict_legal_combination[_key] #修改为招标人和代理人可以为同一个 if item !="": _flag = True if str(_key_role) in ["0","1"]: for _key_flag in copy_dict_one_selution: if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item: _flag = False else: for _key_flag in copy_dict_one_selution: if copy_dict_one_selution[_key_flag]==item: _flag = False if _flag: copy_dict_one_selution[_key_role] = item ''' if item not in copy_set_legal_entity: if item !="": copy_dict_one_selution[_key_role] = item ''' copy_set_legal_entity.add(item) if last_layer: list_all_selution.append(copy_dict_one_selution) else: recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution) #递归匹配各个包的结果 def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution): last_layer = False if len(_dict_legal_combination.keys())==0: return [] if len(_dict_legal_combination.keys())==1: last_layer = True _key_pack = list(_dict_legal_combination.keys())[0] for item in _dict_legal_combination[_key_pack]: copy_dict_one_selution = copy.copy(dict_one_selution) copy_dict_legal_combination = {} for _key in _dict_legal_combination.keys(): if _key!=_key_pack: copy_dict_legal_combination[_key] = _dict_legal_combination[_key] for _key_role in item.keys(): copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role] if last_layer: list_all_selution.append(copy_dict_one_selution) else: recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution) return list_all_selution #循环获取所有包组合 def circle_pageages(_dict_legal_combination): list_all_selution = [] for _key_pack in _dict_legal_combination.keys(): list_key_selution = [] for item in _dict_legal_combination[_key_pack]: _dict = dict() for _key_role in item.keys(): _dict[_key_pack+"$$"+_key_role] = item[_key_role] list_key_selution.append(_dict) if len(list_all_selution)==0: list_all_selution = list_key_selution else: _list_all_selution = [] for item_1 in list_all_selution: for item_2 in list_key_selution: _list_all_selution.append(dict(item_1,**item_2)) list_all_selution = _list_all_selution return list_all_selution #拿到各个包解析之后的结果 _dict_legal_combination = {} for packageName in dict_role_combination.keys(): _list_all_selution = [] # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution) _list_all_selution = circle_package(dict_role_combination[packageName]) ''' print("===1") print(packageName) for item in _list_all_selution: print(item) print("===2") ''' #去除包含子集 list_all_selution_simple = [] _list_set_all_selution = [] for item_selution in _list_all_selution: item_set_selution = set() for _key in item_selution.keys(): item_set_selution.add((_key,item_selution[_key])) _list_set_all_selution.append(item_set_selution) if len(_list_set_all_selution)>1000: _dict_legal_combination[packageName] = _list_all_selution continue for i in range(len(_list_set_all_selution)): be_included = False for j in range(len(_list_set_all_selution)): if i!=j: if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]): be_included = True if not be_included: list_all_selution_simple.append(_list_all_selution[i]) _dict_legal_combination[packageName] = list_all_selution_simple _list_final_comba = [] #对各个包的结果进行排列组合 _comba_count = 1 for _key in _dict_legal_combination.keys(): _comba_count *= len(_dict_legal_combination[_key]) #如果过大,则每个包只取概率最大的那个 dict_pack_entity_prob = get_dict_entity_prob(list_entity) if _comba_count>250: new_dict_legal_combination = dict() for _key_pack in _dict_legal_combination.keys(): MAX_PROB = -1000 _MAX_PROB_COMBA = None for item in _dict_legal_combination[_key_pack]: # print(_key_pack,item) _dict = dict() for _key in item.keys(): _dict[str(_key_pack)+"$$"+str(_key)] = item[_key] _prob = getSumExpectation(dict_pack_entity_prob, _dict) if _prob>MAX_PROB: MAX_PROB = _prob _MAX_PROB_COMBA = [item] new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA _dict_legal_combination = new_dict_legal_combination #recursive_packages(_dict_legal_combination, {}, _list_final_comba) _list_final_comba = circle_pageages(_dict_legal_combination) #除了Project包(招标人和代理人),其他包是不会有冲突的 #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪 _list_real_comba = [] for dict_item in _list_final_comba: set_project = set() set_other = set() for _key in list(dict_item.keys()): if _key.split("$$")[0]=="Project": set_project.add(dict_item[_key]) else: set_other.add(dict_item[_key]) set_common = set_project&set_other if len(set_common)>0: dict_project = {} dict_not_project = {} for _key in list(dict_item.keys()): if dict_item[_key] in set_common: if str(_key.split("$$")[0])=="Project": dict_project[_key] = dict_item[_key] else: dict_not_project[_key] = dict_item[_key] else: dict_project[_key] = dict_item[_key] dict_not_project[_key] = dict_item[_key] _list_real_comba.append(dict_project) _list_real_comba.append(dict_not_project) else: _list_real_comba.append(dict_item) return _list_real_comba def get_dict_entity_prob(list_entity,on_value=0.5): dict_pack_entity_prob = {} for entity in list_entity: if entity.entity_type in ['org','company']: values = entity.values role_prob = float(values[int(entity.label)]) _key = entity.packageName+"$$"+str(entity.label) if role_prob>=on_value and str(entity.label)!="5": _key_prob = _key+"$text$"+entity.entity_text if _key_prob in dict_pack_entity_prob: if role_prob>dict_pack_entity_prob[_key_prob][1]: dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob] else: dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob] return dict_pack_entity_prob #计算合计期望 def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5): ''' expect = 0 for entity in list_entity: if entity.entity_type in ['org','company']: values = entity.values role_prob = float(values[int(entity.label)]) _key = entity.packageName+"$$"+str(entity.label) if role_prob>on_value and str(entity.label)!="5": if _key in combination.keys() and combination[_key]==entity.entity_text: expect += math.pow(role_prob,4) else: expect -= math.pow(role_prob,4) ''' #修改为同一个实体只取对应包-角色的最大的概率值 expect = 0 dict_entity_prob = {} for _key_pack_entity in dict_pack_entity_prob: _key_pack = _key_pack_entity.split("$text$")[0] role_prob = dict_pack_entity_prob[_key_pack_entity][1] if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]: if _key_pack_entity in dict_entity_prob.keys(): if dict_entity_prob[_key_pack_entity]-role_prob: dict_entity_prob[_key_pack_entity] = -role_prob else: dict_entity_prob[_key_pack_entity] = -role_prob # for entity in list_entity: # if entity.entity_type in ['org','company']: # values = entity.values # role_prob = float(values[int(entity.label)]) # _key = entity.packageName+"$$"+str(entity.label) # if role_prob>=on_value and str(entity.label)!="5": # if _key in combination.keys() and combination[_key]==entity.entity_text: # _key_prob = _key+entity.entity_text # if _key_prob in dict_entity_prob.keys(): # if dict_entity_prob[_key_prob]-role_prob: # dict_entity_prob[_key_prob] = -role_prob # else: # dict_entity_prob[_key_prob] = -role_prob for _key in dict_entity_prob.keys(): symbol = 1 if dict_entity_prob[_key]>0 else -1 expect += symbol*math.pow(dict_entity_prob[_key],2) return expect def getRoleList(list_sentence,list_entity,on_value = 0.5): ''' @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回 @param: list_sentence:文章所有的sentence list_entity:文章所有的实体 on_value:概率阈值 @return:文章的角色list ''' pack = getPackagesFromArticle(list_sentence,list_entity) if pack is None: return None PackageList,PackageSet,dict_PackageCode = pack #拿到所有可能的情况 dict_role_combination = {} #拿到各个实体的packageName,packageCode for entity in list_entity: if entity.entity_type in ['org','company']: values = entity.values role_prob = float(values[int(entity.label)]) if role_prob>=on_value and str(entity.label)!="5": if str(entity.label) in ["0","1"]: packageName = "Project" else: if len(PackageSet)>0: packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.end_index,"role-"+str(entity.label)) if packagePointer is None: #continue packageName = "Project" else: #add pointer_pack entity.pointer_pack = packagePointer packageName = packagePointer.entity_text else: packageName = "Project" find_flag = False if packageName in dict_PackageCode.keys(): packageCode = dict_PackageCode[packageName] else: packageCode = "" entity.packageCode = packageCode role_name = dict_role_id.get(str(entity.label)) entity.roleName = role_name entity.packageName = packageName if entity.packageName in dict_role_combination.keys(): if str(entity.label) in dict_role_combination[entity.packageName].keys(): dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text) else: dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text]) else: dict_role_combination[entity.packageName] = {} #初始化空值 roleIds = [0,1,2,3,4] for _roleId in roleIds: dict_role_combination[entity.packageName][str(_roleId)] = set([""]) dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text) list_real_comba = get_legal_comba(list_entity,dict_role_combination) #拿到最大期望值的组合 max_index = 0 max_expect = -100 _index = 0 dict_pack_entity_prob = get_dict_entity_prob(list_entity) for item_combination in list_real_comba: expect = getSumExpectation(dict_pack_entity_prob, item_combination) if expect>max_expect: max_index = _index max_expect = expect _index += 1 RoleList = [] RoleSet = set() if len(list_real_comba)>0: for _key in list_real_comba[max_index].keys(): packageName = _key.split("$$")[0] label = _key.split("$$")[1] role_name = dict_role_id.get(str(label)) entity_text = list_real_comba[max_index][_key] if packageName in dict_PackageCode.keys(): packagecode = dict_PackageCode.get(packageName) else: packagecode = "" RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[])) RoleSet.add(entity_text) #根据最优树来修正list_entity中角色对包的连接 for _entity in list_entity: if _entity.pointer_pack is not None: _pack_name = _entity.pointer_pack.entity_text _find_flag = False for _prem in RoleList: if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text: _find_flag = True if not _find_flag: _entity.pointer_pack = None return RoleList,RoleSet,PackageList,PackageSet def getPackageScopePattern(): ''' @summary: 获取包的作用域关键词 ''' df = pd.read_excel(os.path.dirname(__file__)+"/end.xls") pattern = "(" for item in df["list_word"]: item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-") pattern += item+"|" pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}" return pattern pattern_packageScope = getPackageScopePattern() def getPackagesFromArticle(list_sentence,list_entity): ''' @param: list_sentence:文章的句子list @summary: 将包的信息插入list_entity中 @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息 ''' if len(list_sentence)==0: return None PackageList = [] PackageList_scope = [] PackageSet = set() dict_packageCode = dict() package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}") package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}") package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))") # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段 other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{,50}?)(,|。)') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目 win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{,25})(,|。)') # 2020/11/23 大网站规则 调整 model_pattern = re.compile('(型号|序号)[::]([^,。]{,20})(,|。)') # 2020/11/23 大网站规则 调整 number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}") package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)") def changeIndexFromWordToWords(tokens,word_index): ''' @summary:转换某个字的字偏移为词偏移 ''' before_index = 0 after_index = 0 for i in range(len(tokens)): after_index = after_index+len(tokens[i]) if before_index<=word_index and after_index>=word_index: return i before_index = after_index package_names = [] def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern): ''' @summary:抽取包附近的标段号 @param: tokens:包所在句子的分词 word_index:包所在字偏移 size:左右各取多少个词 pattern:提取标段号的正则 @return: type:string,meaning:标段号 ''' index = changeIndexFromWordToWords(tokens,word_index) if indexlen(tokens): end = len(tokens) else: end = index+size #拿到左右两边的词语组成短语 text = "".join(tokens[begin:end]) #在短语中的字偏移 new_word_index = word_index-len("".join(tokens[:begin])) min_distance = len(text) packageCode = None for the_iter in re.finditer(pattern,text): #算出最小距离 distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])]) if distance1: for i in range(len(list_sentence)): PackageList_item = [] PackageList_item_scope = [] content = list_sentence[i].sentence_text tokens = list_sentence[i].tokens names = re.findall(other_package_pattern, content) N_names = re.findall(win_tenderer_pattern, content) if len(names) != 1 or len(N_names) != 1: continue for iter in re.finditer(other_package_pattern,content): temp_package_number = iter.group(4) xinghao = re.search(model_pattern, content) if xinghao: temp_package_number = temp_package_number + '+' + xinghao.group(2) # print('新正则采购包名补充',temp_package_number) PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]}) # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) code = extractPackageCode(tokens, iter.span()[0]) if code is not None: dict_packageCode[temp_package_number] = code PackageSet.add(temp_package_number) #识别packageScope for iter in re.finditer(pattern_packageScope,content): PackageList_item_scope.append({"name":"","sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]}) # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) PackageList_item_scope = PackageList_item +PackageList_item_scope PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"]) PackageList_scope = PackageList_scope+PackageList_item_scope PackageList_item.sort(key=lambda x:x["sentence_index"]) pattern_punctuation = "[::()\(\),,。;;]" for i in range(len(list_sentence)): for j in range(len(PackageList_scope)): if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="": _flag = False left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1] right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30] _left_find = re.findall(pattern_punctuation,left_str) _right_find = re.findall(pattern_punctuation,right_str) #print(left_str) if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一": continue if re.search("划分",right_str[:10]) is not None: continue if len(_left_find)>0 and _left_find[-1] in [":",":"]: _flag = True if len(_right_find)>0 and _right_find[0] in [":",":"]: _flag = True if _flag: scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]] else: if j==0: scope_begin = [0,0] else: scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]] if j==len(PackageList_scope)-1: scope_end = [PackageList_scope[j]["offsetWords_begin"],changeIndexFromWordToWords(list_sentence[i].tokens, len(list_sentence[i].sentence_text))] else: scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]] if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]: continue #add package to entity _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"]) list_entity.append(_pack_entity) copy_pack = copy.copy(PackageList_scope[j]) copy_pack["scope"] = [scope_begin,scope_end] copy_pack["hit"] = set() copy_pack["pointer"] = _pack_entity PackageList.append(copy_pack) return PackageList,PackageSet,dict_packageCode def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4): ''' @param: PackDict:文章包dict roleSet:文章所有角色的公司名称 PackageList:文章的包信息 PackageSet:文章所有包的名称 list_entity:文章所有经过模型处理的实体 on_value:金额模型的阈值 on_value_person:联系人模型的阈值 sentence_len:公司和属性间隔句子的最大长度 @return:添加了属性信息的角色list ''' #根据roleid添加金额到rolelist中 def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)): if money_prob>packDict[packageName]["roleList"][i].money_prob: packDict[packageName]["roleList"][i].money = money packDict[packageName]["roleList"][i].money_prob = money_prob return packDict #根据实体名称添加金额到rolelist中 def addMoneyByEntity(packDict,packageName,entity,money,money_prob): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].entity_text==entity: if money_prob>packDict[packageName]["roleList"][i].money_prob: packDict[packageName]["roleList"][i].money = money packDict[packageName]["roleList"][i].money_prob = money_prob return packDict #根据实体名称得到角色 def getRoleWithText(packDict,entity_text): for pack in packDict.keys(): for i in range(len(packDict[pack]["roleList"])): if packDict[pack]["roleList"][i].entity_text==entity_text: return packDict[pack]["roleList"][i].role_name def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet): _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: if _entity.entity_text in RoleSet: return True p_entity = 0 #遍历所有实体 while(p_entity=on_value: if str(entity.label)=="0": packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label)) if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label]) ''' ''' # 2020/11/25 与下面的联系人连接步骤重复,取消 if entity.entity_type=="person": if entity.values[entity.label]>=on_value_person: if str(entity.label)=="1": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="tenderee": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="tenderee": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee": _entity.pointer_person = entity elif str(entity.label)=="2": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="agency": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="agency": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency": _entity.pointer_person = entity ''' #金额往前找实体 if entity.entity_type=="money": if entity.values[entity.label]>=on_value: p_entity_money= p_entity entity_money = list_entity[p_entity_money] if len(PackageSet)>0: packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label)) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text else: packageName_entity = "Project" while(p_entity_money>0): entity_before = list_entity[p_entity_money] if entity_before.entity_type in ['org','company']: if str(entity_before.label)=="1": addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label]) #add pointer_money entity_before.pointer_money = entity_money break p_entity_money -= 1 #如果实体属于角色集合,则往后找属性 if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet): p_entity += 1 #循环查找符合的属性 while(p_entity=sentence_len: p_entity -= 1 break #若是遇到公司实体,则跳出循环 if entity_after.entity_type in ['org','company']: p_entity -= 1 break if entity_after.values is not None: if entity_after.entity_type=="money": if entity_after.values[entity_after.label]>=on_value: ''' #招标金额从后往前找 if str(entity_after.label)=="0": packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label)) if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label]) ''' if str(entity_after.label)=="1": #print(entity_after.entity_text,entity.entity_text) _list_entitys = [entity]+entity.linked_entitys if len(PackageSet)>0: packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label)) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text else: packageName_entity = "Project" if str(entity.label) in ["2","3","4"]: addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label]) #add pointer_money entity.pointer_money = entity_after ''' if entity_after.entity_type=="person": if entity_after.values[entity_after.label]>=on_value_person: if str(entity_after.label)=="1": for i in range(len(roleList)): if roleList[i].role_name=="tenderee": roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) elif str(entity_after.label)=="2": for i in range(len(roleList)): if roleList[i].role_name=="agency": roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) elif str(entity_after.label)=="3": _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: for i in range(len(roleList)): if roleList[i].entity_text==_entity.entity_text: if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0: break roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) ''' p_entity += 1 p_entity += 1 '''''' # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。 temp_ent_list = [] # 临时列表,记录0,1角色及3联系人 other_person = [] # 阈值以上的联系人列表 link_person = [] # 有电话没联系上角色的person列表 other_ent = [] link_ent = [] found_person = False ent_list = [] for entity in list_entity: if entity.entity_type in ['org','company','person']: ent_list.append(entity) #for list_index in range(len(ent_list)): #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2on_value_person: if str(entity.label)=="1": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="tenderee": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) link_person.append(entity.entity_text) link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="tenderee": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee": _entity.pointer_person = entity elif str(entity.label)=="2": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="agency": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) link_person.append(entity.entity_text) link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="agency": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency": _entity.pointer_person = entity elif str(entity.label)=="3": if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人 continue #not_link_person.append((entity_after.entity_text,entity_after.person_phone)) other_person.append(entity.entity_text) temp_ent_list.append((entity.entity_text,entity.person_phone,entity)) #if entity.entity_text in roleSet: if entity.entity_text in roleSet: if entity.label in [0,1]: other_ent.append(entity.entity_text) temp_ent_list.append((entity.entity_text, entity.label,entity)) for behind_index in range(index+1, len(ent_list)): entity_after = ent_list[behind_index] if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人 break if entity_after.values is not None: if entity_after.entity_type=="person": if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找 break if entity_after.values[entity_after.label]>on_value_person: if str(entity_after.label)=="1": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="tenderee": PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) link_person.append(entity_after.entity_text) link_ent.append(PackDict["Project"]["roleList"][i].entity_text) elif str(entity_after.label)=="2": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="agency": PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) link_person.append(entity_after.entity_text) link_ent.append(PackDict["Project"]["roleList"][i].entity_text) elif str(entity_after.label)=="3": if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找 break elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止 break for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): if PackDict[pack]["roleList"][i].entity_text==entity.entity_text: #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0: #break PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) link_person.append(entity_after.entity_text) #add pointer_person entity.pointer_person = entity_after not_link_person = [person for person in other_person if person not in link_person] not_link_ent = [ent for ent in other_ent if ent not in link_ent] if len(not_link_person) > 0 and len(not_link_ent) > 0 : item = temp_ent_list for i in range(len(item)): if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item): if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person: item[i+1], item[i+2] = item[i+2], item[i+1] for i in range(len(item)-1, -1, -1): if item[i][0] in not_link_ent: for pack in PackDict.keys(): for role in PackDict[pack]["roleList"]: if role.entity_text == item[i][0] and len(role.linklist) < 1: for j in range(i+1, len(item)): if item[j][0] in not_link_person: role.linklist.append(item[j][:2]) #add pointer_person item[i][2].pointer_person = item[j][2] break else: break #寻找多标段招标金额 p_entity = len(list_entity)-1 set_tenderer_money = set() #遍历所有实体 while(p_entity>=0): entity = list_entity[p_entity] if entity.entity_type=="money": if entity.values[entity.label]>=on_value: if str(entity.label)=="1": set_tenderer_money.add(float(entity.entity_text)) if str(entity.label)=="0": ''' if p_entity>0: p_before = list_entity[p_entity-1] if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2: p_entity -= 1 continue ''' packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L") if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text if packageName=="Project": if PackDict["Project"]["tendereeMoney"]1 and len(set_tenderer_role)==1: _maxMoney = 0 _sumMoney = 0 for _m in list(set_tenderer_money): _sumMoney += _m if _m>_maxMoney: _maxMoney = _m if _sumMoney/_maxMoney==2: list(set_tenderer_role)[0].money = _maxMoney else: list(set_tenderer_role)[0].money = _maxMoney #每个包都只找到一个金额 _flag_pack_money = True for k,v in dict_pack_tenderer_money.items(): if len(v[1])!=1: _flag_pack_money = False if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()): for k,v in dict_pack_tenderer_money.items(): v[0].money = list(v[1])[0] for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString() for item in list_pop: PackDict.pop(item) return PackDict def initPackageAttr(RoleList,PackageSet): ''' @summary: 根据拿到的roleList和packageSet初始化接口返回的数据 ''' packDict = dict() packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]} for item in list(PackageSet): packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]} for item in RoleList: if packDict[item.packageName]["code"] =="": packDict[item.packageName]["code"] = item.packageCode packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) return packDict def getPackageRoleMoney(list_sentence,list_entity): ''' @param: list_sentence:文章的句子list list_entity:文章的实体list @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话 ''' # print("=1") theRole = getRoleList(list_sentence,list_entity) if not theRole: return [] RoleList,RoleSet,PackageList,PackageSet = theRole ''' for item in PackageList: print(item) ''' # print("=2") PackDict = initPackageAttr(RoleList, PackageSet) # print("=3") PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity) # print("=4") return PackDict def getOtherAttributes(list_entity): dict_other = {"bidway":"", "moneysource":"", "person_review":[], "time_release":"", "time_bidopen":"", "time_bidclose":"", "serviceTime":"", "product":[]} for entity in list_entity: if entity.entity_type == 'bidway': dict_other["bidway"] = entity.entity_text elif entity.entity_type=='moneysource': dict_other["moneysource"] = entity.entity_text elif entity.entity_type=='serviceTime': dict_other["serviceTime"] = entity.entity_text elif entity.entity_type == 'time' and entity.label==1: dict_other["time_release"] = timeFormat(entity.entity_text) elif entity.entity_type == 'time' and entity.label==2: dict_other["time_bidopen"] = timeFormat(entity.entity_text) elif entity.entity_type == 'time' and entity.label == 3: dict_other["time_bidclose"] = timeFormat(entity.entity_text) elif entity.entity_type=="person" and entity.label ==4: dict_other["person_review"].append(entity.entity_text) elif entity.entity_type=='product': dict_other["product"].append(entity.entity_text) dict_other["product"] = list(set(dict_other["product"])) return dict_other def getPREMs(list_sentences,list_entitys,list_articles): ''' @param: list_sentence:所有文章的句子list list_entity:所有文章的实体list @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话 ''' result = [] for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles): RoleList = getPackageRoleMoney(list_sentence,list_entity) result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity))) return result if __name__=="__main__": ''' conn = getConnection() cursor = conn.cursor() #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200" sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id " result = [] cursor.execute(sql) rows = cursor.fetchall() count = 0 for row in rows: count += 1 print(count) doc_id = row[0] roleList = getPackageRoleMoney(doc_id) result.append([doc_id,str(roleList),row[1]]) '''''' with codecs.open("getAttribute.html","w",encoding="utf8") as f: f.write('\ \ \ \ \ \ \ \ ') for item in result: f.write(""+""+""+""+"") f.write("
doc_id角色
"+item[0]+""+item[1]+""+item[2]+"
") '''