from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
from decimal import Decimal
import re
import copy
import math
import pandas as pd
import os
from scipy.optimize import linear_sum_assignment
from BiddingKG.dl.interface.Entitys import Match
import numpy as np


def getTheRole(entity,role_list):
    '''
    @summary: find which element of role_list contains the entity
    @param:
        entity: entity name
        role_list: list of role containers, each tested with ``in``
    @return: index of the first container holding the entity, None if there is none
    '''
    for role_index, role in enumerate(role_list):
        if entity in role:
            return role_index
    return None


# Role label (as a string) -> role name.
dict_role_id = {"0":"tenderee",
                "1":"agency",
                "2":"win_tenderer",
                "3":"second_tenderer",
                "4":"third_tenderer"}


def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
    '''
    @summary: among the packages whose scope contains the given position, return
        the first one that has not been handed out for roleid yet; when all of
        them already were, return the first in-scope package again
    @param:
        packageList: list of package dicts; the keys read here are
            "sentence_index", "offsetWords_begin",
            "scope" ([[begin sentence, begin offset],[end sentence, end offset]]),
            "hit" (set of roleids already served) and "pointer"
        sentence_index: sentence the entity is in
        begin_index: start offset of the entity inside that sentence
        roleid: marker added to the chosen package's "hit" set
        MAX_DIS: optional maximum sentence distance between the entity and the package mention
        DIRECT: "L" ignores packages mentioned after the position, "R" ignores
            packages mentioned before it, any other value keeps both sides
    @return: (pointer, flag); pointer is the chosen package's "pointer" or None
        when no package is in scope, flag is True only when every in-scope
        package had already been hit by roleid
    '''
    if len(packageList)==0:
        return None,False

    position = (sentence_index,begin_index)
    list_legalPack = []
    for pack_index, pack in enumerate(packageList):
        if DIRECT in ("L","R"):
            # Tuples compare sentence first, then offset.
            mention = (pack["sentence_index"],pack["offsetWords_begin"])
            if DIRECT=="L" and mention>position:
                continue
            if DIRECT=="R" and mention<position:
                continue
        scope_begin = (pack["scope"][0][0],pack["scope"][0][1])
        scope_end = (pack["scope"][1][0],pack["scope"][1][1])
        if not (scope_begin<=position and scope_end>=position):
            continue
        if MAX_DIS is not None and abs(sentence_index-pack["sentence_index"])>MAX_DIS:
            continue
        list_legalPack.append(pack_index)

    for _index in list_legalPack:
        if roleid not in packageList[_index]["hit"]:
            packageList[_index]["hit"].add(roleid)
            return packageList[_index]["pointer"],False
    if len(list_legalPack)>0:
        # Every in-scope package was used already: fall back to the first
        # in-scope one (not to packageList[0], which may be out of scope).
        return packageList[list_legalPack[0]]["pointer"],True
    return None,False


def get_legal_comba(list_entity,dict_role_combination):
    '''
    @summary: build the candidate role assignments for the whole article
    @param:
        list_entity: all entities of the article, used to score combinations
            when there are too many of them
        dict_role_combination: {packageName: {role label: set of entity texts}},
            where "" stands for "role left empty"
    @return: list of dicts keyed "<packageName>$$<role label>" -> entity text
    '''

    def circle_package(_dict_legal_combination):
        # Enumerate the {role: entity} combinations of one package, role by role.
        list_dict_role_first = []
        for _role in _dict_legal_combination:
            candidates = _dict_legal_combination[_role]
            if len(list_dict_role_first)==0:
                # Nothing collected so far: seed with the non-empty candidates.
                for _entity in candidates:
                    if _entity!="":
                        list_dict_role_first.append({_role:_entity})
                continue

            list_dict_role_after = []
            for _entity in candidates:
                if _entity=="":
                    continue
                for _dict in list_dict_role_first:
                    _flag = True
                    for _key1 in _dict:
                        if _entity==_dict[_key1]:
                            # An entity may be tenderee and agency at once,
                            # any other double role is a conflict.
                            _flag = str(_key1) in ["0","1"] and str(_role) in ["0","1"]
                    # Hard cap on the number of combinations.
                    if len(list_dict_role_after)>100000:
                        break
                    if _flag:
                        _new_dict = copy.copy(_dict)
                        _new_dict[_role] = _entity
                        list_dict_role_after.append(_new_dict)
                    else:
                        # Same entity text under a different role: offer it on
                        # its own, once per occurrence in the combinations so far.
                        for _other in list_dict_role_first:
                            for _key2 in _other:
                                if _entity==_other[_key2]:
                                    list_dict_role_after.append({_role:_entity})
            if len(list_dict_role_after)>0:
                list_dict_role_first.extend(list_dict_role_after)
        return list_dict_role_first

    def circle_pageages(_dict_legal_combination):
        # Cross product of the per-package combinations, keys prefixed with "<package>$$".
        list_all_selution = []
        for _key_pack in _dict_legal_combination.keys():
            list_key_selution = []
            for item in _dict_legal_combination[_key_pack]:
                _dict = dict()
                for _key_role in item.keys():
                    _dict[_key_pack+"$$"+_key_role] = item[_key_role]
                list_key_selution.append(_dict)
            if len(list_all_selution)==0:
                list_all_selution = list_key_selution
            else:
                list_all_selution = [dict(item_1,**item_2)
                                     for item_1 in list_all_selution
                                     for item_2 in list_key_selution]
        return list_all_selution

    # Combinations of every package, minus those contained in a larger one.
    _dict_legal_combination = {}
    for packageName in dict_role_combination.keys():
        _list_all_selution = circle_package(dict_role_combination[packageName])
        _list_set_all_selution = [set(item_selution.items())
                                  for item_selution in _list_all_selution]
        if len(_list_set_all_selution)>1000:
            # Too many candidates for the quadratic subset filter: keep them all.
            _dict_legal_combination[packageName] = _list_all_selution
            continue
        list_all_selution_simple = []
        for i, _set_i in enumerate(_list_set_all_selution):
            # "<" on sets is the proper-subset test.
            be_included = any(_set_i<_set_j for _set_j in _list_set_all_selution)
            if not be_included:
                list_all_selution_simple.append(_list_all_selution[i])
        _dict_legal_combination[packageName] = list_all_selution_simple

    _comba_count = 1
    for _key in _dict_legal_combination.keys():
        _comba_count *= len(_dict_legal_combination[_key])

    # When the cross product gets too large, keep only the best-scoring
    # combination of every package.
    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    if _comba_count>250:
        new_dict_legal_combination = dict()
        for _key_pack in _dict_legal_combination.keys():
            MAX_PROB = -1000
            _MAX_PROB_COMBA = None
            for item in _dict_legal_combination[_key_pack]:
                _dict = dict()
                for _key in item.keys():
                    _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
                if _prob>MAX_PROB:
                    MAX_PROB = _prob
                    _MAX_PROB_COMBA = [item]
            if _MAX_PROB_COMBA is not None:
                new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
        _dict_legal_combination = new_dict_legal_combination

    _list_final_comba = circle_pageages(_dict_legal_combination)

    # Only the "Project" package (tenderee/agency) can clash with the others.
    # When an entity shows up both in "Project" and in another package, split
    # the combination in two: one keeps it in "Project", one keeps it elsewhere.
    _list_real_comba = []
    for dict_item in _list_final_comba:
        set_project = set()
        set_other = set()
        for _key in dict_item.keys():
            if _key.split("$$")[0]=="Project":
                set_project.add(dict_item[_key])
            else:
                set_other.add(dict_item[_key])
        set_common = set_project&set_other
        if len(set_common)==0:
            _list_real_comba.append(dict_item)
            continue
        dict_project = {}
        dict_not_project = {}
        for _key in dict_item.keys():
            if dict_item[_key] in set_common:
                if str(_key.split("$$")[0])=="Project":
                    dict_project[_key] = dict_item[_key]
                else:
                    dict_not_project[_key] = dict_item[_key]
            else:
                dict_project[_key] = dict_item[_key]
                dict_not_project[_key] = dict_item[_key]
        _list_real_comba.append(dict_project)
        _list_real_comba.append(dict_not_project)
    return _list_real_comba


def get_dict_entity_prob(list_entity,on_value=0.5):
    '''
    @summary: collect, per package/role/entity text, the highest role probability
    @param:
        list_entity: entities; only entity_type 'org'/'company' are read, through
            their values, label, packageName and entity_text attributes
        on_value: minimum probability for an entity to be kept
    @return: {"<packageName>$$<label>$text$<entity_text>": [entity_text, probability]}
    '''
    dict_pack_entity_prob = {}
    for entity in list_entity:
        if entity.entity_type not in ['org','company']:
            continue
        role_prob = float(entity.values[int(entity.label)])
        _key = entity.packageName+"$$"+str(entity.label)
        if role_prob<on_value or str(entity.label)=="5":
            continue
        _key_prob = _key+"$text$"+entity.entity_text
        if _key_prob not in dict_pack_entity_prob or role_prob>dict_pack_entity_prob[_key_prob][1]:
            dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
    return dict_pack_entity_prob


def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
    '''
    @summary: score a combination; every entry of dict_pack_entity_prob adds its
        squared probability when the combination assigns that entity to that
        package/role, and subtracts it otherwise
    @param:
        dict_pack_entity_prob: result of get_dict_entity_prob
        combination: {"<packageName>$$<label>": entity text}
        on_value: not used, kept for interface compatibility
    @return: the summed score
    '''
    expect = 0
    for _key_pack_entity in dict_pack_entity_prob:
        _key_pack = _key_pack_entity.split("$text$")[0]
        entity_text, role_prob = dict_pack_entity_prob[_key_pack_entity][0], dict_pack_entity_prob[_key_pack_entity][1]
        matched = _key_pack in combination and combination[_key_pack]==entity_text
        signed_prob = role_prob if matched else -role_prob
        symbol = 1 if signed_prob>0 else -1
        expect += symbol*math.pow(signed_prob,2)
    return expect


def getRoleList(list_sentence,list_entity,on_value = 0.5):
    '''
    @summary: enumerate the non-contradicting role combinations and return the
        one with the highest summed expectation
    @param:
        list_sentence: all sentences of the article
        list_entity: all entities of the article; qualifying org/company entities
            get packageCode, roleName, packageName (and possibly pointer_pack) set
        on_value: probability threshold
    @return: None when getPackagesFromArticle returns None, otherwise
        (RoleList, RoleSet, PackageList, PackageSet)
    '''
    pack = getPackagesFromArticle(list_sentence,list_entity)
    if pack is None:
        return None
    PackageList,PackageSet,dict_PackageCode = pack

    # {packageName: {role label: set of candidate entity texts}}
    dict_role_combination = {}
    for entity in list_entity:
        if entity.entity_type not in ['org','company']:
            continue
        # Skip names of three characters or fewer.
        if len(entity.entity_text)<=3:
            continue
        role_prob = float(entity.values[int(entity.label)])
        if role_prob<on_value or str(entity.label)=="5":
            continue

        # Tenderee and agency always belong to the article-level "Project" package.
        packageName = "Project"
        if str(entity.label) not in ["0","1"] and len(PackageSet)>0:
            packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"role-"+str(entity.label))
            if packagePointer is not None:
                entity.pointer_pack = packagePointer
                packageName = packagePointer.entity_text

        entity.packageCode = dict_PackageCode.get(packageName,"")
        entity.roleName = dict_role_id.get(str(entity.label))
        entity.packageName = packageName

        dict_roles = dict_role_combination.get(packageName)
        if dict_roles is None:
            # Every role of a new package starts with the "left empty" option.
            dict_roles = {}
            for _roleId in [0,1,2,3,4]:
                dict_roles[str(_roleId)] = set([""])
            dict_role_combination[packageName] = dict_roles
        dict_roles.setdefault(str(entity.label),set()).add(entity.entity_text)

    list_real_comba = get_legal_comba(list_entity,dict_role_combination)

    # Keep the first combination with the highest expectation.
    max_index = 0
    max_expect = -100
    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    for _index, item_combination in enumerate(list_real_comba):
        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
        if expect>max_expect:
            max_index = _index
            max_expect = expect

    RoleList = []
    RoleSet = set()
    if len(list_real_comba)>0:
        best_comba = list_real_comba[max_index]
        for _key in best_comba.keys():
            packageName = _key.split("$$")[0]
            label = _key.split("$$")[1]
            role_name = dict_role_id.get(str(label))
            entity_text = best_comba[_key]
            packagecode = dict_PackageCode.get(packageName,"")
            RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
            RoleSet.add(entity_text)

    # Drop package links of entities that did not make it into the best combination.
    for _entity in list_entity:
        if _entity.pointer_pack is None:
            continue
        _pack_name = _entity.pointer_pack.entity_text
        _find_flag = any(_prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text
                         for _prem in RoleList)
        if not _find_flag:
            _entity.pointer_pack = None
    return RoleList,RoleSet,PackageList,PackageSet


def getPackageScopePattern():
    '''
    @summary: build the regex source of the keywords that delimit a package scope
        from the "list_word" column of end.xls, located next to this file
    @return: the pattern as a string; when the column is empty only the fixed
        trailing alternative is returned
    '''
    df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
    # Only these characters are escaped; anything else in a keyword keeps its regex meaning.
    escapes = {ord("("):r"\(",ord(")"):r"\)",ord("."):r"\.",
               ord("["):r"\[",ord("]"):r"\]",ord("-"):r"\-"}
    list_word = [str(item).translate(escapes) for item in df["list_word"]]
    tail = "业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
    if len(list_word)==0:
        # An empty group would leave an unbalanced ")" in the pattern.
        return tail
    return "("+"|".join(list_word)+")[::是为]|"+tail


pattern_packageScope = getPackageScopePattern()


def getPackagesFromArticle(list_sentence,list_entity):
    '''
    @param:
        list_sentence: list of the article's sentences
    @summary: insert the package information into list_entity
    @return: type: list of [package number, sentence index, word offset, lot code]
             meaning: package/lot information of the article
    '''
    if len(list_sentence)==0:
        return None
    PackageList = []
    PackageList_scope = []
PackageSet = set() dict_packageCode = dict() package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}") package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}") package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))") # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段 other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目 win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整 model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整 number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}") package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)") # 纯数字类型的包号统一,例如:'01','1' re_digital = re.compile("^\d+$") def changeIndexFromWordToWords(tokens,word_index): ''' @summary:转换某个字的字偏移为词偏移 ''' before_index = 0 after_index = 0 for i in range(len(tokens)): after_index = after_index+len(tokens[i]) if before_index<=word_index and after_index>=word_index: return i before_index = after_index package_names = [] def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern): ''' @summary:抽取包附近的标段号 @param: tokens:包所在句子的分词 word_index:包所在字偏移 size:左右各取多少个词 pattern:提取标段号的正则 @return: type:string,meaning:标段号 ''' index = changeIndexFromWordToWords(tokens,word_index) if indexlen(tokens): end = len(tokens) else: end = index+size #拿到左右两边的词语组成短语 text = "".join(tokens[begin:end]) #在短语中的字偏移 new_word_index = word_index-len("".join(tokens[:begin])) min_distance = len(text) packageCode = None for the_iter in re.finditer(pattern,text): #算出最小距离 distance = 
min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])]) if distance1: for i in range(len(list_sentence)): PackageList_item = [] PackageList_item_scope = [] content = list_sentence[i].sentence_text tokens = list_sentence[i].tokens names = re.findall(other_package_pattern, content) N_names = re.findall(win_tenderer_pattern, content) if len(names) != 1 or len(N_names) != 1: continue for iter in re.finditer(other_package_pattern,content): temp_package_number = iter.group(4) xinghao = re.search(model_pattern, content) if xinghao: temp_package_number = temp_package_number + '+' + xinghao.group(2) # print('新正则采购包名补充',temp_package_number) if re.search(re_digital,temp_package_number): temp_package_number = str(int(temp_package_number)) PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]}) # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) code = extractPackageCode(tokens, iter.span()[0]) if code is not None: dict_packageCode[temp_package_number] = code PackageSet.add(temp_package_number) #识别packageScope for iter in re.finditer(pattern_packageScope,content): PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]}) # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) PackageList_item_scope = PackageList_item +PackageList_item_scope PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"]) PackageList_scope = PackageList_scope+PackageList_item_scope PackageList_item.sort(key=lambda x:x["sentence_index"]) pattern_punctuation = "[::()\(\),,。;;]" for i 
in range(len(list_sentence)): for j in range(len(PackageList_scope)): if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="": _flag = False left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1] right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30] _left_find = re.findall(pattern_punctuation,left_str) _right_find = re.findall(pattern_punctuation,right_str) #print(left_str) if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一": continue if re.search("划分",right_str[:10]) is not None: continue if len(_left_find)>0 and _left_find[-1] in [":",":"]: _flag = True if len(_right_find)>0 and _right_find[0] in [":",":"]: _flag = True if _flag: scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]] else: if j==0: scope_begin = [0,0] else: scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]] if j==len(PackageList_scope)-1: scope_end = [PackageList_scope[j]["sentence_index"],changeIndexFromWordToWords(list_sentence[i].tokens, len(list_sentence[i].sentence_text))] else: scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]] if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]: continue #add package to entity _pack_entity = 
Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"]) list_entity.append(_pack_entity) copy_pack = copy.copy(PackageList_scope[j]) copy_pack["scope"] = [scope_begin,scope_end] copy_pack["hit"] = set() copy_pack["pointer"] = _pack_entity PackageList.append(copy_pack) return PackageList,PackageSet,dict_packageCode from BiddingKG.dl.relation_extraction.model import Relation_extraction relationExtraction_model = Relation_extraction() def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,list_sentence,on_value = 0.5,on_value_person=0.5,sentence_len=4): ''' @param: PackDict:文章包dict roleSet:文章所有角色的公司名称 PackageList:文章的包信息 PackageSet:文章所有包的名称 list_entity:文章所有经过模型处理的实体 on_value:金额模型的阈值 on_value_person:联系人模型的阈值 sentence_len:公司和属性间隔句子的最大长度 @return:添加了属性信息的角色list ''' #根据roleid添加金额到rolelist中 def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)): if money_prob>packDict[packageName]["roleList"][i].money_prob: packDict[packageName]["roleList"][i].money = money packDict[packageName]["roleList"][i].money_prob = money_prob return packDict #根据实体名称添加金额到rolelist中 def addMoneyByEntity(packDict,packageName,entity,money,money_prob): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].entity_text==entity: # if 
money_prob>packDict[packageName]["roleList"][i].money_prob: # packDict[packageName]["roleList"][i].money = money # packDict[packageName]["roleList"][i].money_prob = money_prob if packDict[packageName]["roleList"][i].money_prob==0 : # 2021/7/20第一次更新金额 packDict[packageName]["roleList"][i].money = money.entity_text packDict[packageName]["roleList"][i].money_prob = money_prob elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20改为优先选择大写金额, # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob) # print('链接金额备注 ',money.notes, money.entity_text, money.values) packDict[packageName]["roleList"][i].money = money.entity_text packDict[packageName]["roleList"][i].money_prob = money_prob return packDict #根据实体名称得到角色 def getRoleWithText(packDict,entity_text): for pack in packDict.keys(): for i in range(len(packDict[pack]["roleList"])): if packDict[pack]["roleList"][i].entity_text==entity_text: return packDict[pack]["roleList"][i].role_name def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet): _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: if _entity.entity_text in RoleSet: return True p_entity = 0 # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000 money_list = [it for it in list_entity if it.entity_type=="money"] for i in range(len(money_list)-1): for j in range(1, len(money_list)): if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \ Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000: money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000) # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000') #遍历所有实体 while(p_entity=on_value: if str(entity.label)=="0": packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label)) if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text addMoneyByRoleid(PackDict, packageName, "0", 
entity.entity_text, entity.values[entity.label]) ''' ''' # 2020/11/25 与下面的联系人连接步骤重复,取消 if entity.entity_type=="person": if entity.values[entity.label]>=on_value_person: if str(entity.label)=="1": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="tenderee": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="tenderee": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee": _entity.pointer_person = entity elif str(entity.label)=="2": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="agency": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="agency": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency": _entity.pointer_person = entity ''' # #金额往前找实体 # if entity.entity_type=="money": # if entity.values[entity.label]>=on_value: # p_entity_money= p_entity # entity_money = list_entity[p_entity_money] # if len(PackageSet)>0: # packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label)) # if packagePointer is None: # packageName_entity = "Project" # else: # packageName_entity = packagePointer.entity_text # else: # packageName_entity = "Project" # while(p_entity_money>0): # entity_before = list_entity[p_entity_money] # if entity_before.entity_type in ['org','company']: # if str(entity_before.label)=="1": # addMoneyByEntity(PackDict, 
packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label]) # #add pointer_money # entity_before.pointer_money = entity_money # break # p_entity_money -= 1 #如果实体属于角色集合,则往后找属性 if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet): p_entity += 1 #循环查找符合的属性 while(p_entity=sentence_len: p_entity -= 1 break #若是遇到公司实体,则跳出循环 if entity_after.entity_type in ['org','company']: p_entity -= 1 break if entity_after.values is not None: if entity_after.entity_type=="money": if entity_after.values[entity_after.label]>=on_value: ''' #招标金额从后往前找 if str(entity_after.label)=="0": packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label)) if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label]) ''' if str(entity_after.label)=="1": #print(entity_after.entity_text,entity.entity_text) _list_entitys = [entity]+entity.linked_entitys if len(PackageSet)>0: packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label)) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text else: packageName_entity = "Project" if str(entity.label) in ["2","3","4"]: # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label]) if entity_after.notes == '单价': addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after, 0.5) entity.pointer_money = entity_after # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) else: addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after, entity_after.values[entity_after.label]) entity.pointer_money = entity_after # print('role 
zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额 #add pointer_money # entity.pointer_money = entity_after # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) # if entity_after.notes!='单价': # break # 2021/7/16 新增,找到中标金额即停止,不再往后找金额 ''' if entity_after.entity_type=="person": if entity_after.values[entity_after.label]>=on_value_person: if str(entity_after.label)=="1": for i in range(len(roleList)): if roleList[i].role_name=="tenderee": roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) elif str(entity_after.label)=="2": for i in range(len(roleList)): if roleList[i].role_name=="agency": roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) elif str(entity_after.label)=="3": _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: for i in range(len(roleList)): if roleList[i].entity_text==_entity.entity_text: if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0: break roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) ''' p_entity += 1 p_entity += 1 '''''' # 通过模型分类的招标/代理联系人 list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index) person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]] tenderee_contact = set() tenderee_phone = set() agency_contact = set() agency_phone = set() for _person in person_list: if _person.label == 1: tenderee_contact.add(_person.entity_text) if _person.label == 2: agency_contact.add(_person.entity_text) # 正则匹配无 '主体/联系人' 的电话 # 例:"采购人联系方式:0833-5226788," re_tenderee_phone = re.compile( "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)" # 电话号码 
"(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)") # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788," re_tenderee_phone2 = re.compile( "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)" # 电话号码 "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)") re_agent_phone = re.compile( "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)" # 电话号码 "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)") re_agent_phone2 = re.compile( "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)" # 电话号码 "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)") content = "" for _sentence in list_sentence: content += "".join(_sentence.tokens) _content = copy.deepcopy(content) while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content): content_words = list(content) for i in re.finditer("(.)(,)([^0-9])", content): content_words[i.span(2)[0]] = "" for i in re.finditer("([^0-9])(,)(.)", content): content_words[i.span(2)[0]] = "" content = "".join(content_words) content = re.sub("[::]|[\((]|[\))]", "", content) _tenderee_phone = re.findall(re_tenderee_phone, content) # 更新正则确定的角色属性 for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name == "tenderee": _tenderee_phone = re.findall(re_tenderee_phone, 
content) if _tenderee_phone: for _phone in _tenderee_phone: PackDict["Project"]["roleList"][i].linklist.append(("", _phone)) tenderee_phone.add(_phone) _tenderee_phone2 = re.findall(re_tenderee_phone2, content) if _tenderee_phone2: for _phone in _tenderee_phone2: PackDict["Project"]["roleList"][i].linklist.append(("", _phone)) tenderee_phone.add(_phone) if PackDict["Project"]["roleList"][i].role_name == "agency": _agent_phone = re.findall(re_agent_phone, content) if _agent_phone: for _phone in _agent_phone: PackDict["Project"]["roleList"][i].linklist.append(("", _phone)) agency_phone.add(_phone) _agent_phone2 = re.findall(re_agent_phone2, content) if _agent_phone2: for _phone in _agent_phone2: PackDict["Project"]["roleList"][i].linklist.append(("", _phone)) agency_phone.add(_phone) # km配对方法 def dispatch(match_list): main_roles = list(set([match.main_role for match in match_list])) attributes = list(set([match.attribute for match in match_list])) label = np.zeros(shape=(len(main_roles), len(attributes))) for match in match_list: main_role = match.main_role attribute = match.attribute value = match.value label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000 # print(label) gragh = -label # km算法 row, col = linear_sum_assignment(gragh) max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value] # return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch] return [(main_roles[row], attributes[col]) for row, col in max_dispatch] # 正则提取电话号码实体 key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})') phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' '\+86.?1[3|4|5|6|7|8|9]\d{9}|' '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' '0[^0]\d{1,2}[-—-―]\d{7,8}转\d{1,4}|' '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|' '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' 
'[\(|\(]0[^0]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' '[2-9]\d{6,7}') phone_entitys = [] for _sentence in list_sentence: sentence_text = _sentence.sentence_text list_tokenbegin = [] begin = 0 for i in range(0, len(_sentence.tokens)): list_tokenbegin.append(begin) begin += len(str(_sentence.tokens[i])) list_tokenbegin.append(begin + 1) res_set = set() for i in re.finditer(phone, sentence_text): res_set.add((i.group(), i.start(), i.end())) # for i in re.finditer(key_word, sentence_text): # res_set.add((i.group(2), i.start() + len(i.group(1)), i.end())) for item in list(res_set): phone_left = sentence_text[max(0, item[1] - 10):item[1]] phone_right = sentence_text[item[2]:item[2] + 8] # 排除“传真号”和其它错误项 if re.search("传,?真|信,?箱|邮,?箱", phone_left): if not re.search("电,?话", phone_left): continue if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left): continue if re.search("[.,]\d{2,}", phone_right): continue for j in range(len(list_tokenbegin)): if list_tokenbegin[j] == item[1]: begin_index = j break elif list_tokenbegin[j] > item[1]: begin_index = j - 1 break for j in range(begin_index, len(list_tokenbegin)): if list_tokenbegin[j] >= item[2]: end_index = j - 1 break _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1], item[2]) phone_entitys.append(_entity) def is_company(entity,text): # 判断"公司"实体是否为地址地点 if entity.label!=5 and entity.values[entity.label]>0.5: return True if ent.is_tail==True: return False entity_left = text[max(0,entity.wordOffset_begin-10):entity.wordOffset_begin] entity_left = re.sub(",()\(\)::","",entity_left) entity_left = entity_left[-5:] if re.search("地址|地点",entity_left): return False else: return True pre_entity = [] for ent in list_entity: if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \ or (ent.entity_type=='location' and len(ent.entity_text)>5): 
pre_entity.append(ent) text_data,pre_data = relationExtraction_model.get_predata(pre_entity+phone_entitys,list_sentence) # print(pre_data) maxlen = 512 relation_list = [] if 00: value = (-1 / 2 * (distance ** 2))/10000 else: distance = abs(distance) value = (-1 / 2 * (distance ** 2)) _match_list.append(Match(_subject,_object,value)) _match_combo.append((_subject,_object)) match_result = dispatch(_match_list) error_list = [] for mat in list(set(_match_combo)-set(match_result)): for temp in match_result: if mat[1]==temp[1] and mat[0]!=temp[0]: error_list.append(mat) break result = list(set(_match_combo)-set(error_list)) if predicate=='rel_person': # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接) result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: is_continue = False if not combo[0].pointer_person: combo[0].pointer_person = [] if combo[1].begin_indexcombo[0].begin_index: is_continue = True break if is_continue: continue combo[0].pointer_person.append(combo[1]) linked_company.add(combo[0]) linked_person.add(combo[1]) # print(1,combo[0].entity_text,combo[1].entity_text) if predicate=='rel_address': result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: if combo[0].pointer_address: continue combo[0].pointer_address = combo[1] # print(2,combo[0].entity_text,combo[1].entity_text) if predicate=='rel_phone': result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: is_continue = False if not combo[0].person_phone: combo[0].person_phone = [] if combo[1].begin_indexcombo[0].begin_index: is_continue = True break if is_continue: continue combo[0].person_phone.append(combo[1]) if combo[0].label in [1,2]: if PackDict.get("Project"): for i in range(len(PackDict["Project"]["roleList"])): if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \ or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'): 
PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text)) break # print(3,combo[0].entity_text,combo[1].entity_text) # 更新 PackDict for link_p in list(linked_company): for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": if PackDict[k]["roleList"][i].entity_text == link_p.entity_text or link_p.label == 0: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in agency_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in agency_contact and _p.entity_text not in agency_phone: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) elif PackDict[k]["roleList"][i].role_name == "agency": if PackDict[k]["roleList"][i].entity_text == link_p.entity_text or link_p.label == 1: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in tenderee_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) else: if PackDict[k]["roleList"][i].entity_text == link_p.entity_text: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \ per.entity_text not in agency_contact and _p.entity_text not in agency_phone: 
PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、") split_list = [0] * 16 split_dict = { "一、": 1, "二、": 2, "三、": 3, "四、": 4, "五、": 5, "六、": 6, "七、": 7, "八、": 8, "九、": 9, "十、": 10, "十一、": 11, "十二、": 12, "十三、": 13, "十四、": 14, "十五、": 15 } for item in re.finditer(re_split, _content): _index = split_dict.get(item.group()[1:]) if not split_list[_index]: split_list[_index] = item.span()[0] + 1 split_list = [i for i in split_list if i != 0] start = 0 new_split_list = [] for idx in split_list: new_split_list.append((start, idx)) start = idx new_split_list.append((start, len(_content))) # 实体列表按照“公告分段”分组 words_num_dict = dict() last_words_num = 0 for sentence in list_sentence: _index = sentence.sentence_index if _index == 0: words_num_dict[_index] = 0 else: words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num last_words_num = len(sentence.sentence_text) # 公司-联系人连接(km算法) re_phone = re.compile('1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|' '0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,10}|' '0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|' '0\d{2,3}[-—-]?[1-9]\d{6,7}|' '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|' '[1-9]\d{6,7}') key_phone = re.compile("联系方式|电话|联系人|负责人") temporary_list2 = [] for entity in list_entity: # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False: if entity.entity_type in ['org', 'company', 'person']: temporary_list2.append(entity) temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index)) new_temporary_list2 = [] for _split in new_split_list: temp_list = [] for _entity in temporary_list2: if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[ _entity.sentence_index] + _entity.wordOffset_end < _split[1]: temp_list.append(_entity) elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]: break new_temporary_list2.append(temp_list) # 
print(new_temporary_list2) match_list2 = [] for split_index in range(len(new_temporary_list2)): split_entitys = new_temporary_list2[split_index] is_skip = False for index in range(len(split_entitys)): entity = split_entitys[index] if is_skip: is_skip = False continue else: if entity.entity_type in ['org', 'company']: if entity.label != 5 or entity.entity_text in roleSet: match_nums = 0 for after_index in range(index + 1, min(len(split_entitys), index + 4)): after_entity = split_entitys[after_index] if after_entity.entity_type in ['person']: # 实体为中标人/候选人,联系人已确定类别【1,2】 if entity.label in [2, 3, 4] and after_entity.label in [1, 2]: break if after_entity.label in [1, 2, 3]: distance = (tokens_num_dict[ after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) sentence_distance = after_entity.sentence_index - entity.sentence_index if sentence_distance == 0: if distance < 100: if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, after_entity, value)) match_nums += 1 else: if distance < 60: if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, after_entity, value)) match_nums += 1 if after_entity.entity_type in ['org', 'company']: # 解决在‘地址’中识别出org/company的问题 # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]: if entity.label != 5 and after_index == index + 1 and ( after_entity.label == entity.label or after_entity.label == 5): distance = (tokens_num_dict[ after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) if distance < 20: after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0, 
after_entity.begin_index - 10):after_entity.begin_index] after_entity_right = list_sentence[after_entity.sentence_index].tokens[ after_entity.end_index + 1:after_entity.end_index + 6] after_entity_left = "".join(after_entity_left) if len(after_entity_left) > 20: after_entity_left = after_entity_left[-20:] after_entity_right = "".join(after_entity_right)[:10] if re.search("地,?址", after_entity_left): is_skip = True continue if re.search("\(|(", after_entity_left) and re.search("\)|)", after_entity_right): is_skip = True continue if entity.label in [0, 1] and after_entity.label in [0, 1] and entity.label == after_entity.label: break if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[ index + 1].entity_type == "person": break if entity.label in [0, 1] and after_entity.label in [2, 3, 4]: break if entity.label in [2, 3, 4] and after_entity.label in [0, 1]: break # 搜索没有联系人的电话 mid_tokens = [] is_same_sentence = False if index == len(split_entitys) - 1: for i in range(entity.sentence_index, len(list_sentence)): mid_tokens += list_sentence[i].tokens mid_tokens = mid_tokens[entity.end_index + 1:] mid_sentence = "".join(mid_tokens) have_phone = re.findall(re_phone, mid_sentence) if have_phone: if re.findall(re_phone, mid_sentence.split("。")[0]): is_same_sentence = True _phone = have_phone[0] phone_begin = mid_sentence.find(_phone) if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \ new_split_list[split_index][1]: mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "") if re.search(key_phone, mid_sentence): distance = 1 if is_same_sentence: if phone_begin <= 200: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: if phone_begin <= 60: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: next_entity = split_entitys[index + 1] if entity.sentence_index 
== next_entity.sentence_index: mid_tokens += list_sentence[entity.sentence_index].tokens[ entity.end_index + 1:next_entity.begin_index] else: sentence_index = entity.sentence_index while sentence_index <= next_entity.sentence_index: mid_tokens += list_sentence[sentence_index].tokens sentence_index += 1 mid_tokens = mid_tokens[entity.end_index + 1:-(len( list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1] mid_sentence = "".join(mid_tokens) have_phone = re.findall(re_phone, mid_sentence) if have_phone: if re.findall(re_phone, mid_sentence.split("。")[0]): is_same_sentence = True _phone = have_phone[0] phone_begin = mid_sentence.find(_phone) mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "") if re.search(key_phone, mid_sentence): p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else [] if next_entity.entity_type == 'person' and _phone in p_phone: pass else: distance = (tokens_num_dict[ next_entity.sentence_index] + next_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) distance = distance / 2 if is_same_sentence: if phone_begin <= 200: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: if phone_begin <= 60: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 # 实体无匹配时,尝试前向查找匹配 if not match_nums: if entity.label != 5 and entity.values[entity.label] > 0.5 and index != 0: previous_entity = split_entitys[index - 1] if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]: if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]: continue if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[ previous_entity.sentence_index] + previous_entity.end_index) if distance < 20: # 
距离相等时,前向添加处罚值 # distance += 1 # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) match_list2.append(Match(entity, previous_entity, value)) # print(match_list2) match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person] # print(match_list2) # km算法分配求解 result2 = dispatch(match_list2) # print(result2) linked_person = [] linked_persons_with = [] for match in result2: entity = match[0] # print(entity.entity_text) # print(match.attribute) entity_index = list_entity.index(entity) is_update = False if isinstance(match[1], tuple): person_ = '' phone_ = [match[1][1]] else: person_ = match[1].entity_text phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else [] for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": if not PackDict[k]["roleList"][i].linklist: if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0: if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0: if not phone_: PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True elif PackDict[k]["roleList"][i].role_name == "agency": if not PackDict[k]["roleList"][i].linklist: if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1: if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0: if not phone_: PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True else: if PackDict[k]["roleList"][i].entity_text == entity.entity_text: if not PackDict[k]["roleList"][i].linklist: if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \ person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0: if not phone_: 
PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True if not person_: is_update = False if is_update: # 更新 list_entity if not list_entity[entity_index].pointer_person: list_entity[entity_index].pointer_person = [] list_entity[entity_index].pointer_person.append(match[1]) linked_person.append(match[1]) linked_persons_with.append(entity) # 一个公司对应多个联系人的补充 person_entitys = [entity for entity in list_entity if entity.entity_type=='person'] person_entitys = person_entitys[::-1] for index in range(len(person_entitys)): entity = person_entitys[index] prepare_link = [] if entity not in linked_person: prepare_link.append(entity) last_person = entity for after_index in range(index + 1, min(len(person_entitys), index + 5)): after_entity = person_entitys[after_index] if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5: if after_entity in linked_person: _index = linked_person.index(after_entity) with_company = linked_persons_with[_index] for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name == "tenderee": if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) with_company.pointer_person.append(item) linked_person.append(item) elif PackDict["Project"]["roleList"][i].role_name == "agency": if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) 
with_company.pointer_person.append(item) linked_person.append(item) else: if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) with_company.pointer_person.append(item) linked_person.append(item) break else: prepare_link.append(after_entity) last_person = after_entity continue # 统一同类角色的属性 if PackDict.get("Project"): for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]: for _entity in list_entity: if _entity.entity_type in ['org','company']: is_similar = False # entity_text相同 if _entity.entity_text==PackDict["Project"]["roleList"][i].entity_text: is_similar = True # entity.label为【0,1】 if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict["Project"]["roleList"][i].role_name: is_similar = True if is_similar: linked_entitys = _entity.linked_entitys if linked_entitys: for linked_entity in linked_entitys: pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else [] for _pointer_person in pointer_person: _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else [] for _p in _phone: if (_pointer_person.entity_text,_p) not in PackDict["Project"]["roleList"][i].linklist: PackDict["Project"]["roleList"][i].linklist.append((_pointer_person.entity_text,_p)) # "roleList"中联系人电话去重 for i in range(len(PackDict["Project"]["roleList"])): # print(123, PackDict["Project"]["roleList"][i].linklist) # 带有联系人的电话 with_person = [person_phone[1] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[0]] # 带有电话的联系人 with_phone = [person_phone[0] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[1]] remove_list = [] for item in 
PackDict["Project"]["roleList"][i].linklist: if not item[0]: if item[1] in with_person: # 删除重复的无联系人电话 remove_list.append(item) elif not item[1]: if item[0] in with_phone: remove_list.append(item) for _item in remove_list: PackDict["Project"]["roleList"][i].linklist.remove(_item) # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。 # temp_ent_list = [] # 临时列表,记录0,1角色及3联系人 # other_person = [] # 阈值以上的联系人列表 # link_person = [] # 有电话没联系上角色的person列表 # other_ent = [] # link_ent = [] # found_person = False # ent_list = [] # for entity in list_entity: # if entity.entity_type in ['org','company','person']: # ent_list.append(entity) # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']] # #for list_index in range(len(ent_list)): # #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2on_value_person: # if str(entity.label)=="1": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="tenderee": # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # link_person.append(entity.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # # add pointer_person # for _entity in list_entity: # if dict_role_id.get(str(_entity.label))=="tenderee": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee": # _entity.pointer_person = entity # elif str(entity.label)=="2": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="agency": # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # link_person.append(entity.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # # add pointer_person # for _entity in 
list_entity: # if dict_role_id.get(str(_entity.label))=="agency": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency": # _entity.pointer_person = entity # elif str(entity.label)=="3": # if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人 # continue # #not_link_person.append((entity_after.entity_text,entity_after.person_phone)) # other_person.append(entity.entity_text) # temp_ent_list.append((entity.entity_text,entity.person_phone,entity)) # # #if entity.entity_text in roleSet: # if entity.entity_text in roleSet: # if entity.label in [0,1]: # other_ent.append(entity.entity_text) # temp_ent_list.append((entity.entity_text, entity.label,entity)) # for behind_index in range(index+1, len(ent_list)): # entity_after = ent_list[behind_index] # if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人 # break # if entity_after.values is not None: # if entity_after.entity_type=="person": # if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找 # break # if entity_after.values[entity_after.label]>on_value_person: # if str(entity_after.label)=="1": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="tenderee": # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # elif str(entity_after.label)=="2": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="agency": # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # elif 
str(entity_after.label)=="3": # if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找 # break # elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止 # break # for pack in PackDict.keys(): # for i in range(len(PackDict[pack]["roleList"])): # if PackDict[pack]["roleList"][i].entity_text==entity.entity_text: # #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0: # #break # PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # #add pointer_person # entity.pointer_person = entity_after # # not_link_person = [person for person in other_person if person not in link_person] # not_link_ent = [ent for ent in other_ent if ent not in link_ent] # if len(not_link_person) > 0 and len(not_link_ent) > 0 : # item = temp_ent_list # for i in range(len(item)): # if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item): # if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person: # item[i+1], item[i+2] = item[i+2], item[i+1] # for i in range(len(item)-1, -1, -1): # if item[i][0] in not_link_ent: # for pack in PackDict.keys(): # for role in PackDict[pack]["roleList"]: # if role.entity_text == item[i][0] and len(role.linklist) < 1: # for j in range(i+1, len(item)): # if item[j][0] in not_link_person: # role.linklist.append(item[j][:2]) # #add pointer_person # item[i][2].pointer_person = item[j][2] # break # else: # break # # 电话没有联系人的处理 # role_with_no_phone = [] # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]: # if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人 # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text) # else: # phone_nums = 0 # for link in PackDict["Project"]["roleList"][i].linklist: # if 
link[1]: # phone_nums += 1 # break # if not phone_nums: # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text) # if role_with_no_phone: # phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"] # # phone_with_person = [phone for phone in phone_with_person if phone] # # dict_index_sentence = {} # for _sentence in list_sentence: # dict_index_sentence[_sentence.sentence_index] = _sentence # new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']] # for index in range(len(new_entity_list)): # entity = new_entity_list[index] # if entity.entity_text in role_with_no_phone: # e_sentence = dict_index_sentence[entity.sentence_index] # entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40] # entity_right = "".join(entity_right) # if index+1-1: # entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)] # have_phone = re.findall(phone,entity_right) # if have_phone: # _phone = have_phone[0] # phone_begin = entity_right.find(_phone) # if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]): # # entity.person_phone = _phone # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text: # PackDict["Project"]["roleList"][i].linklist.append(('', _phone)) #寻找多标段招标金额 p_entity = len(list_entity)-1 set_tenderer_money = set() list_tenderer_money = [] #2021/7/16 新增列表,倒序保存所有中标金额 #遍历所有实体 while(p_entity>=0): entity = list_entity[p_entity] if entity.entity_type=="money": if entity.values[entity.label]>=on_value: if str(entity.label)=="1": set_tenderer_money.add(float(entity.entity_text)) list_tenderer_money.append(float(entity.entity_text)) # 2021/7/16 新增列表,倒序保存所有中标金额 # if str(entity.label)=="0": if str(entity.label)=="0" and entity.notes!='总投资': ''' if p_entity>0: p_before = list_entity[p_entity-1] if p_before.entity_type=="money" and 
p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2: p_entity -= 1 continue ''' packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L") if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text if packageName=="Project": # if PackDict["Project"]["tendereeMoney"]on_value: PackDict["Project"]["tendereeMoney"] = float(entity.entity_text) else: PackDict[packageName]["tendereeMoney"] = float(entity.entity_text) #add pointer_tendereeMoney packagePointer.pointer_tendereeMoney = entity p_entity -= 1 #删除一个机构有多个角色的数据 #删除重复人、概率不回传 final_roleList = [] list_pop = [] set_tenderer_role = set() dict_pack_tenderer_money = dict() for pack in PackDict.keys(): #删除无效包 if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0: list_pop.append(pack) for i in range(len(PackDict[pack]["roleList"])): if PackDict[pack]["roleList"][i].role_name=="win_tenderer": if PackDict[pack]["roleList"][i].money==0: set_tenderer_role.add(PackDict[pack]["roleList"][i]) dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()] #找到包的中投标金额 for _index in range(len(PackageList)): if "hit" in PackageList[_index]: for _hit in list(PackageList[_index]["hit"]): _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None: dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money) #只找到一个中标人和中标金额 if len(set_tenderer_money)==1 and len(set_tenderer_role)==1: list(set_tenderer_role)[0].money = list(set_tenderer_money)[0] # print('一个中标人一个金额:', list(set_tenderer_money)[0]) #找到一个中标人和多个招标金额 if len(set_tenderer_money)>1 and len(set_tenderer_role)==1: _maxMoney = 0 _sumMoney = 0 for _m in list(set_tenderer_money): _sumMoney += _m if _m>_maxMoney: _maxMoney = _m if 
_sumMoney/_maxMoney==2: list(set_tenderer_role)[0].money = _maxMoney # print('一人多金额分项合计 取最大金额:', _maxMoney) else: # list(set_tenderer_role)[0].money = _maxMoney if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000: list(set_tenderer_role)[0].money = min(list_tenderer_money) # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money)) else: list(set_tenderer_role)[0].money = list_tenderer_money[-1] # 2021/7/16 修改 不是单价合计方式取第一个中标金额 # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1]) #每个包都只找到一个金额 _flag_pack_money = True for k,v in dict_pack_tenderer_money.items(): if len(v[1])!=1: _flag_pack_money = False if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()): for k,v in dict_pack_tenderer_money.items(): v[0].money = list(v[1])[0] # print('k,v in dict_pack_tenderer_money.items', k, v) # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑 for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): if PackDict[pack]["tendereeMoney"] > 0: # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money)) if float(PackDict[pack]["roleList"][i].money) >10000000 and \ float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000: PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000 # print('招标金额校正中标金额') # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额 for pack in PackDict.keys(): tmp_moneys = [] for i in range(len(PackDict[pack]["roleList"])): if float(PackDict[pack]["roleList"][i].money) >100000: tmp_moneys.append(float(PackDict[pack]["roleList"][i].money)) if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000: for i in range(len(PackDict[pack]["roleList"])): if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000: PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000 # print('通过其他中标人投标金额校正中标金额') for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): PackDict[pack]["roleList"][i] = 
def initPackageAttr(RoleList,PackageSet):
    '''
    @summary: Build the initial per-package result dict from the extracted roles and package names
    @param:
        RoleList: iterable of role records; packageName, packageCode, role_name and entity_text are read from each
        PackageSet: iterable of package names; each one gets its own empty entry
    @return: dict keyed by package name (plus the always-present "Project" entry), each value being
             {"code": str, "tendereeMoney": 0, "roleList": [Role, ...]}
    '''
    packDict = dict()
    packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]}
    for item in PackageSet:
        packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]}
    for item in RoleList:
        pack_attr = packDict[item.packageName]
        # Only the first non-empty package code seen for a package is kept.
        if pack_attr["code"]=="":
            pack_attr["code"] = item.packageCode
        pack_attr["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
    return packDict


def getPackageRoleMoney(list_sentence,list_entity):
    '''
    @summary: Extract the package / package code / role / entity name / money / contact / phone structure of a document
    @param:
        list_sentence: list of the document's sentences
        list_entity: list of the document's entities
    @return: [] when getRoleList yields nothing, otherwise the package dict returned by findAttributeAfterEntity
    '''
    theRole = getRoleList(list_sentence,list_entity)
    if not theRole:
        return []
    RoleList,RoleSet,PackageList,PackageSet = theRole
    PackDict = initPackageAttr(RoleList, PackageSet)
    PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity, list_sentence)
    return PackDict


def turnBidWay(bidway):
    '''
    @summary: Normalize an extracted bidding-method string to one of the canonical bidding-method names
    @param:
        bidway: the extracted bidding-method text
    @return: the canonical name whose alias list contains bidway exactly, or "其他" when there is no exact match
    '''
    # Every alias group is a real tuple. Writing ("单一来源") without a trailing
    # comma yields a plain string, which turns `in` into a substring test: ""
    # or "来源" would then match, and a non-string value would raise TypeError.
    alias_groups = (
        ("邀请招标", ("邀请招标","采购方式:邀请")),
        ("询价", ("询价","询单","询比","采购方式:询价")),
        ("竞争性谈判", ("竞谈","竞争性谈判","公开竞谈")),
        ("竞争性磋商", ("竞争性磋商","磋商")),
        ("竞价", ("竞价","竞标","电子竞价","以电子竞价","电子书面竞投")),
        ("公开招标", ("公开招标","网上电子投标","网上招标","采购方式:公开","招标为其他")),
        ("单一来源", ("单一来源",)),
        ("比选", ("比选",)),
    )
    for normalized, aliases in alias_groups:
        if bidway in aliases:
            return normalized
    return "其他"
dict_other["moneysource"] = entity.entity_text elif entity.entity_type=='serviceTime': dict_other["serviceTime"] = entity.entity_text elif entity.entity_type == 'time' and entity.label==1: dict_other["time_release"] = timeFormat(entity.entity_text) elif entity.entity_type == 'time' and entity.label==2: dict_other["time_bidopen"] = timeFormat(entity.entity_text) elif entity.entity_type == 'time' and entity.label == 3: dict_other["time_bidclose"] = timeFormat(entity.entity_text) elif entity.entity_type=="person" and entity.label ==4: dict_other["person_review"].append(entity.entity_text) elif entity.entity_type=='product': dict_other["product"].append(entity.entity_text) elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]\ \ \ \ \ \ \ \ ') for item in result: f.write(""+""+""+""+"") f.write("
doc_id角色
"+item[0]+""+item[1]+""+item[2]+"
") '''