from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date from BiddingKG.dl.interface.Entitys import PREM,Role,Entity from decimal import Decimal import re import copy import math import pandas as pd import os from scipy.optimize import linear_sum_assignment from BiddingKG.dl.interface.Entitys import Match import numpy as np def getTheRole(entity,role_list): ''' @summary:根据实体名称拿到index @param: entity:实体名称 role_list:角色list @return:该实体所在下标 ''' for role_index in range(len(role_list)): if entity in role_list[role_index]: return role_index return None dict_role_id = {"0":"tenderee", "1":"agency", "2":"win_tenderer", "3":"second_tenderer", "4":"third_tenderer"} def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None): ''' @param: packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合 sentence_index:实体所在的句子 begin_index:实体所在句子的起始位置 @return:公司实体所属的包 @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None ''' ''' if len(packageList)==0: return None before_index = None after_index = None equal_index = None equal_count = 0 for pack_index in range(len(packageList)): if packageList[pack_index][1]>sentence_index and after_index is None: after_index = pack_index if packageList[pack_index][1]int(begin_index): if packageList[i-1][4]: return packageList[i-1][0] else: if packageList[i][4]: return packageList[i-1][0] else: return packageList[i][0] return packageList[end_index-1][0] ''' if len(packageList)==0: return None,False list_legalPack = [] for pack_index in range(len(packageList)): if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)): continue if DIRECT=="R" and (packageList[pack_index]["sentence_index"]sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)): if MAX_DIS is not None: if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS: list_legalPack.append(pack_index) else: list_legalPack.append(pack_index) # if (packageList[pack_index]["scope"][0][0] < sentence_index # or (packageList[pack_index]["scope"][0][0] == sentence_index # and packageList[pack_index]["scope"][0][1] <= begin_index)) # and (packageList[pack_index]["scope"][1][0] > sentence_index # or (packageList[pack_index]["scope"][1][0] == sentence_index # and packageList[pack_index]["scope"][1][1] >= begin_index)): # pass _flag = True for _index in list_legalPack: if roleid in packageList[_index]["hit"]: continue else: _flag = False packageList[_index]["hit"].add(roleid) return packageList[_index]["pointer"],_flag if len(list_legalPack)>0: return packageList[0]["pointer"],_flag return None,False #生成合法的组合 def get_legal_comba(list_entity,dict_role_combination): #拿到一个包中所有合法的组合 def circle_package(_dict_legal_combination): list_dict_role_first = [] for _role in _dict_legal_combination: if len(list_dict_role_first)==0: for _entity in _dict_legal_combination[_role]: if _entity !="": list_dict_role_first.append({_role:_entity}) else: list_dict_role_after = [] _find_count = 0 for _entity in _dict_legal_combination[_role]: if _entity !="": for _dict in list_dict_role_first: _flag = True for _key1 in _dict: if _entity==_dict[_key1]: #修改为招标人和代理人可以为同一个 if str(_key1) in ["0","1"] and str(_role) in ["0","1"]: _flag = True else: _flag = False if _flag: _find_count += 1 _new_dict = copy.copy(_dict) _new_dict[_role] = _entity if len(list_dict_role_after)>100000: break list_dict_role_after.append(_new_dict) else: # 2021/5/25 update,同一实体(entity_text)不同角色 if len(list_dict_role_after) > 100000: break for _dict in list_dict_role_first: for _key1 in _dict: if _entity == _dict[_key1]: _new_dict = copy.copy(_dict) _new_dict.pop(_key1) _new_dict[_role] = _entity list_dict_role_after.append({_role:_entity}) if len(list_dict_role_after)==0: pass else: list_dict_role_first.extend(list_dict_role_after) return list_dict_role_first def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution): last_layer = False #若是空组合则放回空 if len(_dict_legal_combination.keys())==0: return [] #递归到最后一层则修改状态 if len(_dict_legal_combination.keys())==1: last_layer = True #取一个角色开始进行遍历 _key_role = list(_dict_legal_combination.keys())[0] for item in _dict_legal_combination[_key_role]: copy_dict_one_selution = copy.copy(dict_one_selution) copy_dict_legal_combination = {} copy_set_legal_entity = copy.copy(set_legal_entity) #复制余下的所有角色,进行下一轮递归 for _key in _dict_legal_combination.keys(): if _key!=_key_role: copy_dict_legal_combination[_key] = _dict_legal_combination[_key] #修改为招标人和代理人可以为同一个 if item !="": _flag = True if str(_key_role) in ["0","1"]: for _key_flag in copy_dict_one_selution: if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item: _flag = False else: for _key_flag in copy_dict_one_selution: if copy_dict_one_selution[_key_flag]==item: _flag = False if _flag: copy_dict_one_selution[_key_role] = item ''' if item not in copy_set_legal_entity: if item !="": copy_dict_one_selution[_key_role] = item ''' copy_set_legal_entity.add(item) if last_layer: list_all_selution.append(copy_dict_one_selution) else: recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution) #递归匹配各个包的结果 def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution): last_layer = False if len(_dict_legal_combination.keys())==0: return [] if len(_dict_legal_combination.keys())==1: last_layer = True _key_pack = list(_dict_legal_combination.keys())[0] for item in _dict_legal_combination[_key_pack]: copy_dict_one_selution = copy.copy(dict_one_selution) copy_dict_legal_combination = {} for _key in _dict_legal_combination.keys(): if _key!=_key_pack: copy_dict_legal_combination[_key] = _dict_legal_combination[_key] for _key_role in item.keys(): copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role] if last_layer: list_all_selution.append(copy_dict_one_selution) else: recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution) return list_all_selution #循环获取所有包组合 def circle_pageages(_dict_legal_combination): list_all_selution = [] for _key_pack in _dict_legal_combination.keys(): list_key_selution = [] for item in _dict_legal_combination[_key_pack]: _dict = dict() for _key_role in item.keys(): _dict[_key_pack+"$$"+_key_role] = item[_key_role] list_key_selution.append(_dict) if len(list_all_selution)==0: list_all_selution = list_key_selution else: _list_all_selution = [] for item_1 in list_all_selution: for item_2 in list_key_selution: _list_all_selution.append(dict(item_1,**item_2)) list_all_selution = _list_all_selution return list_all_selution #拿到各个包解析之后的结果 _dict_legal_combination = {} for packageName in dict_role_combination.keys(): _list_all_selution = [] # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution) _list_all_selution = circle_package(dict_role_combination[packageName]) ''' # print("===1") # print(packageName) for item in _list_all_selution: # print(item) # print("===2") ''' #去除包含子集 list_all_selution_simple = [] _list_set_all_selution = [] for item_selution in _list_all_selution: item_set_selution = set() for _key in item_selution.keys(): item_set_selution.add((_key,item_selution[_key])) _list_set_all_selution.append(item_set_selution) if len(_list_set_all_selution)>1000: _dict_legal_combination[packageName] = _list_all_selution continue for i in range(len(_list_set_all_selution)): be_included = False for j in range(len(_list_set_all_selution)): if i!=j: if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]): be_included = True if not be_included: list_all_selution_simple.append(_list_all_selution[i]) _dict_legal_combination[packageName] = list_all_selution_simple _list_final_comba = [] #对各个包的结果进行排列组合 _comba_count = 1 for _key in _dict_legal_combination.keys(): _comba_count *= len(_dict_legal_combination[_key]) #如果过大,则每个包只取概率最大的那个 dict_pack_entity_prob = get_dict_entity_prob(list_entity) if _comba_count>250: new_dict_legal_combination = dict() for _key_pack in _dict_legal_combination.keys(): MAX_PROB = -1000 _MAX_PROB_COMBA = None for item in _dict_legal_combination[_key_pack]: # print(_key_pack,item) _dict = dict() for _key in item.keys(): _dict[str(_key_pack)+"$$"+str(_key)] = item[_key] _prob = getSumExpectation(dict_pack_entity_prob, _dict) if _prob>MAX_PROB: MAX_PROB = _prob _MAX_PROB_COMBA = [item] if _MAX_PROB_COMBA is not None: new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA _dict_legal_combination = new_dict_legal_combination #recursive_packages(_dict_legal_combination, {}, _list_final_comba) _list_final_comba = circle_pageages(_dict_legal_combination) #除了Project包(招标人和代理人),其他包是不会有冲突的 #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪 _list_real_comba = [] for dict_item in _list_final_comba: set_project = set() set_other = set() for _key in list(dict_item.keys()): if _key.split("$$")[0]=="Project": set_project.add(dict_item[_key]) else: set_other.add(dict_item[_key]) set_common = set_project&set_other if len(set_common)>0: dict_project = {} dict_not_project = {} for _key in list(dict_item.keys()): if dict_item[_key] in set_common: if str(_key.split("$$")[0])=="Project": dict_project[_key] = dict_item[_key] else: dict_not_project[_key] = dict_item[_key] else: dict_project[_key] = dict_item[_key] dict_not_project[_key] = dict_item[_key] _list_real_comba.append(dict_project) _list_real_comba.append(dict_not_project) else: _list_real_comba.append(dict_item) return _list_real_comba def get_dict_entity_prob(list_entity,on_value=0.5): dict_pack_entity_prob = {} for in_attachment in [False,True]: identified_role = [] if in_attachment==True: identified_role = [value[0] for value in dict_pack_entity_prob.values()] for entity in list_entity: if entity.entity_type in ['org','company'] and entity.in_attachment==in_attachment: values = entity.values role_prob = float(values[int(entity.label)]) _key = entity.packageName+"$$"+str(entity.label) if role_prob>=on_value and str(entity.label)!="5": _key_prob = _key+"$text$"+entity.entity_text if in_attachment == True: if entity.entity_text in identified_role: continue if _key_prob in dict_pack_entity_prob: if role_prob>dict_pack_entity_prob[_key_prob][1]: dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob] else: dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob] return dict_pack_entity_prob #计算合计期望 def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5): ''' expect = 0 for entity in list_entity: if entity.entity_type in ['org','company']: values = entity.values role_prob = float(values[int(entity.label)]) _key = entity.packageName+"$$"+str(entity.label) if role_prob>on_value and str(entity.label)!="5": if _key in combination.keys() and combination[_key]==entity.entity_text: expect += math.pow(role_prob,4) else: expect -= math.pow(role_prob,4) ''' #修改为同一个实体只取对应包-角色的最大的概率值 expect = 0 dict_entity_prob = {} for _key_pack_entity in dict_pack_entity_prob: _key_pack = _key_pack_entity.split("$text$")[0] role_prob = dict_pack_entity_prob[_key_pack_entity][1] if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]: if _key_pack_entity in dict_entity_prob.keys(): if dict_entity_prob[_key_pack_entity]-role_prob: dict_entity_prob[_key_pack_entity] = -role_prob else: dict_entity_prob[_key_pack_entity] = -role_prob # for entity in list_entity: # if entity.entity_type in ['org','company']: # values = entity.values # role_prob = float(values[int(entity.label)]) # _key = entity.packageName+"$$"+str(entity.label) # if role_prob>=on_value and str(entity.label)!="5": # if _key in combination.keys() and combination[_key]==entity.entity_text: # _key_prob = _key+entity.entity_text # if _key_prob in dict_entity_prob.keys(): # if dict_entity_prob[_key_prob]-role_prob: # dict_entity_prob[_key_prob] = -role_prob # else: # dict_entity_prob[_key_prob] = -role_prob for _key in dict_entity_prob.keys(): symbol = 1 if dict_entity_prob[_key]>0 else -1 expect += symbol*math.pow(dict_entity_prob[_key],2) return expect def getRoleList(list_sentence,list_entity,on_value = 0.5): ''' @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回 @param: list_sentence:文章所有的sentence list_entity:文章所有的实体 on_value:概率阈值 @return:文章的角色list ''' pack = getPackagesFromArticle(list_sentence,list_entity) if pack is None: return None PackageList,PackageSet,dict_PackageCode = pack #拿到所有可能的情况 dict_role_combination = {} # print(PackageList) #拿到各个实体的packageName,packageCode for entity in list_entity: if entity.entity_type in ['org','company']: #限制附件里角色values[label]最大概率prob max_prob = 0.85 if str(entity.label)!="5" and entity.in_attachment: if entity.values[entity.label]>max_prob: entity.values[entity.label] = max_prob #过滤掉字数小于3个的实体 if len(entity.entity_text)<=3: continue values = entity.values role_prob = float(values[int(entity.label)]) if role_prob>=on_value and str(entity.label)!="5": if str(entity.label) in ["0","1"]: packageName = "Project" else: if len(PackageSet)>0: packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"role-"+str(entity.label)) if packagePointer is None: #continue packageName = "Project" # print(entity.entity_text, packageName,entity.sentence_index,entity.begin_index) else: #add pointer_pack entity.pointer_pack = packagePointer packageName = packagePointer.entity_text # print(entity.entity_text, packageName) else: packageName = "Project" find_flag = False if packageName in dict_PackageCode.keys(): packageCode = dict_PackageCode[packageName] else: packageCode = "" entity.packageCode = packageCode role_name = dict_role_id.get(str(entity.label)) entity.roleName = role_name entity.packageName = packageName if entity.packageName in dict_role_combination.keys(): if str(entity.label) in dict_role_combination[entity.packageName].keys(): dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text) else: dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text]) else: dict_role_combination[entity.packageName] = {} #初始化空值 roleIds = [0,1,2,3,4] for _roleId in roleIds: dict_role_combination[entity.packageName][str(_roleId)] = set([""]) dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text) list_real_comba = get_legal_comba(list_entity,dict_role_combination) # print("===role_combination",dict_role_combination) # print("== real_comba",list_real_comba) #拿到最大期望值的组合 max_index = 0 max_expect = -100 _index = 0 dict_pack_entity_prob = get_dict_entity_prob(list_entity) for item_combination in list_real_comba: expect = getSumExpectation(dict_pack_entity_prob, item_combination) if expect>max_expect: max_index = _index max_expect = expect _index += 1 RoleList = [] RoleSet = set() if len(list_real_comba)>0: for _key in list_real_comba[max_index].keys(): packageName = _key.split("$$")[0] label = _key.split("$$")[1] role_name = dict_role_id.get(str(label)) entity_text = list_real_comba[max_index][_key] if packageName in dict_PackageCode.keys(): packagecode = dict_PackageCode.get(packageName) else: packagecode = "" RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[])) RoleSet.add(entity_text) #根据最优树来修正list_entity中角色对包的连接 for _entity in list_entity: if _entity.pointer_pack is not None: _pack_name = _entity.pointer_pack.entity_text _find_flag = False for _prem in RoleList: if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text: _find_flag = True if not _find_flag: _entity.pointer_pack = None return RoleList,RoleSet,PackageList,PackageSet def getPackageScopePattern(): ''' @summary: 获取包的作用域关键词 ''' df = pd.read_excel(os.path.dirname(__file__)+"/end.xls") pattern = "(" for item in df["list_word"]: item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-") pattern += item+"|" pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}" return pattern pattern_packageScope = getPackageScopePattern() def getPackagesFromArticle(list_sentence,list_entity): ''' @param: list_sentence:文章的句子list @summary: 将包的信息插入list_entity中 @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息 ''' if len(list_sentence)==0: return None list_sentence.sort(key=lambda x:x.sentence_index) PackageList = [] PackageList_scope = [] PackageSet = set() dict_packageCode = dict() package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}") package_N_name_pattern = re.compile("(([^承]|^)分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}") package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号 # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段 other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目 win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整 model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整 number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}") package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)") # 纯数字类型的包号统一,例如:'01','1' re_digital = re.compile("^\d+$") def changeIndexFromWordToWords(tokens,word_index): ''' @summary:转换某个字的字偏移为词偏移 ''' before_index = 0 after_index = 0 for i in range(len(tokens)): after_index = after_index+len(tokens[i]) if before_index<=word_index and after_index>=word_index: return i before_index = after_index package_names = [] def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern): ''' @summary:抽取包附近的标段号 @param: tokens:包所在句子的分词 word_index:包所在字偏移 size:左右各取多少个词 pattern:提取标段号的正则 @return: type:string,meaning:标段号 ''' index = changeIndexFromWordToWords(tokens,word_index) if indexlen(tokens): end = len(tokens) else: end = index+size #拿到左右两边的词语组成短语 text = "".join(tokens[begin:end]) #在短语中的字偏移 new_word_index = word_index-len("".join(tokens[:begin])) min_distance = len(text) packageCode = None for the_iter in re.finditer(pattern,text): #算出最小距离 distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])]) if distance1: for i in range(len(list_sentence)): PackageList_item = [] PackageList_item_scope = [] content = list_sentence[i].sentence_text tokens = list_sentence[i].tokens names = re.findall(other_package_pattern, content) N_names = re.findall(win_tenderer_pattern, content) if len(names) != 1 or len(N_names) != 1: continue for iter in re.finditer(other_package_pattern,content): temp_package_number = iter.group(4) xinghao = re.search(model_pattern, content) if xinghao: temp_package_number = temp_package_number + '+' + xinghao.group(2) # print('新正则采购包名补充',temp_package_number) if re.search(re_digital,temp_package_number): temp_package_number = str(int(temp_package_number)) PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]}) # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) code = extractPackageCode(tokens, iter.span()[0]) if code is not None: dict_packageCode[temp_package_number] = code PackageSet.add(temp_package_number) #识别packageScope for iter in re.finditer(pattern_packageScope,content): PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]}) # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) PackageList_item_scope = PackageList_item +PackageList_item_scope PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"]) PackageList_scope = PackageList_scope+PackageList_item_scope PackageList_item.sort(key=lambda x:x["sentence_index"]) pattern_punctuation = "[::()\(\),,。;;]" # print("===packageList_scope",PackageList_scope) for i in range(len(list_sentence)): for j in range(len(PackageList_scope)): if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="": _flag = False left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1] right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30] _left_find = re.findall(pattern_punctuation,left_str) _right_find = re.findall(pattern_punctuation,right_str) #print(left_str) if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一": continue if re.search("划分",right_str[:10]) is not None: continue if len(_left_find)>0 and _left_find[-1] in [":",":"]: _flag = True if len(_right_find)>0 and _right_find[0] in [":",":"]: _flag = True if _flag: scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]] else: if j==0: scope_begin = [0,0] else: scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]] if j==len(PackageList_scope)-1: scope_end = [list_sentence[-1].sentence_index,changeIndexFromWordToWords(list_sentence[-1].tokens, len(list_sentence[-1].sentence_text))] else: scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]] if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]: continue #add package to entity _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"],in_attachment=list_sentence[i].in_attachment) list_entity.append(_pack_entity) copy_pack = copy.copy(PackageList_scope[j]) copy_pack["scope"] = [scope_begin,scope_end] copy_pack["hit"] = set() copy_pack["pointer"] = _pack_entity PackageList.append(copy_pack) return PackageList,PackageSet,dict_packageCode # km配对方法 def dispatch(match_list): main_roles = list(set([match.main_role for match in match_list])) attributes = list(set([match.attribute for match in match_list])) label = np.zeros(shape=(len(main_roles), len(attributes))) for match in match_list: main_role = match.main_role attribute = match.attribute value = match.value label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000 # print(label) gragh = -label # km算法 row, col = linear_sum_assignment(gragh) max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value] # return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch] return [(main_roles[row], attributes[col]) for row, col in max_dispatch] from BiddingKG.dl.common.Utils import getUnifyMoney from BiddingKG.dl.interface.modelFactory import Model_relation_extraction relationExtraction_model = Model_relation_extraction() def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,on_value = 0.5,on_value_person=0.5,sentence_len=4): ''' @param: PackDict:文章包dict roleSet:文章所有角色的公司名称 PackageList:文章的包信息 PackageSet:文章所有包的名称 list_entity:文章所有经过模型处理的实体 on_value:金额模型的阈值 on_value_person:联系人模型的阈值 sentence_len:公司和属性间隔句子的最大长度 @return:添加了属性信息的角色list ''' #根据roleid添加金额到rolelist中 def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)): if money_prob>packDict[packageName]["roleList"][i].money_prob: packDict[packageName]["roleList"][i].money = money packDict[packageName]["roleList"][i].money_prob = money_prob return packDict #根据实体名称添加金额到rolelist中 def addMoneyByEntity(packDict,packageName,entity,money,money_prob): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].entity_text==entity: # if money_prob>packDict[packageName]["roleList"][i].money_prob: # packDict[packageName]["roleList"][i].money = money # packDict[packageName]["roleList"][i].money_prob = money_prob if packDict[packageName]["roleList"][i].money_prob==0 : # 2021/7/20第一次更新金额 packDict[packageName]["roleList"][i].money = money.entity_text packDict[packageName]["roleList"][i].money_prob = money_prob packDict[packageName]["roleList"][i].money_unit = money.money_unit elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额, # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob) # print('链接金额备注 ',money.notes, money.entity_text, money.values) packDict[packageName]["roleList"][i].money = money.entity_text packDict[packageName]["roleList"][i].money_prob = money_prob packDict[packageName]["roleList"][i].money_unit = money.money_unit # print('链接中的金额:{0}, 单位:{1}'.format(money.entity_text, money.money_unit)) return packDict def addRatioByEntity(packDict,packageName,entity,ratio): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].entity_text==entity: packDict[packageName]["roleList"][i].ratio = ratio.entity_text def addServiceTimeByEntity(packDict,packageName,entity,serviceTime): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].entity_text==entity: packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text #根据实体名称得到角色 def getRoleWithText(packDict,entity_text): for pack in packDict.keys(): for i in range(len(packDict[pack]["roleList"])): if packDict[pack]["roleList"][i].entity_text==entity_text: return packDict[pack]["roleList"][i].role_name def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet): _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: if _entity.entity_text in RoleSet: return True p_entity = 0 # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000 money_list = [it for it in list_entity if it.entity_type=="money"] for i in range(len(money_list)-1): for j in range(1, len(money_list)): if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \ Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000: money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000) # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000') #遍历所有实体 # while(p_entity=on_value: if str(entity.label)=="0": packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label)) if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label]) ''' ''' # 2020/11/25 与下面的联系人连接步骤重复,取消 if entity.entity_type=="person": if entity.values[entity.label]>=on_value_person: if str(entity.label)=="1": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="tenderee": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="tenderee": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee": _entity.pointer_person = entity elif str(entity.label)=="2": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="agency": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="agency": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency": _entity.pointer_person = entity ''' # #金额往前找实体 # if entity.entity_type=="money": # if entity.values[entity.label]>=on_value: # p_entity_money= p_entity # entity_money = list_entity[p_entity_money] # if len(PackageSet)>0: # packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label)) # if packagePointer is None: # packageName_entity = "Project" # else: # packageName_entity = packagePointer.entity_text # else: # packageName_entity = "Project" # while(p_entity_money>0): # entity_before = list_entity[p_entity_money] # if entity_before.entity_type in ['org','company']: # if str(entity_before.label)=="1": # addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label]) # #add pointer_money # entity_before.pointer_money = entity_money # break # p_entity_money -= 1 #如果实体属于角色集合,则往后找属性 # if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet): # # p_entity += 1 # #循环查找符合的属性 # while(p_entity=sentence_len: # p_entity -= 1 # break # #若是遇到公司实体,则跳出循环 # if entity_after.entity_type in ['org','company']: # p_entity -= 1 # break # if entity_after.values is not None: # if entity_after.entity_type=="money": # if entity_after.values[entity_after.label]>=on_value: # ''' # #招标金额从后往前找 # if str(entity_after.label)=="0": # packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label)) # if packagePointer is None: # packageName = "Project" # else: # packageName = packagePointer.entity_text # addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label]) # ''' # if str(entity_after.label)=="1": # #print(entity_after.entity_text,entity.entity_text) # _list_entitys = [entity]+entity.linked_entitys # if len(PackageSet)>0: # packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label)) # if packagePointer is None: # packageName_entity = "Project" # else: # packageName_entity = packagePointer.entity_text # else: # packageName_entity = "Project" # if str(entity.label) in ["2","3","4"]: # # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label]) # if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况 # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after, # 0.5) # entity.pointer_money = entity_after # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) # else: # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after, # entity_after.values[entity_after.label]) # entity.pointer_money = entity_after # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) # if entity_after.values[entity_after.label]>0.6: # break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额 # #add pointer_money # # entity.pointer_money = entity_after # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) # # if entity_after.notes!='单价': # # break # 2021/7/16 新增,找到中标金额即停止,不再往后找金额 # ''' # if entity_after.entity_type=="person": # if entity_after.values[entity_after.label]>=on_value_person: # if str(entity_after.label)=="1": # for i in range(len(roleList)): # if roleList[i].role_name=="tenderee": # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # elif str(entity_after.label)=="2": # for i in range(len(roleList)): # if roleList[i].role_name=="agency": # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # elif str(entity_after.label)=="3": # _list_entitys = [entity]+entity.linked_entitys # for _entity in _list_entitys: # for i in range(len(roleList)): # if roleList[i].entity_text==_entity.entity_text: # if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0: # break # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # ''' # # p_entity += 1 # # p_entity += 1 # 记录每句的分词数量 tokens_num_dict = dict() last_tokens_num = 0 for sentence in list_sentence: _index = sentence.sentence_index if _index == 0: tokens_num_dict[_index] = 0 else: tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num last_tokens_num = len(sentence.tokens) attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额” for link_attribute in attribute_type: temp_entity_list = [] if link_attribute=="money": temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)] # 删除重复的‘中投标金额’,一般为大小写两种样式 drop_tendererMoney = [] for ent_idx in range(len(temp_entity_list)-1): entity = temp_entity_list[ent_idx] if entity.entity_type=='money': next_entity = temp_entity_list[ent_idx+1] if next_entity.entity_type=='money': if getUnifyMoney(entity.entity_text)==getUnifyMoney(next_entity.entity_text): if (tokens_num_dict[next_entity.sentence_index] + next_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) < 10: drop_tendererMoney.append(next_entity) for _drop in drop_tendererMoney: temp_entity_list.remove(_drop) elif link_attribute=="serviceTime": temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or ent.entity_type=='serviceTime'] elif link_attribute=="ratio": temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or ent.entity_type=='ratio'] temp_entity_list = sorted(temp_entity_list,key=lambda x: (x.sentence_index, x.begin_index)) temp_match_list = [] for ent_idx in range(len(temp_entity_list)): entity = temp_entity_list[ent_idx] if entity.entity_type in ['org','company']: match_nums = 0 tenderer_nums = 0 #经过其他中投标人的数量 byNotTenderer_match_nums = 0 #跟在中投标人后面的属性 for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)): after_entity = temp_entity_list[after_index] if after_entity.entity_type == link_attribute: distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) sentence_distance = after_entity.sentence_index - entity.sentence_index value = (-1 / 2 * (distance ** 2)) / 10000 if link_attribute == "money": if after_entity.notes == '单价': value = value * 100 if sentence_distance == 0: if distance < 100: # value = (-1 / 2 * (distance ** 2)) / 10000 temp_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if not tenderer_nums: byNotTenderer_match_nums += 1 else: break else: if distance < 60: # value = (-1 / 2 * (distance ** 2)) / 10000 temp_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if not tenderer_nums: byNotTenderer_match_nums += 1 else: break else: tenderer_nums += 1 #前向查找属性 if ent_idx!=0 and (not match_nums or not byNotTenderer_match_nums): previous_entity = temp_entity_list[ent_idx - 1] if previous_entity.entity_type == link_attribute: # if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index) if distance < 40: # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) temp_match_list.append(Match(entity, previous_entity, value)) # km算法分配求解 dispatch_result = dispatch(temp_match_list) dispatch_result = sorted(dispatch_result, key=lambda x: (x[0].sentence_index,x[0].begin_index)) for match in dispatch_result: _entity = match[0] _attribute = match[1] if link_attribute=='money': _entity.pointer_money = _attribute packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index, "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label)) # print(_entity.entity_text,_attribute.entity_text) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text if _attribute.notes == '单价' or float(_attribute.entity_text) < 5000: # 2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况 # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label]) addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,0.5) else: # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label]) addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute, _attribute.values[_attribute.label]) elif link_attribute=='serviceTime': _entity.pointer_serviceTime = _attribute packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index, "serviceTime-" + str(_attribute.entity_text) + "-" + str(_attribute.label)) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text addServiceTimeByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute) elif link_attribute=='ratio': _entity.pointer_ratio = _attribute packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index, "ratio-" + str(_attribute.entity_text) + "-" + str(_attribute.label)) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text addRatioByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute) '''''' # 通过模型分类的招标/代理联系人 list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index) person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]] tenderee_contact = set() tenderee_phone = set() agency_contact = set() agency_phone = set() winter_contact = set() for _person in person_list: if _person.label == 1: tenderee_contact.add(_person.entity_text) if _person.label == 2: agency_contact.add(_person.entity_text) # 正则匹配无 '主体/联系人' 的电话 # 例:"采购人联系方式:0833-5226788," phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \ '\+86.?1[3-9]\d{9}|' \ '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' \ '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' \ '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' \ '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|' \ '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' \ '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \ '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' \ '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' \ '[2-9]\d{6,7})' re_tenderee_phone = re.compile( "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)" # 电话号码 + phone_pattern) # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788," re_tenderee_phone2 = re.compile( "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)" # 电话号码 + phone_pattern) re_agent_phone = re.compile( "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)" # 电话号码 + phone_pattern) re_agent_phone2 = re.compile( "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)" # 电话号码 + phone_pattern) content = "" for _sentence in list_sentence: content += "".join(_sentence.tokens) _content = copy.deepcopy(content) while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content): content_words = list(content) for i in re.finditer("(.)(,)([^0-9])", content): content_words[i.span(2)[0]] = "" for i in re.finditer("([^0-9])(,)(.)", content): content_words[i.span(2)[0]] = "" content = "".join(content_words) content = re.sub("[::]|[\((]|[\))]", "", content) _tenderee_phone = re.findall(re_tenderee_phone, content) # 更新正则确定的角色属性 for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name == "tenderee": _tenderee_phone = re.findall(re_tenderee_phone, content) if _tenderee_phone: for _phone in _tenderee_phone: _phone = _phone.split("/") # 分割多个号码 for one_phone in _phone: PackDict["Project"]["roleList"][i].linklist.append(("", one_phone)) tenderee_phone.add(one_phone) _tenderee_phone2 = re.findall(re_tenderee_phone2, content) if _tenderee_phone2: for _phone in _tenderee_phone2: _phone = _phone.split("/") for one_phone in _phone: PackDict["Project"]["roleList"][i].linklist.append(("", one_phone)) tenderee_phone.add(one_phone) if PackDict["Project"]["roleList"][i].role_name == "agency": _agent_phone = re.findall(re_agent_phone, content) if _agent_phone: for _phone in _agent_phone: _phone = _phone.split("/") for one_phone in _phone: PackDict["Project"]["roleList"][i].linklist.append(("", one_phone)) agency_phone.add(one_phone) _agent_phone2 = re.findall(re_agent_phone2, content) if _agent_phone2: for _phone in _agent_phone2: _phone = _phone.split("/") for one_phone in _phone: PackDict["Project"]["roleList"][i].linklist.append(("", one_phone)) agency_phone.add(one_phone) # 正则提取电话号码实体 # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})') phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' '\+86.?1[3-9]\d{9}|' # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|' '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' '[2-9]\d{6,7}') url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+") email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@" "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})") phone_entitys = [] code_entitys = [ent for ent in list_entity if ent.entity_type=='code'] for _sentence in list_sentence: sentence_text = _sentence.sentence_text in_attachment = _sentence.in_attachment list_tokenbegin = [] begin = 0 for i in range(0, len(_sentence.tokens)): list_tokenbegin.append(begin) begin += len(str(_sentence.tokens[i])) list_tokenbegin.append(begin + 1) # 排除网址、邮箱、项目编号实体 error_list = [] for i in re.finditer(url_pattern, sentence_text): error_list.append((i.start(), i.end())) for i in re.finditer(email_pattern, sentence_text): error_list.append((i.start(), i.end())) for code_ent in [ent for ent in code_entitys if ent.sentence_index==_sentence.sentence_index]: error_list.append((code_ent.wordOffset_begin,code_ent.wordOffset_end)) res_set = set() for i in re.finditer(phone, sentence_text): is_continue = False for error_ent in error_list: if i.start()>=error_ent[0] and i.end()<=error_ent[1]: is_continue = True break if is_continue: continue res_set.add((i.group(), i.start(), i.end())) res_set = sorted(list(res_set),key=lambda x:x[1]) last_phone_mask = True for item_idx in range(len(res_set)): item = res_set[item_idx] phone_left = sentence_text[max(0, item[1] - 10):item[1]] phone_right = sentence_text[item[2]:item[2] + 8] if re.search("电话|手机|联系人|联系方式”",re.sub(",","",phone_left)): pass else: # 排除“传真号”和其它错误项 if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left): if not re.search("电,?话", phone_left): last_phone_mask = False continue if re.search("注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)): last_phone_mask = False continue if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right): last_phone_mask = False continue # 前后跟着字母 if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right): last_phone_mask = False continue # 前后跟着长度小于一定值数字的正则排除 if re.search("\d+[-—-―]?\d*$",phone_left) or re.search("^\d+[-—-―]?\d*",phone_right): phone_left_number = re.search("\d+[-—-―]?\d*$",phone_left) phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right) if phone_left_number: if len(phone_left_number.group())<7: last_phone_mask = False continue if phone_right_number: if len(phone_right_number.group())<7: last_phone_mask = False continue # if:上一个phone实体不符合条件 if not last_phone_mask: item_start = item[1] last_item_end = res_set[item_idx-1][2] if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]): last_phone_mask = False continue for j in range(len(list_tokenbegin)): if list_tokenbegin[j] == item[1]: begin_index = j break elif list_tokenbegin[j] > item[1]: begin_index = j - 1 break for j in range(begin_index, len(list_tokenbegin)): if list_tokenbegin[j] >= item[2]: end_index = j - 1 break _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1], item[2],in_attachment=in_attachment) phone_entitys.append(_entity) last_phone_mask = True def is_company(entity,text): # 判断"公司"实体是否为地址地点 if entity.label!=5 and entity.values[entity.label]>0.5: return True if ent.is_tail==True: return False entity_left = text[max(0,entity.wordOffset_begin-10):entity.wordOffset_begin] entity_left = re.sub(",()\(\)","",entity_left) entity_left = entity_left[-5:] if re.search("地址|地点|银行[::]",entity_left): return False else: return True pre_entity = [] for ent in list_entity: if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \ or (ent.entity_type=='location' and len(ent.entity_text)>5): pre_entity.append(ent) text_data,pre_data = relationExtraction_model.encode(pre_entity + phone_entitys, list_sentence) # print(pre_data) maxlen = 512 relation_list = [] if 03: break for _text_data, _pre_data in temp_data: relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data)) temp_data = [] start = start + maxlen - 120 # print("预测数据:",len(temp_data)) # if len(temp_data)<=6: # for _text_data,_pre_data in temp_data: # relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data)) # else: # relation_list = [] # 去重结果 relation_list = list(set(relation_list)) # print(relation_list) # tokens_num_dict = dict() # last_tokens_num = 0 # for sentence in list_sentence: # _index = sentence.sentence_index # if _index == 0: # tokens_num_dict[_index] = 0 # else: # tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num # last_tokens_num = len(sentence.tokens) right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')] linked_company = set() linked_person = set() linked_connetPerson = set() linked_phone = set() for predicate in ["rel_address","rel_phone","rel_person"]: _match_list = [] _match_combo = [] for relation in relation_list: _subject = relation[0] _object = relation[2] if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination: if relation[1]==predicate: if predicate=="rel_person": if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact): continue distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - ( tokens_num_dict[_subject.sentence_index] + _subject.end_index) if distance>0: value = (-1 / 2 * (distance ** 2))/10000 else: distance = abs(distance) value = (-1 / 2 * (distance ** 2)) _match_list.append(Match(_subject,_object,value)) _match_combo.append((_subject,_object)) match_result = dispatch(_match_list) error_list = [] for mat in list(set(_match_combo)-set(match_result)): for temp in match_result: if mat[1]==temp[1] and mat[0]!=temp[0]: error_list.append(mat) break result = list(set(_match_combo)-set(error_list)) if predicate=='rel_person': # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接) result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: is_continue = False if not combo[0].pointer_person: combo[0].pointer_person = [] if combo[1].begin_indexcombo[0].begin_index: is_continue = True break if is_continue: continue combo[0].pointer_person.append(combo[1]) linked_company.add(combo[0]) linked_person.add(combo[1]) # print(1,combo[0].entity_text,combo[1].entity_text) if predicate=='rel_address': result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: if combo[0].pointer_address: continue combo[0].pointer_address = combo[1] # print(2,combo[0].entity_text,combo[1].entity_text) if predicate=='rel_phone': result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: is_continue = False if not combo[0].person_phone: combo[0].person_phone = [] if combo[1].begin_indexcombo[0].begin_index: is_continue = True break if is_continue: continue combo[0].person_phone.append(combo[1]) linked_connetPerson.add(combo[0]) linked_phone.add(combo[1]) if combo[0].label in [1,2]: if PackDict.get("Project"): for i in range(len(PackDict["Project"]["roleList"])): if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \ or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'): PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text)) break # print(3,combo[0].entity_text,combo[1].entity_text) # "联系人——联系电话" 链接规则补充 person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']] person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index)) t_match_list = [] for ent_idx in range(len(person_phone_EntityList)): entity = person_phone_EntityList[ent_idx] if entity.entity_type=="person": match_nums = 0 person_nums = 0 # 经过其他中联系人的数量 byNotPerson_match_nums = 0 # 跟在联系人后面的属性 phone_nums = 0 # 经过电话的数量 for after_index in range(ent_idx + 1, min(len(person_phone_EntityList), ent_idx + 8)): after_entity = person_phone_EntityList[after_index] if after_entity.entity_type == "phone": distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) phone_nums += 1 if distance>100 or phone_nums>=4: break sentence_distance = after_entity.sentence_index - entity.sentence_index value = (-1 / 2 * (distance ** 2)) / 10000 if sentence_distance == 0: if distance < 80: # value = (-1 / 2 * (distance ** 2)) / 10000 t_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if not person_nums: byNotPerson_match_nums += 1 else: break else: if distance < 50: # value = (-1 / 2 * (distance ** 2)) / 10000 t_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if not person_nums: byNotPerson_match_nums += 1 else: break else: person_nums += 1 # 前向查找属性 if ent_idx != 0 and (not match_nums or not byNotPerson_match_nums): previous_entity = person_phone_EntityList[ent_idx - 1] if previous_entity.entity_type == 'phone': # if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index) if distance < 40: # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) t_match_list.append(Match(entity, previous_entity, value)) # km算法分配求解(person-phone) t_match_list = [mat for mat in t_match_list if mat.main_role not in linked_connetPerson and mat.attribute not in linked_phone] personphone_result = dispatch(t_match_list) personphone_result = sorted(personphone_result, key=lambda x: (x[0].sentence_index, x[0].begin_index)) for match in personphone_result: _person = match[0] _phone = match[1] if not _person.person_phone: _person.person_phone = [] _person.person_phone.append(_phone) # 多个招标人/代理人或者别称 for idx in range(1,len(pre_entity)): _pre_entity = pre_entity[idx] if _pre_entity in linked_company and _pre_entity.label==5: last_ent = pre_entity[idx-1] if last_ent.entity_type in ['company','org'] and last_ent.label in [0,1]: if last_ent.sentence_index==_pre_entity.sentence_index: mid_text = list_sentence[_pre_entity.sentence_index].sentence_text[last_ent.wordOffset_end:_pre_entity.wordOffset_begin] if len(mid_text)<=20 and "," not in mid_text and re.search("[、\((]",mid_text): _pre_entity.label = last_ent.label _pre_entity.values[last_ent.label] = 0.6 # 2022/01/25 固定电话可连多个联系人 temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person'] temp_person_entitys2 = [] #和固定电话相连的联系人 for entity in temp_person_entitys: if entity.person_phone: for _phone in entity.person_phone: if not re.search("^1[3-9]\d{9}$", _phone.entity_text): temp_person_entitys2.append(entity) break for index in range(len(temp_person_entitys)): entity = temp_person_entitys[index] if entity in temp_person_entitys2: last_person = entity for after_index in range(index + 1, min(len(temp_person_entitys), index + 5)): after_entity = temp_person_entitys[after_index] if after_entity.sentence_index == last_person.sentence_index and after_entity.begin_index - last_person.end_index < 3: for _phone in entity.person_phone: if not re.search("^1[3-9]\d{9}$", _phone.entity_text): if _phone not in after_entity.person_phone: after_entity.person_phone.append(_phone) last_person = after_entity else: break if index==0: continue last_person = entity for before_index in range(index-1, max(-1,index-5), -1): before_entity = temp_person_entitys[before_index] if before_entity.sentence_index == last_person.sentence_index and last_person.begin_index - before_entity.end_index < 3: for _phone in entity.person_phone: if not re.search("^1[3-9]\d{9}$", _phone.entity_text): if _phone not in before_entity.person_phone: before_entity.person_phone.append(_phone) last_person = before_entity else: break # 更新person为招标/代理联系人的联系方式 for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": for _person in person_list: if _person.label==1:#招标联系人 person_phone = [phone for phone in _person.person_phone] if _person.person_phone else [] for _p in person_phone: PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text)) if not person_phone: PackDict[k]["roleList"][i].linklist.append((_person.entity_text,"")) if PackDict[k]["roleList"][i].role_name == "agency": for _person in person_list: if _person.label==2:#代理联系人 person_phone = [phone for phone in _person.person_phone] if _person.person_phone else [] for _p in person_phone: PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text)) if not person_phone: PackDict[k]["roleList"][i].linklist.append((_person.entity_text,"")) # 更新 PackDict not_sure_linked = [] for link_p in list(linked_company): for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 0: not_sure_linked.append(link_p) continue if PackDict[k]["roleList"][i].entity_text == link_p.entity_text: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in agency_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in agency_contact and _p.entity_text not in agency_phone: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) elif PackDict[k]["roleList"][i].role_name == "agency": if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 1: not_sure_linked.append(link_p) continue if PackDict[k]["roleList"][i].entity_text == link_p.entity_text: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in tenderee_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) else: if PackDict[k]["roleList"][i].entity_text == link_p.entity_text: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) winter_contact.add(per.entity_text) continue for _p in person_phone: if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \ per.entity_text not in agency_contact and _p.entity_text not in agency_phone: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) winter_contact.add(per.entity_text) # 更新org/company实体label为0,1的链接 for link_p in not_sure_linked: for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": if link_p.label == 0: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in agency_contact and per.entity_text not in winter_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in agency_contact and _p.entity_text not in agency_phone and per.entity_text not in winter_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) elif PackDict[k]["roleList"][i].role_name == "agency": if link_p.label == 1: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in tenderee_contact and per.entity_text not in winter_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、") split_list = [0] * 16 split_dict = { "一、": 1, "二、": 2, "三、": 3, "四、": 4, "五、": 5, "六、": 6, "七、": 7, "八、": 8, "九、": 9, "十、": 10, "十一、": 11, "十二、": 12, "十三、": 13, "十四、": 14, "十五、": 15 } for item in re.finditer(re_split, _content): _index = split_dict.get(item.group()[1:]) if not split_list[_index]: split_list[_index] = item.span()[0] + 1 split_list = [i for i in split_list if i != 0] start = 0 new_split_list = [] for idx in split_list: new_split_list.append((start, idx)) start = idx new_split_list.append((start, len(_content))) # 实体列表按照“公告分段”分组 words_num_dict = dict() last_words_num = 0 for sentence in list_sentence: _index = sentence.sentence_index if _index == 0: words_num_dict[_index] = 0 else: words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num last_words_num = len(sentence.sentence_text) # 公司-联系人连接(km算法) re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' '\+86.?1[3-9]\d{9}|' '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|' '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' '[2-9]\d{6,7}') key_phone = re.compile("联系方式|电话|联系人|负责人") temporary_list2 = [] for entity in list_entity: # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False: if entity.entity_type in ['org', 'company', 'person']: temporary_list2.append(entity) temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index)) new_temporary_list2 = [] for _split in new_split_list: temp_list = [] for _entity in temporary_list2: if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[ _entity.sentence_index] + _entity.wordOffset_end < _split[1]: temp_list.append(_entity) elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]: break new_temporary_list2.append(temp_list) # print(new_temporary_list2) match_list2 = [] for split_index in range(len(new_temporary_list2)): split_entitys = new_temporary_list2[split_index] is_skip = False for index in range(len(split_entitys)): entity = split_entitys[index] if is_skip: is_skip = False continue else: if entity.entity_type in ['org', 'company']: if entity.label != 5 or entity.entity_text in roleSet: match_nums = 0 for after_index in range(index + 1, min(len(split_entitys), index + 4)): after_entity = split_entitys[after_index] if after_entity.entity_type in ['person']: # 实体为中标人/候选人,联系人已确定类别【1,2】 if entity.label in [2, 3, 4] and after_entity.label in [1, 2]: break if after_entity.label in [1, 2, 3]: distance = (tokens_num_dict[ after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) sentence_distance = after_entity.sentence_index - entity.sentence_index if sentence_distance == 0: if distance < 100: if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, after_entity, value)) match_nums += 1 else: if distance < 60: if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, after_entity, value)) match_nums += 1 if after_entity.entity_type in ['org', 'company']: # 解决在‘地址’中识别出org/company的问题 # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]: if entity.label != 5 and after_index == index + 1 and ( after_entity.label == entity.label or after_entity.label == 5): distance = (tokens_num_dict[ after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) if distance < 20: after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0, after_entity.begin_index - 10):after_entity.begin_index] after_entity_right = list_sentence[after_entity.sentence_index].tokens[ after_entity.end_index + 1:after_entity.end_index + 6] after_entity_left = "".join(after_entity_left) if len(after_entity_left) > 20: after_entity_left = after_entity_left[-20:] after_entity_right = "".join(after_entity_right)[:10] if re.search("地,?址", after_entity_left): is_skip = True continue if re.search("\(|(", after_entity_left) and re.search("\)|)", after_entity_right): is_skip = True continue if entity.label in [0, 1] and after_entity.label in [0, 1] and entity.label == after_entity.label: break if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[ index + 1].entity_type == "person": break if entity.label in [0, 1] and after_entity.label in [2, 3, 4]: break if entity.label in [2, 3, 4] and after_entity.label in [0, 1]: break # 搜索没有联系人的电话 mid_tokens = [] is_same_sentence = False if index == len(split_entitys) - 1: for i in range(entity.sentence_index, len(list_sentence)): mid_tokens += list_sentence[i].tokens mid_tokens = mid_tokens[entity.end_index + 1:] mid_sentence = "".join(mid_tokens) have_phone = re.findall(re_phone, mid_sentence) if have_phone: if re.findall(re_phone, mid_sentence.split("。")[0]): is_same_sentence = True _phone = have_phone[0] phone_begin = mid_sentence.find(_phone) if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \ new_split_list[split_index][1]: mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "") if re.search(key_phone, mid_sentence): distance = 1 if is_same_sentence: if phone_begin <= 200: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: if phone_begin <= 60: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: next_entity = split_entitys[index + 1] if next_entity.entity_type in ["org","company"]: _entity_left = list_sentence[next_entity.sentence_index].sentence_text[max(0, next_entity.wordOffset_begin - 20):next_entity.wordOffset_begin] _entity_left2 = re.sub(",()\(\)::", "", _entity_left) _entity_left2 = _entity_left2[-5:] if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2): if index + 2<= len(split_entitys) - 1: next_entity = split_entitys[index + 2] if entity.sentence_index == next_entity.sentence_index: mid_tokens += list_sentence[entity.sentence_index].tokens[ entity.end_index + 1:next_entity.begin_index] else: sentence_index = entity.sentence_index while sentence_index <= next_entity.sentence_index: mid_tokens += list_sentence[sentence_index].tokens sentence_index += 1 mid_tokens = mid_tokens[entity.end_index + 1:-(len( list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1] mid_sentence = "".join(mid_tokens) have_phone = re.findall(re_phone, mid_sentence) if have_phone: if re.findall(re_phone, mid_sentence.split("。")[0]): is_same_sentence = True _phone = have_phone[0] phone_begin = mid_sentence.find(_phone) mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "") if re.search(key_phone, mid_sentence): p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else [] if next_entity.entity_type == 'person' and _phone in p_phone: pass else: distance = (tokens_num_dict[ next_entity.sentence_index] + next_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) distance = distance / 2 if is_same_sentence: if phone_begin <= 200: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: if phone_begin <= 60: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 # 实体无匹配时,尝试前向查找匹配 if not match_nums: if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0: previous_entity = split_entitys[index - 1] if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]: if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]: continue if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[ previous_entity.sentence_index] + previous_entity.end_index) if distance < 20: # 距离相等时,前向添加处罚值 # distance += 1 # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) match_list2.append(Match(entity, previous_entity, value)) # print(match_list2) match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person] # print(match_list2) # km算法分配求解 result2 = dispatch(match_list2) # print(result2) for match in result2: entity = match[0] # print(entity.entity_text) # print(match.attribute) entity_index = list_entity.index(entity) is_update = False if isinstance(match[1], tuple): person_ = '' phone_ = match[1][1].split("/") # 分割多个号码 # print(person_,phone_) else: person_ = match[1].entity_text phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else [] for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": # if not PackDict[k]["roleList"][i].linklist: if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0: if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact: if not phone_: PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: # if not person_ and len() PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True elif PackDict[k]["roleList"][i].role_name == "agency": # if not PackDict[k]["roleList"][i].linklist: if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact: if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0: if not phone_: PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True else: if PackDict[k]["roleList"][i].entity_text == entity.entity_text: if not PackDict[k]["roleList"][i].linklist: if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \ person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0: if not phone_: PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True if not person_: is_update = False if is_update: # 更新 list_entity if not list_entity[entity_index].pointer_person: list_entity[entity_index].pointer_person = [] list_entity[entity_index].pointer_person.append(match[1]) linked_person = [] linked_persons_with = [] for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]: if company_entity.pointer_person: for _person in company_entity.pointer_person: linked_person.append(_person) linked_persons_with.append(company_entity) # 一个公司对应多个联系人的补充 person_entitys = [entity for entity in list_entity if entity.entity_type=='person'] person_entitys = person_entitys[::-1] for index in range(len(person_entitys)): entity = person_entitys[index] prepare_link = [] if entity not in linked_person: prepare_link.append(entity) last_person = entity for after_index in range(index + 1, min(len(person_entitys), index + 5)): after_entity = person_entitys[after_index] if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5: if after_entity in linked_person: _index = linked_person.index(after_entity) with_company = linked_persons_with[_index] for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name == "tenderee": if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) with_company.pointer_person.append(item) linked_person.append(item) elif PackDict["Project"]["roleList"][i].role_name == "agency": if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) with_company.pointer_person.append(item) linked_person.append(item) else: if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) with_company.pointer_person.append(item) linked_person.append(item) break else: prepare_link.append(after_entity) last_person = after_entity continue # 统一同类角色的属性 if PackDict.get("Project"): for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]: for _entity in list_entity: if _entity.entity_type in ['org','company']: is_similar = False # entity_text相同 if _entity.entity_text==PackDict["Project"]["roleList"][i].entity_text: is_similar = True # entity.label为【0,1】 if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict["Project"]["roleList"][i].role_name: is_similar = True if is_similar: linked_entitys = _entity.linked_entitys if linked_entitys: for linked_entity in linked_entitys: pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else [] for _pointer_person in pointer_person: _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else [] for _p in _phone: if (_pointer_person.entity_text,_p) not in PackDict["Project"]["roleList"][i].linklist: PackDict["Project"]["roleList"][i].linklist.append((_pointer_person.entity_text,_p)) # "roleList"中联系人电话去重 for i in range(len(PackDict["Project"]["roleList"])): # print(123, PackDict["Project"]["roleList"][i].linklist) # 带有联系人的电话 with_person = [person_phone[1] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[0]] # 带有电话的联系人 with_phone = [person_phone[0] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[1]] remove_list = [] for item in PackDict["Project"]["roleList"][i].linklist: if not item[0]: if item[1] in with_person: # 删除重复的无联系人电话 remove_list.append(item) elif not item[1]: if item[0] in with_phone: remove_list.append(item) for _item in remove_list: PackDict["Project"]["roleList"][i].linklist.remove(_item) # 联系人——电子邮箱链接 temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])] temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index)) new_temporary_list3 = [] for _split in new_split_list: temp_list = [] for _entity in temporary_list3: if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[ _entity.sentence_index] + _entity.wordOffset_end < _split[1]: temp_list.append(_entity) elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]: break new_temporary_list3.append(temp_list) # print(new_temporary_list3) match_list3 = [] for split_index in range(len(new_temporary_list3)): split_entitys = new_temporary_list3[split_index] for index in range(len(split_entitys)): entity = split_entitys[index] if entity.entity_type == 'person': match_nums = 0 for after_index in range(index + 1, min(len(split_entitys), index + 4)): after_entity = split_entitys[after_index] if match_nums > 2: break if after_entity.entity_type == 'email': distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) sentence_distance = after_entity.sentence_index - entity.sentence_index if sentence_distance == 0: if distance < 100: if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list3.append(Match(entity, after_entity, value)) match_nums += 1 else: if distance < 60: if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list3.append(Match(entity, after_entity, value)) match_nums += 1 # 前向查找匹配 # if not match_nums: if index != 0: previous_entity = split_entitys[index - 1] if previous_entity.entity_type == 'email': if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[ previous_entity.sentence_index] + previous_entity.end_index) if distance < 30: # 距离相等时,前向添加处罚值 # distance += 1 # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) match_list3.append(Match(entity, previous_entity, value)) # print(match_list3) # km算法分配求解 result3 = dispatch(match_list3) for match in result3: match_person = match[0] match_email = match[1] match_person.pointer_email = match_email # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。 # temp_ent_list = [] # 临时列表,记录0,1角色及3联系人 # other_person = [] # 阈值以上的联系人列表 # link_person = [] # 有电话没联系上角色的person列表 # other_ent = [] # link_ent = [] # found_person = False # ent_list = [] # for entity in list_entity: # if entity.entity_type in ['org','company','person']: # ent_list.append(entity) # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']] # #for list_index in range(len(ent_list)): # #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2on_value_person: # if str(entity.label)=="1": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="tenderee": # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # link_person.append(entity.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # # add pointer_person # for _entity in list_entity: # if dict_role_id.get(str(_entity.label))=="tenderee": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee": # _entity.pointer_person = entity # elif str(entity.label)=="2": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="agency": # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # link_person.append(entity.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # # add pointer_person # for _entity in list_entity: # if dict_role_id.get(str(_entity.label))=="agency": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency": # _entity.pointer_person = entity # elif str(entity.label)=="3": # if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人 # continue # #not_link_person.append((entity_after.entity_text,entity_after.person_phone)) # other_person.append(entity.entity_text) # temp_ent_list.append((entity.entity_text,entity.person_phone,entity)) # # #if entity.entity_text in roleSet: # if entity.entity_text in roleSet: # if entity.label in [0,1]: # other_ent.append(entity.entity_text) # temp_ent_list.append((entity.entity_text, entity.label,entity)) # for behind_index in range(index+1, len(ent_list)): # entity_after = ent_list[behind_index] # if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人 # break # if entity_after.values is not None: # if entity_after.entity_type=="person": # if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找 # break # if entity_after.values[entity_after.label]>on_value_person: # if str(entity_after.label)=="1": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="tenderee": # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # elif str(entity_after.label)=="2": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="agency": # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # elif str(entity_after.label)=="3": # if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找 # break # elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止 # break # for pack in PackDict.keys(): # for i in range(len(PackDict[pack]["roleList"])): # if PackDict[pack]["roleList"][i].entity_text==entity.entity_text: # #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0: # #break # PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # #add pointer_person # entity.pointer_person = entity_after # # not_link_person = [person for person in other_person if person not in link_person] # not_link_ent = [ent for ent in other_ent if ent not in link_ent] # if len(not_link_person) > 0 and len(not_link_ent) > 0 : # item = temp_ent_list # for i in range(len(item)): # if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item): # if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person: # item[i+1], item[i+2] = item[i+2], item[i+1] # for i in range(len(item)-1, -1, -1): # if item[i][0] in not_link_ent: # for pack in PackDict.keys(): # for role in PackDict[pack]["roleList"]: # if role.entity_text == item[i][0] and len(role.linklist) < 1: # for j in range(i+1, len(item)): # if item[j][0] in not_link_person: # role.linklist.append(item[j][:2]) # #add pointer_person # item[i][2].pointer_person = item[j][2] # break # else: # break # # 电话没有联系人的处理 # role_with_no_phone = [] # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]: # if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人 # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text) # else: # phone_nums = 0 # for link in PackDict["Project"]["roleList"][i].linklist: # if link[1]: # phone_nums += 1 # break # if not phone_nums: # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text) # if role_with_no_phone: # phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"] # # phone_with_person = [phone for phone in phone_with_person if phone] # # dict_index_sentence = {} # for _sentence in list_sentence: # dict_index_sentence[_sentence.sentence_index] = _sentence # new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']] # for index in range(len(new_entity_list)): # entity = new_entity_list[index] # if entity.entity_text in role_with_no_phone: # e_sentence = dict_index_sentence[entity.sentence_index] # entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40] # entity_right = "".join(entity_right) # if index+1-1: # entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)] # have_phone = re.findall(phone,entity_right) # if have_phone: # _phone = have_phone[0] # phone_begin = entity_right.find(_phone) # if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]): # # entity.person_phone = _phone # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text: # PackDict["Project"]["roleList"][i].linklist.append(('', _phone)) #寻找多标段招标金额 p_entity = len(list_entity)-1 set_tenderer_money = set() list_tenderer_money = [] #2021/7/16 新增列表,倒序保存所有中标金额 unit_list = [] #2021/8/17 新增,保存金额单位 #遍历所有实体 while(p_entity>=0): entity = list_entity[p_entity] if entity.entity_type=="money": # 2021/12/03 添加成本警戒线、保证金 if entity.notes in ['保证金', '成本警戒线']: packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index, "money-" + str(entity.label), MAX_DIS=2, DIRECT="L") if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text if packageName == "Project": # if PackDict["Project"]["tendereeMoney"]=on_value: if str(entity.label)=="1": set_tenderer_money.add(float(entity.entity_text)) list_tenderer_money.append(float(entity.entity_text)) # 2021/7/16 新增列表,倒序保存所有中标金额 unit_list.append(entity.money_unit) # if str(entity.label)=="0": if str(entity.label)=="0" and entity.notes!='总投资': ''' if p_entity>0: p_before = list_entity[p_entity-1] if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2: p_entity -= 1 continue ''' packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L") if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text if packageName=="Project": # if PackDict["Project"]["tendereeMoney"]on_value: PackDict["Project"]["tendereeMoney"] = float(entity.entity_text) PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit else: PackDict[packageName]["tendereeMoney"] = float(entity.entity_text) PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit #add pointer_tendereeMoney packagePointer.pointer_tendereeMoney = entity p_entity -= 1 #删除一个机构有多个角色的数据 #删除重复人、概率不回传 final_roleList = [] list_pop = [] set_tenderer_role = set() dict_pack_tenderer_money = dict() for pack in PackDict.keys(): #删除无效包 if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0: list_pop.append(pack) for i in range(len(PackDict[pack]["roleList"])): if PackDict[pack]["roleList"][i].role_name=="win_tenderer": if PackDict[pack]["roleList"][i].money==0: set_tenderer_role.add(PackDict[pack]["roleList"][i]) dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()] #找到包的中投标金额 for _index in range(len(PackageList)): if "hit" in PackageList[_index]: for _hit in list(PackageList[_index]["hit"]): _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None: dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money) #只找到一个中标人和中标金额 if len(set_tenderer_money)==1 and len(set_tenderer_role)==1: list(set_tenderer_role)[0].money = list(set_tenderer_money)[0] list(set_tenderer_role)[0].money_unit = unit_list[0] # print('一个中标人一个金额:', list(set_tenderer_money)[0]) #找到一个中标人和多个招标金额 if len(set_tenderer_money)>1 and len(set_tenderer_role)==1: _maxMoney = 0 _sumMoney = 0 for _m in list(set_tenderer_money): _sumMoney += _m if _m>_maxMoney: _maxMoney = _m if _sumMoney/_maxMoney==2: list(set_tenderer_role)[0].money = _maxMoney # print('一人多金额分项合计 取最大金额:', _maxMoney) else: # list(set_tenderer_role)[0].money = _maxMoney if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000: list(set_tenderer_role)[0].money = min(list_tenderer_money) list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))] # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money)) else: list(set_tenderer_role)[0].money = list_tenderer_money[-1] # 2021/7/16 修改 不是单价合计方式取第一个中标金额 list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位 # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1]) #每个包都只找到一个金额 _flag_pack_money = True for k,v in dict_pack_tenderer_money.items(): if len(v[1])!=1: _flag_pack_money = False if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()): for k,v in dict_pack_tenderer_money.items(): v[0].money = list(v[1])[0] # print('k,v in dict_pack_tenderer_money.items', k, v) # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑 for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): if PackDict[pack]["tendereeMoney"] > 0: # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money)) if float(PackDict[pack]["roleList"][i].money) >10000000 and \ float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000: PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000 # print('招标金额校正中标金额') # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别) for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): if PackDict[pack]["tendereeMoney"] > 0 and float(PackDict[pack]["roleList"][i].money) > 0.: if float(PackDict[pack]["roleList"][i].money) < 1000 and \ float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \ float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000: PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) * 10000 # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额 for pack in PackDict.keys(): tmp_moneys = [] for i in range(len(PackDict[pack]["roleList"])): if float(PackDict[pack]["roleList"][i].money) >100000: tmp_moneys.append(float(PackDict[pack]["roleList"][i].money)) if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000: for i in range(len(PackDict[pack]["roleList"])): if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000: PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000 # print('通过其他中标人投标金额校正中标金额') for item in list_pop: PackDict.pop(item) # 公告中只有"招标人"且无"联系人"链接时 if len(PackDict)==1: k = list(PackDict.keys())[0] if len(PackDict[k]["roleList"])==1: if PackDict[k]["roleList"][0].role_name == "tenderee": if not PackDict[k]["roleList"][0].linklist: get_contacts = False if not get_contacts: # 根据大纲Outline类召回联系人 for outline in list_outline: if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary): for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]: if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[ t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end: if t_person.person_phone: _phone = [p.entity_text for p in t_person.person_phone] for _p in _phone: PackDict[k]["roleList"][0].linklist.append((t_person.entity_text, _p)) get_contacts = True break elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \ words_num_dict[outline.sentence_end_index] + outline.wordOffset_end: break if not get_contacts: sentence_phone = phone.findall(outline.outline_text) if sentence_phone: PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0])) get_contacts = True break if not get_contacts: # 直接取文中倒数第一个联系人 for _entity in temporary_list2[::-1]: if _entity.entity_type=='person' and _entity.label==3: if _entity.person_phone: _phone = [p.entity_text for p in _entity.person_phone] for _p in _phone: PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p)) get_contacts = True break if not get_contacts: # 如果文中只有一个“phone”实体,则直接取为联系人电话 if len(phone_entitys) == 1: PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text)) get_contacts = True if not get_contacts: # 通过大纲Outline类直接取电话 if len(new_split_list) > 1: for _start, _end in new_split_list: temp_sentence = _content[_start:_end] sentence_outline = temp_sentence.split(",::")[0] if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline): sentence_phone = phone.findall(temp_sentence) if sentence_phone: PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0])) get_contacts = True break if not get_contacts: # 通过正则提取句子段落进行提取电话 contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?" tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}" contact_pattern_list = [tenderee_pattern + contacts_person, "(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person, "(?:项目|采购)[^。,]{0,4}" + contacts_person, "(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}" + contacts_person, ] for _pattern in contact_pattern_list: get_tenderee_contacts = False for regular_match in re.finditer(_pattern, _content): match_text = _content[regular_match.end():regular_match.end() + 40] match_text = match_text.split("。")[0] sentence_phone = phone.findall(match_text) if sentence_phone: PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0])) get_tenderee_contacts = True break if get_tenderee_contacts: break for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString() return PackDict def initPackageAttr(RoleList,PackageSet): ''' @summary: 根据拿到的roleList和packageSet初始化接口返回的数据 ''' packDict = dict() packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''} for item in list(PackageSet): packDict[item] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''} for item in RoleList: if packDict[item.packageName]["code"] =="": packDict[item.packageName]["code"] = item.packageCode # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位) return packDict def getPackageRoleMoney(list_sentence,list_entity,list_outline): ''' @param: list_sentence:文章的句子list list_entity:文章的实体list @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话 ''' # print("=1") theRole = getRoleList(list_sentence,list_entity) if not theRole: return [] RoleList,RoleSet,PackageList,PackageSet = theRole ''' for item in PackageList: # print(item) ''' PackDict = initPackageAttr(RoleList, PackageSet) PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline) return PackDict def turnBidWay(bidway): if bidway in ("邀请招标","采购方式:邀请"): return "邀请招标" elif bidway in ("询价","询单","询比","采购方式:询价"): return "询价" elif bidway in ("竞谈","竞争性谈判","公开竞谈"): return "竞争性谈判" elif bidway in ("竞争性磋商","磋商"): return "竞争性磋商" elif bidway in ("竞价","竞标","电子竞价","以电子竞价","电子书面竞投"): return "竞价" elif bidway in ("公开招标","网上电子投标","网上招标","采购方式:公开","招标为其他"): return "公开招标" elif bidway in ("单一来源"): return "单一来源" elif bidway in ("比选"): return "比选" else: return "其他" my_time_format_pattern = re.compile("((?P\d{4}|\d{2})\s*[-\/年\.]\s*(?P\d{1,2})\s*[-\/月\.]\s*(?P\d{1,2}))") import time def my_timeFormat(_time): current_year = time.strftime("%Y",time.localtime()) all_match = re.finditer(my_time_format_pattern,_time) time_list = [] for _match in all_match: if len(_match.group())>0: legal = True year = "" month = "" day = "" for k,v in _match.groupdict().items(): if k=="year": year = v if k=="month": month = v if k=="day": day = v if year!="": if len(year)==2: year = "20"+year if int(year)>int(current_year): legal = False else: legal = False if month!="": if int(month)>12: legal = False else: legal = False if day!="": if int(day)>31: legal = False else: legal = False if legal: # return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")) time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))) return time_list def getTimeAttributes(list_entity,list_sentence): time_entitys = [i for i in list_entity if i.entity_type=='time'] time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index)) list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index) dict_time = { "time_release": [], # 1 发布时间 "time_bidopen": [], # 2 开标时间 "time_bidclose": [], # 3 截标时间 'time_bidstart': [], # 12 投标(开始)时间、响应文件接收(开始)时间 'time_publicityStart': [], # 4 公示开始时间(公示时间、公示期) 'time_publicityEnd': [], # 5 公示截止时间 'time_getFileStart': [], # 6 文件获取开始时间(文件获取时间) 'time_getFileEnd': [], # 7 文件获取截止时间 'time_registrationStart': [], # 8 报名开始时间(报名时间) 'time_registrationEnd': [], # 9 报名截止时间 'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间) 'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间 'time_commencement':[] , #13 开工日期 'time_completion': [] # 14 竣工日期 } last_sentence_index = 0 last_time_type = "" last_time_index = { 'time_bidstart':"time_bidclose", 'time_publicityStart':"time_publicityEnd", 'time_getFileStart':"time_getFileEnd", 'time_registrationStart':"time_registrationEnd", 'time_earnestMoneyStart':"time_earnestMoneyEnd", 'time_commencement':"time_completion", } for entity in time_entitys: sentence_text = list_sentence[entity.sentence_index].sentence_text entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin] entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3] label_prob = entity.values[entity.label] entity_text = entity.entity_text extract_time = my_timeFormat(entity_text) if extract_time: if re.search("至|到", entity_left): if entity.sentence_index == last_sentence_index: time_type = last_time_index.get(last_time_type) if time_type: dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10)) last_time_type = "" continue if entity.label!=0: if entity.label==1 and label_prob>0.5: dict_time['time_release'].append((extract_time[0],label_prob)) last_time_type = 'time_release' elif entity.label==2 and label_prob>0.5: dict_time['time_bidopen'].append((extract_time[0],label_prob)) last_time_type = 'time_bidopen' elif entity.label==3 and label_prob>0.5: dict_time['time_bidclose'].append((extract_time[0],label_prob)) last_time_type = 'time_bidclose' elif entity.label==12 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_bidclose'].append((extract_time[0], label_prob)) last_time_type = 'time_bidclose' else: dict_time['time_bidstart'].append((extract_time[0], label_prob)) last_time_type = 'time_bidstart' else: dict_time['time_bidstart'].append((extract_time[0],label_prob)) dict_time['time_bidclose'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==4 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_publicityEnd'].append((extract_time[0], label_prob)) last_time_type = 'time_publicityEnd' else: dict_time['time_publicityStart'].append((extract_time[0], label_prob)) last_time_type = 'time_publicityStart' else: dict_time['time_publicityStart'].append((extract_time[0],label_prob)) dict_time['time_publicityEnd'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==5 and label_prob>0.5: if len(extract_time)==1: dict_time['time_publicityEnd'].append((extract_time[0], label_prob)) last_time_type = 'time_publicityEnd' else: dict_time['time_publicityStart'].append((extract_time[0],label_prob)) dict_time['time_publicityEnd'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==6 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_getFileEnd'].append((extract_time[0], label_prob)) last_time_type = 'time_getFileEnd' else: dict_time['time_getFileStart'].append((extract_time[0], label_prob)) last_time_type = 'time_getFileStart' else: dict_time['time_getFileStart'].append((extract_time[0],label_prob)) dict_time['time_getFileEnd'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==7 and label_prob>0.5: if len(extract_time)==1: dict_time['time_getFileEnd'].append((extract_time[0], label_prob)) last_time_type = 'time_getFileEnd' else: dict_time['time_getFileStart'].append((extract_time[0],label_prob)) dict_time['time_getFileEnd'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==8 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_registrationEnd'].append((extract_time[0], label_prob)) last_time_type = 'time_registrationEnd' else: dict_time['time_registrationStart'].append((extract_time[0], label_prob)) last_time_type = 'time_registrationStart' else: dict_time['time_registrationStart'].append((extract_time[0],label_prob)) dict_time['time_registrationEnd'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==9 and label_prob>0.5: if len(extract_time)==1: dict_time['time_registrationEnd'].append((extract_time[0], label_prob)) last_time_type = 'time_registrationEnd' else: dict_time['time_registrationStart'].append((extract_time[0],label_prob)) dict_time['time_registrationEnd'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==10 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob)) last_time_type = 'time_earnestMoneyEnd' else: dict_time['time_earnestMoneyStart'].append((extract_time[0], label_prob)) last_time_type = 'time_earnestMoneyStart' else: dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob)) dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==11 and label_prob>0.5: if len(extract_time)==1: dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob)) last_time_type = 'time_earnestMoneyEnd' else: dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob)) dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==13 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_completion'].append((extract_time[0], label_prob)) last_time_type = 'time_completion' else: dict_time['time_commencement'].append((extract_time[0], label_prob)) last_time_type = 'time_commencement' else: dict_time['time_commencement'].append((extract_time[0],label_prob)) dict_time['time_completion'].append((extract_time[1],label_prob)) last_time_type = '' elif entity.label==14 and label_prob>0.5: if len(extract_time)==1: dict_time['time_completion'].append((extract_time[0], label_prob)) last_time_type = 'time_completion' else: dict_time['time_commencement'].append((extract_time[0],label_prob)) dict_time['time_completion'].append((extract_time[1],label_prob)) last_time_type = '' else: last_time_type = "" else: last_time_type = "" else: last_time_type = "" last_sentence_index = entity.sentence_index result_dict = dict((key,"") for key in dict_time.keys()) for time_type,value in dict_time.items(): list_time = dict_time[time_type] if list_time: list_time.sort(key=lambda x:x[1],reverse=True) result_dict[time_type] = list_time[0][0] return result_dict def getOtherAttributes(list_entity): dict_other = {"moneysource":"", "person_review":[], "serviceTime":"", "product":[], "total_tendereeMoney":0, "total_tendereeMoneyUnit":''} for entity in list_entity: if entity.entity_type == 'bidway': dict_other["bidway"] = turnBidWay(entity.entity_text) elif entity.entity_type=='moneysource': dict_other["moneysource"] = entity.entity_text elif entity.entity_type=='serviceTime': dict_other["serviceTime"] = entity.entity_text elif entity.entity_type=="person" and entity.label ==4: dict_other["person_review"].append(entity.entity_text) elif entity.entity_type=='product': dict_other["product"].append(entity.entity_text) elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]\ \ \ \ \ \ \ \ ') for item in result: f.write(""+""+""+""+"") f.write("
doc_id角色
"+item[0]+""+item[1]+""+item[2]+"
") '''