# from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process,getDigitsDic,isValidDate from BiddingKG.dl.common.Utils import * from BiddingKG.dl.interface.Entitys import PREM,Role,Entity from decimal import Decimal import re import copy import math import pandas as pd import os from scipy.optimize import linear_sum_assignment from BiddingKG.dl.interface.Entitys import Match import numpy as np import uuid import time,calendar from datetime import datetime def getTheRole(entity,role_list): ''' @summary:根据实体名称拿到index @param: entity:实体名称 role_list:角色list @return:该实体所在下标 ''' for role_index in range(len(role_list)): if entity in role_list[role_index]: return role_index return None dict_role_id = {"0":"tenderee", "1":"agency", "2":"win_tenderer", "3":"second_tenderer", "4":"third_tenderer"} role2id_dict = {"tenderee":0, "agency":1, "win_tenderer":2, "second_tenderer":3, "third_tenderer":4} def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None): ''' @param: packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合 sentence_index:实体所在的句子 begin_index:实体所在句子的起始位置 @return:公司实体所属的包 @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None ''' ''' if len(packageList)==0: return None before_index = None after_index = None equal_index = None equal_count = 0 for pack_index in range(len(packageList)): if packageList[pack_index][1]>sentence_index and after_index is None: after_index = pack_index if packageList[pack_index][1]int(begin_index): if packageList[i-1][4]: return packageList[i-1][0] else: if packageList[i][4]: return packageList[i-1][0] else: return packageList[i][0] return packageList[end_index-1][0] ''' if len(packageList)==0: return None,False list_legalPack = [] for pack_index in range(len(packageList)): if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)): continue if DIRECT=="R" and (packageList[pack_index]["sentence_index"]sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)): if MAX_DIS is not None: if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS: list_legalPack.append(pack_index) else: list_legalPack.append(pack_index) # if (packageList[pack_index]["scope"][0][0] < sentence_index # or (packageList[pack_index]["scope"][0][0] == sentence_index # and packageList[pack_index]["scope"][0][1] <= begin_index)) # and (packageList[pack_index]["scope"][1][0] > sentence_index # or (packageList[pack_index]["scope"][1][0] == sentence_index # and packageList[pack_index]["scope"][1][1] >= begin_index)): # pass _flag = True for _index in list_legalPack: if roleid in packageList[_index]["hit"]: continue else: _flag = False packageList[_index]["hit"].add(roleid) return packageList[_index]["pointer"],_flag if len(list_legalPack)>0: return packageList[0]["pointer"],_flag return None,False #生成合法的组合 def get_legal_comba(list_entity,dict_role_combination): #拿到一个包中所有合法的组合 def circle_package(_dict_legal_combination): list_dict_role_first = [] for _role in _dict_legal_combination: if len(list_dict_role_first)==0: for _entity in _dict_legal_combination[_role]: if _entity !="": list_dict_role_first.append({_role:_entity}) else: list_dict_role_after = [] _find_count = 0 for _entity in _dict_legal_combination[_role]: if _entity !="": for _dict in list_dict_role_first: _flag = True for _key1 in _dict: if _entity==_dict[_key1]: #修改为招标人和代理人可以为同一个 if str(_key1) in ["0","1"] and str(_role) in ["0","1"]: _flag = True else: _flag = False if _flag: _find_count += 1 _new_dict = copy.copy(_dict) _new_dict[_role] = _entity if len(list_dict_role_after)>100000: break list_dict_role_after.append(_new_dict) else: # 2021/5/25 update,同一实体(entity_text)不同角色 if len(list_dict_role_after) > 100000: break for _dict in list_dict_role_first: for _key1 in _dict: if _entity == _dict[_key1]: _new_dict = copy.copy(_dict) _new_dict.pop(_key1) _new_dict[_role] = _entity list_dict_role_after.append({_role:_entity}) if len(list_dict_role_after)==0: pass else: list_dict_role_first.extend(list_dict_role_after) return list_dict_role_first def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution): last_layer = False #若是空组合则放回空 if len(_dict_legal_combination.keys())==0: return [] #递归到最后一层则修改状态 if len(_dict_legal_combination.keys())==1: last_layer = True #取一个角色开始进行遍历 _key_role = list(_dict_legal_combination.keys())[0] for item in _dict_legal_combination[_key_role]: copy_dict_one_selution = copy.copy(dict_one_selution) copy_dict_legal_combination = {} copy_set_legal_entity = copy.copy(set_legal_entity) #复制余下的所有角色,进行下一轮递归 for _key in _dict_legal_combination.keys(): if _key!=_key_role: copy_dict_legal_combination[_key] = _dict_legal_combination[_key] #修改为招标人和代理人可以为同一个 if item !="": _flag = True if str(_key_role) in ["0","1"]: for _key_flag in copy_dict_one_selution: if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item: _flag = False else: for _key_flag in copy_dict_one_selution: if copy_dict_one_selution[_key_flag]==item: _flag = False if _flag: copy_dict_one_selution[_key_role] = item ''' if item not in copy_set_legal_entity: if item !="": copy_dict_one_selution[_key_role] = item ''' copy_set_legal_entity.add(item) if last_layer: list_all_selution.append(copy_dict_one_selution) else: recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution) #递归匹配各个包的结果 def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution): last_layer = False if len(_dict_legal_combination.keys())==0: return [] if len(_dict_legal_combination.keys())==1: last_layer = True _key_pack = list(_dict_legal_combination.keys())[0] for item in _dict_legal_combination[_key_pack]: copy_dict_one_selution = copy.copy(dict_one_selution) copy_dict_legal_combination = {} for _key in _dict_legal_combination.keys(): if _key!=_key_pack: copy_dict_legal_combination[_key] = _dict_legal_combination[_key] for _key_role in item.keys(): copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role] if last_layer: list_all_selution.append(copy_dict_one_selution) else: recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution) return list_all_selution #循环获取所有包组合 def circle_pageages(_dict_legal_combination): list_all_selution = [] for _key_pack in _dict_legal_combination.keys(): list_key_selution = [] for item in _dict_legal_combination[_key_pack]: _dict = dict() for _key_role in item.keys(): _dict[_key_pack+"$$"+_key_role] = item[_key_role] list_key_selution.append(_dict) if len(list_all_selution)==0: list_all_selution = list_key_selution else: _list_all_selution = [] for item_1 in list_all_selution: for item_2 in list_key_selution: _list_all_selution.append(dict(item_1,**item_2)) list_all_selution = _list_all_selution return list_all_selution #拿到各个包解析之后的结果 _dict_legal_combination = {} for packageName in dict_role_combination.keys(): _list_all_selution = [] # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution) _list_all_selution = circle_package(dict_role_combination[packageName]) ''' # print("===1") # print(packageName) for item in _list_all_selution: # print(item) # print("===2") ''' #去除包含子集 list_all_selution_simple = [] _list_set_all_selution = [] for item_selution in _list_all_selution: item_set_selution = set() for _key in item_selution.keys(): item_set_selution.add((_key,item_selution[_key])) _list_set_all_selution.append(item_set_selution) if len(_list_set_all_selution)>1000: _dict_legal_combination[packageName] = _list_all_selution continue for i in range(len(_list_set_all_selution)): be_included = False for j in range(len(_list_set_all_selution)): if i!=j: if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]): be_included = True if not be_included: list_all_selution_simple.append(_list_all_selution[i]) _dict_legal_combination[packageName] = list_all_selution_simple _list_final_comba = [] #对各个包的结果进行排列组合 _comba_count = 1 for _key in _dict_legal_combination.keys(): _comba_count *= len(_dict_legal_combination[_key]) #如果过大,则每个包只取概率最大的那个 dict_pack_entity_prob = get_dict_entity_prob(list_entity) if _comba_count>250: new_dict_legal_combination = dict() for _key_pack in _dict_legal_combination.keys(): MAX_PROB = -1000 _MAX_PROB_COMBA = None for item in _dict_legal_combination[_key_pack]: # print(_key_pack,item) _dict = dict() for _key in item.keys(): _dict[str(_key_pack)+"$$"+str(_key)] = item[_key] _prob = getSumExpectation(dict_pack_entity_prob, _dict) if _prob>MAX_PROB: MAX_PROB = _prob _MAX_PROB_COMBA = [item] if _MAX_PROB_COMBA is not None: new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA _dict_legal_combination = new_dict_legal_combination #recursive_packages(_dict_legal_combination, {}, _list_final_comba) _list_final_comba = circle_pageages(_dict_legal_combination) #除了Project包(招标人和代理人),其他包是不会有冲突的 #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪 _list_real_comba = [] for dict_item in _list_final_comba: set_project = set() set_other = set() for _key in list(dict_item.keys()): if _key.split("$$")[0]=="Project": set_project.add(dict_item[_key]) else: set_other.add(dict_item[_key]) set_common = set_project&set_other if len(set_common)>0: dict_project = {} dict_not_project = {} for _key in list(dict_item.keys()): if dict_item[_key] in set_common: if str(_key.split("$$")[0])=="Project": dict_project[_key] = dict_item[_key] else: dict_not_project[_key] = dict_item[_key] else: dict_project[_key] = dict_item[_key] dict_not_project[_key] = dict_item[_key] _list_real_comba.append(dict_project) _list_real_comba.append(dict_not_project) else: _list_real_comba.append(dict_item) return _list_real_comba def get_dict_entity_prob(list_entity,on_value=0.5): dict_pack_entity_prob = {} for in_attachment in [False,True]: identified_role = [] if in_attachment==True: identified_role = [value[0] for value in dict_pack_entity_prob.values()] for entity in list_entity: if entity.entity_type in ['org','company'] and entity.in_attachment==in_attachment: values = entity.values role_prob = float(values[int(entity.label)]) _key = entity.packageName+"$$"+str(entity.label) if role_prob>=on_value and str(entity.label)!="5": _key_prob = _key+"$text$"+entity.entity_text if in_attachment == True: role_prob = 0.8 if role_prob>0.8 else role_prob #附件的概率修改低点 # if entity.entity_text in identified_role: # 2023/7/3 注释掉,选取概率最大的作为连接概率 # continue if _key_prob in dict_pack_entity_prob: # new_prob = role_prob+dict_pack_entity_prob[_key_prob][1] if role_prob>0.9 else max(role_prob, dict_pack_entity_prob[_key_prob][1]) # dict_pack_entity_prob[_key_prob] = [entity.entity_text, new_prob] #公司同角色多次出现概率累计 if role_prob>dict_pack_entity_prob[_key_prob][1]: dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob] else: dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob] return dict_pack_entity_prob #计算合计期望 def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5): ''' expect = 0 for entity in list_entity: if entity.entity_type in ['org','company']: values = entity.values role_prob = float(values[int(entity.label)]) _key = entity.packageName+"$$"+str(entity.label) if role_prob>on_value and str(entity.label)!="5": if _key in combination.keys() and combination[_key]==entity.entity_text: expect += math.pow(role_prob,4) else: expect -= math.pow(role_prob,4) ''' #修改为同一个实体只取对应包-角色的最大的概率值 expect = 0 dict_entity_prob = {} for _key_pack_entity in dict_pack_entity_prob: _key_pack = _key_pack_entity.split("$text$")[0] role_prob = dict_pack_entity_prob[_key_pack_entity][1] if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]: if _key_pack_entity in dict_entity_prob.keys(): if dict_entity_prob[_key_pack_entity]-role_prob: dict_entity_prob[_key_pack_entity] = -role_prob else: dict_entity_prob[_key_pack_entity] = -role_prob # for entity in list_entity: # if entity.entity_type in ['org','company']: # values = entity.values # role_prob = float(values[int(entity.label)]) # _key = entity.packageName+"$$"+str(entity.label) # if role_prob>=on_value and str(entity.label)!="5": # if _key in combination.keys() and combination[_key]==entity.entity_text: # _key_prob = _key+entity.entity_text # if _key_prob in dict_entity_prob.keys(): # if dict_entity_prob[_key_prob]-role_prob: # dict_entity_prob[_key_prob] = -role_prob # else: # dict_entity_prob[_key_prob] = -role_prob for _key in dict_entity_prob.keys(): symbol = 1 if dict_entity_prob[_key]>0 else -1 expect += symbol*math.pow(dict_entity_prob[_key],2) return expect def getRoleList(list_sentence,list_entity,on_value = 0.5): ''' @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回 @param: list_sentence:文章所有的sentence list_entity:文章所有的实体 on_value:概率阈值 @return:文章的角色list ''' pack = getPackagesFromArticle(list_sentence,list_entity) if pack is None: return None # PackageList,PackageSet,dict_PackageCode = pack PackageList,PackageSet,dict_PackageCode,main_body_pack = pack #拿到所有可能的情况 dict_role_combination = {} tenderee_or_agency_set = set() # 记录所有预测为招标或代理的实体集合 win_tenderer_set = set() # 记录所有预测为中标的实体集合 # print(PackageList) #拿到各个实体的packageName,packageCode main_contain_winner = False # 2024/10/11 判断正文是否包含中标人 for entity in list_entity: if entity.entity_type in ['org','company'] and entity.label==2 and entity.values[entity.label]>0.7 and entity.in_attachment==False: main_contain_winner = True break for entity in list_entity: if entity.entity_type in ['org','company']: #限制附件里角色values[label]最大概率prob max_prob = 0.85 if str(entity.label)!="5" and entity.in_attachment: if entity.values[entity.label]>max_prob: entity.values[entity.label] = max_prob #过滤掉字数小于3个的实体 if len(entity.entity_text)<=3: continue values = entity.values role_prob = float(values[int(entity.label)]) if role_prob>=on_value and str(entity.label)!="5": if main_contain_winner and entity.in_attachment and entity.label in [2,3,4]: # 2024/10/11 正文包含中标人,不再提取附件中标人 避免 例:504046747 附件角色OCR错字变两个标段 continue if str(entity.label) in ["0","1"]: packageName = "Project" else: if len(PackageSet)>0: packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"role-"+str(entity.label)) if packagePointer is None: #continue packageName = "Project" # print(entity.entity_text, packageName,entity.sentence_index,entity.begin_index) else: #add pointer_pack entity.pointer_pack = packagePointer packageName = packagePointer.entity_text # print(entity.entity_text, packageName) else: packageName = "Project" find_flag = False if packageName in dict_PackageCode.keys(): packageCode = dict_PackageCode[packageName] else: packageCode = "" entity.packageCode = packageCode role_name = dict_role_id.get(str(entity.label)) entity.roleName = role_name entity.packageName = packageName if entity.packageName in dict_role_combination.keys(): if str(entity.label) in dict_role_combination[entity.packageName].keys(): dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text) else: dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text]) else: dict_role_combination[entity.packageName] = {} #初始化空值 roleIds = [0,1,2,3,4] for _roleId in roleIds: dict_role_combination[entity.packageName][str(_roleId)] = set([""]) dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text) list_real_comba = get_legal_comba(list_entity,dict_role_combination) # print("===role_combination",dict_role_combination) # print("== real_comba",list_real_comba) #拿到最大期望值的组合 max_index = 0 max_expect = -100 _index = 0 dict_pack_entity_prob = get_dict_entity_prob(list_entity) for item_combination in list_real_comba: expect = getSumExpectation(dict_pack_entity_prob, item_combination) if expect>max_expect: max_index = _index max_expect = expect _index += 1 RoleList = [] RoleSet = set() if len(list_real_comba)>0: for _key in list_real_comba[max_index].keys(): packageName = _key.split("$$")[0] label = _key.split("$$")[1] role_name = dict_role_id.get(str(label)) entity_text = list_real_comba[max_index][_key] entity_prob = dict_pack_entity_prob.get(_key+'$text$'+entity_text, ['',0])[1] # entity_text = list_real_comba[max_index][_key][0] # entity_prob = list_real_comba[max_index][_key][1] if packageName in dict_PackageCode.keys(): packagecode = dict_PackageCode.get(packageName) else: packagecode = "" RoleList.append(PREM(packageName,packagecode,role_name,entity_text,entity_prob,0,0.0,[])) if str(label) in ["0", "1"]: tenderee_or_agency_set.add(entity_text) elif str(label) in ["2"] and entity_prob > 0.8: win_tenderer_set.add(entity_text) # if len(list_real_comba) > 1 and label == '2': # 20240809 由于包号对应不上注销 # multi_winner = [] # for comba in list_real_comba: # tmp_ent = comba.get(_key, '') # tmp_prob = dict_pack_entity_prob.get(_key+'$text$'+tmp_ent, ['',0])[1] # if tmp_ent !='' and tmp_prob>0.8: # multi_winner.append(comba[_key]) # if len(set(multi_winner)) > 1: # RoleList[-1].multi_winner = multi_winner # print('RoleList: ', RoleList) RoleSet.add(entity_text) #根据最优树来修正list_entity中角色对包的连接 for _entity in list_entity: if _entity.pointer_pack is not None: _pack_name = _entity.pointer_pack.entity_text _find_flag = False for _prem in RoleList: if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text: _find_flag = True if not _find_flag: _entity.pointer_pack = None return RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set,main_body_pack def getPackageScopePattern(): ''' @summary: 获取包的作用域关键词 ''' df = pd.read_excel(os.path.dirname(__file__)+"/end.xls") pattern = "(" for item in df["list_word"]: item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-") pattern += item+"|" pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}|##attachment##" return pattern pattern_packageScope = getPackageScopePattern() def getPackagesFromArticle(list_sentence, list_entity): ''' @param: list_sentence:文章的句子list @summary: 将包的信息插入list_entity中 @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息 ''' if len(list_sentence) == 0: return None list_sentence.sort(key=lambda x: x.sentence_index) PackageList = [] PackageList_scope = [] PackageSet = set() dict_packageCode = dict() main_body_pack = set() # 2024/04/28 保存正文包号 # package_number_pattern = re.compile( # '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标[段包]?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]') # 标号 # package_number_pattern = re.compile( # '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\ # |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\ # |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\ # |((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\ # |[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\ # |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{1,9})\ # |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})') other_package_pattern = re.compile( '((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目 win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整 model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整 number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}") package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]{1,20})") # 纯数字类型的包号统一,例如:'01','1' re_digital = re.compile("^\d+$") def changeIndexFromWordToWords(tokens, word_index): ''' @summary:转换某个字的字偏移为词偏移 ''' before_index = 0 after_index = 0 for i in range(len(tokens)): after_index = after_index + len(tokens[i]) if before_index <= word_index and after_index >= word_index: return i before_index = after_index package_names = [] def extractPackageCode(tokens, word_index, size=20, pattern=package_code_pattern): ''' @summary:抽取包附近的标段号 @param: tokens:包所在句子的分词 word_index:包所在字偏移 size:左右各取多少个词 pattern:提取标段号的正则 @return: type:string,meaning:标段号 ''' index = changeIndexFromWordToWords(tokens, word_index) if index < size: begin = index else: begin = index - size if index + size > len(tokens): end = len(tokens) else: end = index + size # 拿到左右两边的词语组成短语 text = "".join(tokens[begin:end]) # 在短语中的字偏移 new_word_index = word_index - len("".join(tokens[:begin])) min_distance = len(text) packageCode = None for the_iter in re.finditer(pattern, text): # 算出最小距离 distance = min([abs(new_word_index - the_iter.span()[0]), abs(new_word_index - the_iter.span()[1])]) if distance < min_distance: min_distance = distance packageCode = the_iter.group(1) return packageCode def get_package(): PackageList_scope = [] True_package = set() for i in range(len(list_sentence)): PackageList_item = [] PackageList_item_scope = [] content = list_sentence[i].sentence_text # content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')') # # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10 # content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content) # # for it in re.finditer('CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\ # |标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\ # |\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+', content): # content = content.replace(it.group(0), ' ' * len(it.group(0))) # tokens = list_sentence[i].tokens # _names = [] # for iter in re.finditer(package_number_pattern, content): # if re.search('(业绩|信誉要求):', content[:iter.start()]): # 前面有业绩或信誉的标段去掉 # continue # # print('提取到标段:%s, 前后文:%s'%(iter.group(), content[iter.start()-5:iter.end()+5])) # if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]): # 排除2.10标段3 5.4标段划分 这种情况 # # print('过滤掉错误包:', iter.group()) # continue # if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[iter.start():iter.end()+3]) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)): # # print('过滤掉错误包:', iter.group()) # continue # elif iter.end()+2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书', content[iter.start():iter.end()+2]): # # print('过滤掉错误包:',iter.group()) # continue # elif re.search('同一(标段?|包)', content[max(0, iter.start()-2):iter.end()]): # 不得参加同一标段 # # print('过滤掉错误包:', iter.group()) # continue # elif re.search('三包', content[max(0, iter.start()-2):iter.end()]) and re.search('第三包', content[max(0, iter.start()-2):iter.end()])==None: # 规规章和“三包”规定 # # print('过滤掉错误包:', iter.group()) # continue # elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)): # # print('过滤掉错误包号5:', iter.group(0)) # continue tokens = list_sentence[i].tokens _names = [] for iter in find_package(content): temp_package_number = uniform_package_name(iter.group(0)) True_package.add(temp_package_number) PackageList_item.append({"name": temp_package_number, "sentence_index": list_sentence[i].sentence_index, "offsetWords_begin": changeIndexFromWordToWords(tokens, iter.span()[0]), "offsetWord_begin": iter.span()[0], "offsetWord_end": iter.span()[1]}) # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) code = extractPackageCode(tokens, iter.span()[0]) if code is not None: dict_packageCode[temp_package_number] = code PackageSet.add(temp_package_number) if not list_sentence[i].in_attachment: # 保存不在附件的包号 main_body_pack.add(temp_package_number) # 识别packageScope for iter in re.finditer(pattern_packageScope, content): PackageList_item_scope.append({"name": "", "sentence_index": list_sentence[i].sentence_index, "offsetWords_begin": changeIndexFromWordToWords(tokens, iter.span()[0]), "offsetWord_begin": iter.span()[0], "offsetWord_end": iter.span()[1]}) # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) PackageList_item_scope = PackageList_item + PackageList_item_scope PackageList_item_scope.sort(key=lambda x: x["offsetWord_begin"]) PackageList_scope = PackageList_scope + PackageList_item_scope PackageList_item.sort(key=lambda x: x["sentence_index"]) return PackageList_scope, True_package def get_win_project(): '''获取多个项目多个中标人的项目''' PackageList_scope = [] True_package = set() # 2020/11/23 大网站规则 调整 if len(PackageSet) == 0 and len( set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label == 2])) > 1: for i in range(len(list_sentence)): PackageList_item = [] PackageList_item_scope = [] content = list_sentence[i].sentence_text tokens = list_sentence[i].tokens names = re.findall(other_package_pattern, content) N_names = re.findall(win_tenderer_pattern, content) if len(names) != 1 or len(N_names) != 1: continue for iter in re.finditer(other_package_pattern, content): temp_package_number = iter.group(4) xinghao = re.search(model_pattern, content) if xinghao: temp_package_number = temp_package_number + '+' + xinghao.group(2) # print('新正则采购包名补充',temp_package_number) if re.search(re_digital, temp_package_number): temp_package_number = str(int(temp_package_number)) True_package.add(temp_package_number) PackageList_item.append( {"name": temp_package_number, "sentence_index": list_sentence[i].sentence_index, "offsetWords_begin": changeIndexFromWordToWords(tokens, iter.span()[0]), "offsetWord_begin": iter.span()[0], "offsetWord_end": iter.span()[1]}) # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) code = extractPackageCode(tokens, iter.span()[0]) if code is not None: dict_packageCode[temp_package_number] = code PackageSet.add(temp_package_number) if not list_sentence[i].in_attachment: # 保存不在附件的包号 main_body_pack.add(temp_package_number) # 识别packageScope for iter in re.finditer(pattern_packageScope, content): PackageList_item_scope.append({"name": "", "sentence_index": list_sentence[i].sentence_index, "offsetWords_begin": changeIndexFromWordToWords(tokens, iter.span()[0]), "offsetWord_begin": iter.span()[0], "offsetWord_end": iter.span()[1]}) # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]]) PackageList_item_scope = PackageList_item + PackageList_item_scope PackageList_item_scope.sort(key=lambda x: x["offsetWord_begin"]) PackageList_scope = PackageList_scope + PackageList_item_scope PackageList_item.sort(key=lambda x: x["sentence_index"]) return PackageList_scope, True_package def get_package_scope(PackageList_scope): PackageList = [] pattern_punctuation = "[::()\(\),,。;;]" # print("===packageList_scope",PackageList_scope) for i in range(len(list_sentence)): for j in range(len(PackageList_scope)): if i == PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"] != "": _flag = False left_str = list_sentence[i].sentence_text[ PackageList_scope[j]["offsetWord_begin"] - 30:PackageList_scope[j][ "offsetWord_begin"] + 1] right_str = list_sentence[i].sentence_text[ PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"] + 30] _left_find = re.findall(pattern_punctuation, left_str) _right_find = re.findall(pattern_punctuation, right_str) # print(left_str) if re.search("同", left_str[-1:]) is not None and PackageList_scope[j]["name"] == "一": continue if re.search("划分", right_str[:10]) is not None: continue if len(_left_find) > 0 and _left_find[-1] in [":", ":"]: _flag = True if len(_right_find) > 0 and _right_find[0] in [":", ":"]: _flag = True if _flag: scope_begin = [PackageList_scope[j]["sentence_index"], PackageList_scope[j]["offsetWords_begin"]] else: scope_begin = [PackageList_scope[j]["sentence_index"], 0] # 2024/10/10 改为包作用域开始位置为包号所在句子开头 # if j == 0: # scope_begin = [0, 0] # else: # scope_begin = [PackageList_scope[j - 1]["sentence_index"], # PackageList_scope[j - 1]["offsetWords_begin"]] if j == len(PackageList_scope) - 1: scope_end = [list_sentence[-1].sentence_index, changeIndexFromWordToWords(list_sentence[-1].tokens, len(list_sentence[ -1].sentence_text))] else: scope_end = [PackageList_scope[j + 1]["sentence_index"], PackageList_scope[j + 1]["offsetWords_begin"]] if j>0 and PackageList_scope[j - 1]["sentence_index"] == PackageList_scope[j]["sentence_index"] and \ PackageList_scope[j - 1]["offsetWord_begin"] <= PackageList_scope[j]["offsetWord_begin"] and \ PackageList_scope[j - 1]["offsetWord_end"] >= PackageList_scope[j]["offsetWord_end"]: continue # add package to entity _pack_entity = Entity(doc_id=list_sentence[0].doc_id, entity_id="%s_%s_%s_%s" % ( list_sentence[0].doc_id, i, PackageList_scope[j]["offsetWord_begin"], PackageList_scope[j]["offsetWord_begin"]), entity_text=PackageList_scope[j]["name"], entity_type="package", sentence_index=PackageList_scope[j]["sentence_index"], begin_index=changeIndexFromWordToWords(list_sentence[i].tokens, PackageList_scope[j][ "offsetWord_begin"]), end_index=changeIndexFromWordToWords(list_sentence[i].tokens, PackageList_scope[j]["offsetWord_end"]), wordOffset_begin=PackageList_scope[j]["offsetWord_begin"], wordOffset_end=PackageList_scope[j]["offsetWord_end"], in_attachment=list_sentence[i].in_attachment) list_entity.append(_pack_entity) copy_pack = copy.copy(PackageList_scope[j]) copy_pack["scope"] = [scope_begin, scope_end] copy_pack["hit"] = set() copy_pack["pointer"] = _pack_entity PackageList.append(copy_pack) return PackageList PackageList_scope, True_package = get_package() # PackageList_scope2, True_package2 = get_win_project() # 20240508 与表格提取重复,去掉 # if len(True_package2) > 2: # 同时包含多标段及多中标人的 # PackageList_scope = PackageList_scope + PackageList_scope2 PackageList = get_package_scope(PackageList_scope) # if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project 2024/02/02 注释掉,防止多标段每篇公告只公布一个标段的没法提取标段号 # return [], set(), {} return PackageList, PackageSet, dict_packageCode, main_body_pack # km配对方法 def dispatch(match_list): main_roles = list(set([match.main_role for match in match_list])) attributes = list(set([match.attribute for match in match_list])) label = np.zeros(shape=(len(main_roles), len(attributes))) for match in match_list: main_role = match.main_role attribute = match.attribute value = match.value label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000 # print(label) gragh = -label # km算法 row, col = linear_sum_assignment(gragh) max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value] # return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch] return [(main_roles[row], attributes[col]) for row, col in max_dispatch] from BiddingKG.dl.common.Utils import getUnifyMoney from BiddingKG.dl.interface.modelFactory import Model_relation_extraction relationExtraction_model = Model_relation_extraction() def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,on_value = 0.5,on_value_person=0.5,sentence_len=4): ''' @param: PackDict:文章包dict roleSet:文章所有角色的公司名称 PackageList:文章的包信息 PackageSet:文章所有包的名称 list_entity:文章所有经过模型处理的实体 on_value:金额模型的阈值 on_value_person:联系人模型的阈值 sentence_len:公司和属性间隔句子的最大长度 @return:添加了属性信息的角色list ''' #根据roleid添加金额到rolelist中 def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)): if money_prob>packDict[packageName]["roleList"][i].money_prob: packDict[packageName]["roleList"][i].money = money packDict[packageName]["roleList"][i].money_prob = money_prob return packDict #根据实体名称添加金额到rolelist中 def addMoneyByEntity(packDict,packageName,entity,money,money_prob): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].entity_text==entity: # if money_prob>packDict[packageName]["roleList"][i].money_prob: # packDict[packageName]["roleList"][i].money = money # packDict[packageName]["roleList"][i].money_prob = money_prob if packDict[packageName]["roleList"][i].money_prob==0 : # 2021/7/20第一次更新金额 if money.notes == '单价': packDict[packageName]["roleList"][i].unit_price = money.entity_text else: packDict[packageName]["roleList"][i].money = money.entity_text packDict[packageName]["roleList"][i].money_prob = money_prob packDict[packageName]["roleList"][i].money_unit = money.money_unit elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额, # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob) # print('链接金额备注 ',money.notes, money.entity_text, money.values) if money.notes == '单价': packDict[packageName]["roleList"][i].unit_price = money.entity_text else: packDict[packageName]["roleList"][i].money = money.entity_text packDict[packageName]["roleList"][i].money_prob = money_prob packDict[packageName]["roleList"][i].money_unit = money.money_unit # print('链接中的金额:{0}, 单位:{1}'.format(money.entity_text, money.money_unit)) return packDict def addRatioByEntity(packDict,packageName,entity,ratio): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].entity_text==entity: packDict[packageName]["roleList"][i].ratio = ratio.ratio_value def addServiceTimeByEntity(packDict,packageName,entity,serviceTime): for i in range(len(packDict[packageName]["roleList"])): if packDict[packageName]["roleList"][i].entity_text==entity and not packDict[packageName]["roleList"][i].serviceTime: # packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text packDict[packageName]["roleList"][i].serviceTime = extract_serviceTime(serviceTime.entity_text,"") #根据实体名称得到角色 def getRoleWithText(packDict,entity_text): for pack in packDict.keys(): for i in range(len(packDict[pack]["roleList"])): if packDict[pack]["roleList"][i].entity_text==entity_text: return packDict[pack]["roleList"][i].role_name def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet): _list_entitys = [entity]+entity.linked_entitys for _entity in _list_entitys: if _entity.entity_text in RoleSet: return True p_entity = 0 # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000 # money_list = [it for it in list_entity if it.entity_type=="money"] # for i in range(len(money_list)-1): # for j in range(1, len(money_list)): # if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \ # Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000: # money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000) # # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000') '''同样金额同时有元及万元单位的,把万元的金额改为元''' wanyuan = [] yuan = [] for it in list_entity: if it.entity_type == "money" and float(it.entity_text)>1000000: # 20240523 修改为百万以上金额才对比万倍关系,其他又行业限额纠正避免有些万元单位提取不到从而被除一万 例:52435607 最高限价(万元):22679.32 蜀冈招标控制价22679.32工程地点南路西侧(万元) if it.money_unit == '万元' or float(it.entity_text)>5000000000: wanyuan.append(it) if it.money_unit == '元' or float(it.entity_text)<5000000: yuan.append(it) if wanyuan != [] and yuan != []: for m1 in wanyuan: for m2 in yuan: if Decimal(m1.entity_text)/Decimal(m2.entity_text) == 10000: m1.entity_text = m2.entity_text #遍历所有实体 # while(p_entity=on_value: if str(entity.label)=="0": packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label)) if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label]) ''' ''' # 2020/11/25 与下面的联系人连接步骤重复,取消 if entity.entity_type=="person": if entity.values[entity.label]>=on_value_person: if str(entity.label)=="1": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="tenderee": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="tenderee": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee": _entity.pointer_person = entity elif str(entity.label)=="2": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name=="agency": PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # add pointer_person for _entity in list_entity: if dict_role_id.get(str(_entity.label))=="agency": for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency": _entity.pointer_person = entity ''' # #金额往前找实体 # if entity.entity_type=="money": # if entity.values[entity.label]>=on_value: # p_entity_money= p_entity # entity_money = list_entity[p_entity_money] # if len(PackageSet)>0: # packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label)) # if packagePointer is None: # packageName_entity = "Project" # else: # packageName_entity = packagePointer.entity_text # else: # packageName_entity = "Project" # while(p_entity_money>0): # entity_before = list_entity[p_entity_money] # if entity_before.entity_type in ['org','company']: # if str(entity_before.label)=="1": # addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label]) # #add pointer_money # entity_before.pointer_money = entity_money # break # p_entity_money -= 1 #如果实体属于角色集合,则往后找属性 # if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet): # # p_entity += 1 # #循环查找符合的属性 # while(p_entity=sentence_len: # p_entity -= 1 # break # #若是遇到公司实体,则跳出循环 # if entity_after.entity_type in ['org','company']: # p_entity -= 1 # break # if entity_after.values is not None: # if entity_after.entity_type=="money": # if entity_after.values[entity_after.label]>=on_value: # ''' # #招标金额从后往前找 # if str(entity_after.label)=="0": # packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label)) # if packagePointer is None: # packageName = "Project" # else: # packageName = packagePointer.entity_text # addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label]) # ''' # if str(entity_after.label)=="1": # #print(entity_after.entity_text,entity.entity_text) # _list_entitys = [entity]+entity.linked_entitys # if len(PackageSet)>0: # packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label)) # if packagePointer is None: # packageName_entity = "Project" # else: # packageName_entity = packagePointer.entity_text # else: # packageName_entity = "Project" # if str(entity.label) in ["2","3","4"]: # # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label]) # if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况 # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after, # 0.5) # entity.pointer_money = entity_after # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) # else: # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after, # entity_after.values[entity_after.label]) # entity.pointer_money = entity_after # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) # if entity_after.values[entity_after.label]>0.6: # break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额 # #add pointer_money # # entity.pointer_money = entity_after # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text) # # if entity_after.notes!='单价': # # break # 2021/7/16 新增,找到中标金额即停止,不再往后找金额 # ''' # if entity_after.entity_type=="person": # if entity_after.values[entity_after.label]>=on_value_person: # if str(entity_after.label)=="1": # for i in range(len(roleList)): # if roleList[i].role_name=="tenderee": # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # elif str(entity_after.label)=="2": # for i in range(len(roleList)): # if roleList[i].role_name=="agency": # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # elif str(entity_after.label)=="3": # _list_entitys = [entity]+entity.linked_entitys # for _entity in _list_entitys: # for i in range(len(roleList)): # if roleList[i].entity_text==_entity.entity_text: # if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0: # break # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # ''' # # p_entity += 1 # # p_entity += 1 # 记录每句的分词数量 tokens_num_dict = dict() last_tokens_num = 0 for sentence in list_sentence: _index = sentence.sentence_index if _index == 0: tokens_num_dict[_index] = 0 else: tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num last_tokens_num = len(sentence.tokens) attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额” for link_attribute in attribute_type: temp_entity_list = [] if link_attribute=="money": temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)] # 删除重复的‘中投标金额’,一般为大小写两种样式 drop_tendererMoney = [] for ent_idx in range(len(temp_entity_list)-1): entity = temp_entity_list[ent_idx] if entity.entity_type=='money': next_entity = temp_entity_list[ent_idx+1] if next_entity.entity_type=='money': if getUnifyMoney(entity.entity_text)==getUnifyMoney(next_entity.entity_text): if (tokens_num_dict[next_entity.sentence_index] + next_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) < 10: drop_tendererMoney.append(next_entity) for _drop in drop_tendererMoney: temp_entity_list.remove(_drop) elif link_attribute=="serviceTime": temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or ent.entity_type=='serviceTime'] elif link_attribute=="ratio": temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or ent.entity_type=='ratio'] temp_entity_list = sorted(temp_entity_list,key=lambda x: (x.sentence_index, x.begin_index)) temp_match_list = [] for ent_idx in range(len(temp_entity_list)): entity = temp_entity_list[ent_idx] if entity.entity_type in ['org','company']: match_nums = 0 tenderer_nums = 0 #经过其他中投标人的数量 byNotTenderer_match_nums = 0 #跟在中投标人后面的属性 for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)): after_entity = temp_entity_list[after_index] if entity.in_attachment != after_entity.in_attachment: # 正文与附件的不能相连 break if after_entity.entity_type == link_attribute: distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) sentence_distance = after_entity.sentence_index - entity.sentence_index value = (-1 / 2 * (distance ** 2)) / 10000 if link_attribute == "money": if after_entity.notes == '单价': value = value * 100 if sentence_distance == 0: if distance < 100: # value = (-1 / 2 * (distance ** 2)) / 10000 temp_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if not tenderer_nums: byNotTenderer_match_nums += 1 else: break else: if distance < 60: # value = (-1 / 2 * (distance ** 2)) / 10000 temp_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if not tenderer_nums: byNotTenderer_match_nums += 1 else: break else: tenderer_nums += 1 #前向查找属性 if ent_idx!=0 and (not match_nums or not byNotTenderer_match_nums): previous_entity = temp_entity_list[ent_idx - 1] if previous_entity.entity_type == link_attribute: # if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index) if distance < 40: # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) temp_match_list.append(Match(entity, previous_entity, value)) # km算法分配求解 dispatch_result = dispatch(temp_match_list) dispatch_result = sorted(dispatch_result, key=lambda x: (x[0].sentence_index,x[0].begin_index)) for match in dispatch_result: _entity = match[0] _attribute = match[1] if link_attribute=='money': _entity.pointer_money = _attribute packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index, "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label)) # print(_entity.entity_text,_attribute.entity_text) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text if _attribute.notes == '单价' or float(_attribute.entity_text) < 5000: # 2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况 # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label]) addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,0.5) else: # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label]) addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute, _attribute.values[_attribute.label]) elif link_attribute=='serviceTime': _entity.pointer_serviceTime = _attribute packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index, "serviceTime-" + str(_attribute.entity_text) + "-" + str(_attribute.label)) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text addServiceTimeByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute) elif link_attribute=='ratio': _entity.pointer_ratio = _attribute packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index, "ratio-" + str(_attribute.entity_text) + "-" + str(_attribute.label)) if packagePointer is None: packageName_entity = "Project" else: packageName_entity = packagePointer.entity_text addRatioByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute) '''''' # 通过模型分类的招标/代理联系人 list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index) person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]] tenderee_contact = set() tenderee_phone = set() agency_contact = set() agency_phone = set() winter_contact = set() for _person in person_list: if _person.label == 1: tenderee_contact.add(_person.entity_text) if _person.label == 2: agency_contact.add(_person.entity_text) # 正则匹配无 '主体/联系人' 的电话 # 例:"采购人联系方式:0833-5226788," phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \ '\+86.?1[3-9]\d{9}|' \ '0[1-9]\d{1,2}[-—-―][2-9]\d{6,7}/[1-9]\d{6,10}|' \ '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?.?转\d{1,4}|' \ '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|' \ '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=1[3-9]\d{9})|' \ '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|' \ '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?(?=[2-9]\d{6,7})|' \ '0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?|' \ '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' \ '[2-9]\d{6,7})' re_tenderee_phone = re.compile( "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)" # 电话号码 + phone_pattern) # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788," re_tenderee_phone2 = re.compile( "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)" # 电话号码 + phone_pattern) re_agent_phone = re.compile( "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)" # 电话号码 + phone_pattern) re_agent_phone2 = re.compile( "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)" # 电话号码 + phone_pattern) content = "" for _sentence in list_sentence: content += "".join(_sentence.tokens) _content = copy.deepcopy(content) while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content): content_words = list(content) for i in re.finditer("(.)(,)([^0-9])", content): content_words[i.span(2)[0]] = "" for i in re.finditer("([^0-9])(,)(.)", content): content_words[i.span(2)[0]] = "" content = "".join(content_words) content = re.sub("[::]|[\((]|[\))]", "", content) _tenderee_phone = re.findall(re_tenderee_phone, content) # 更新正则确定的角色属性 for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name == "tenderee": _tenderee_phone = re.findall(re_tenderee_phone, content) if _tenderee_phone: for _phone in _tenderee_phone: _phone = _phone.split("/") # 分割多个号码 for one_phone in _phone: PackDict["Project"]["roleList"][i].linklist.append(("", one_phone)) tenderee_phone.add(one_phone) _tenderee_phone2 = re.findall(re_tenderee_phone2, content) if _tenderee_phone2: for _phone in _tenderee_phone2: _phone = _phone.split("/") for one_phone in _phone: PackDict["Project"]["roleList"][i].linklist.append(("", one_phone)) tenderee_phone.add(one_phone) if PackDict["Project"]["roleList"][i].role_name == "agency": _agent_phone = re.findall(re_agent_phone, content) if _agent_phone: for _phone in _agent_phone: _phone = _phone.split("/") for one_phone in _phone: PackDict["Project"]["roleList"][i].linklist.append(("", one_phone)) agency_phone.add(one_phone) _agent_phone2 = re.findall(re_agent_phone2, content) if _agent_phone2: for _phone in _agent_phone2: _phone = _phone.split("/") for one_phone in _phone: PackDict["Project"]["roleList"][i].linklist.append(("", one_phone)) agency_phone.add(one_phone) # 正则提取电话号码实体 # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})') phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' '\+86.?1[3-9]\d{9}|' # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|' '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|' '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|' '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|' '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|' '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' '400\d{7}转\d{1,4}|' '[2-9]\d{6,7}') url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[#$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+") email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@" "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})") phone_entitys = [] code_entitys = [ent for ent in list_entity if ent.entity_type=='code'] for _sentence in list_sentence: sentence_text = _sentence.sentence_text # 过长数字串直接过滤替换 for _re in re.findall("\d{50,}",sentence_text): sentence_text = sentence_text.replace(_re,"#"*len(_re)) in_attachment = _sentence.in_attachment list_tokenbegin = [] begin = 0 for i in range(0, len(_sentence.tokens)): list_tokenbegin.append(begin) begin += len(str(_sentence.tokens[i])) list_tokenbegin.append(begin + 1) # 排除网址、邮箱、项目编号实体 error_list = [] for i in re.finditer(url_pattern, sentence_text): error_list.append((i.start(), i.end())) for i in re.finditer(email_pattern, sentence_text): error_list.append((i.start(), i.end())) for code_ent in [ent for ent in code_entitys if ent.sentence_index==_sentence.sentence_index]: error_list.append((code_ent.wordOffset_begin,code_ent.wordOffset_end)) res_set = set() for i in re.finditer(phone, sentence_text): is_continue = False for error_ent in error_list: if i.start()>=error_ent[0] and i.end()<=error_ent[1]: is_continue = True break if is_continue: continue res_set.add((i.group(), i.start(), i.end())) res_set = sorted(list(res_set),key=lambda x:x[1]) # 限制数量,防止异常数据处理时间过长 res_set = res_set[:200] last_phone_mask = True error_numStr_index = [] sentence_phone_list = [] for item_idx in range(len(res_set)): item = res_set[item_idx] phone_left = sentence_text[max(0, item[1] - 10):item[1]] phone_right = sentence_text[item[2]:item[2] + 10] phone_left_num = re.search("[\da-zA-Z\-—-―]+$",phone_left) numStr_left = item[1] if phone_left_num: numStr_left -= len(phone_left_num.group()) phone_right_num = re.search("^[\da-zA-Z\-—-―]+",phone_right) numStr_right = item[2] if phone_right_num: numStr_right += len(phone_right_num.group()) numStr_index = (numStr_left,numStr_right) if re.search("电话|手机|联系[人方]|联系方式",re.sub(",","",phone_left)): pass else: # 排除“传真号”和其它错误项 if re.search("传,?真|信,?箱|邮,?[编箱件]|QQ|qq", phone_left): if not re.search("电,?话", phone_left): error_numStr_index.append(numStr_index) last_phone_mask = False continue if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|费率|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)): error_numStr_index.append(numStr_index) last_phone_mask = False continue if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right): error_numStr_index.append(numStr_index) last_phone_mask = False continue # 号码含有0过多,不符合规则 if re.search("0{6,}",item[0]): error_numStr_index.append(numStr_index) last_phone_mask = False continue # 前后跟着字母 if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right): error_numStr_index.append(numStr_index) last_phone_mask = False continue # 时间日期类排除 if re.search("时间|日期", phone_left): error_numStr_index.append(numStr_index) last_phone_mask = False continue # 排除号码实体为时间格式 ,例如:20150515 if re.search("^20(1[0-9]|2[0-5])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]): error_numStr_index.append(numStr_index) last_phone_mask = False continue # 前后跟着长度小于一定值数字的正则排除 if re.search("\d+[-—-―]?\d*$",phone_left) or re.search("^\d+[-—-―]?\d*",phone_right): phone_left_number = re.search("\d+[-—-―]?\d*$",phone_left) phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right) if phone_left_number: if len(phone_left_number.group())<7: error_numStr_index.append(numStr_index) last_phone_mask = False continue if phone_right_number: if len(phone_right_number.group())<7: error_numStr_index.append(numStr_index) last_phone_mask = False continue left_context = re.search("[\da-zA-Z\-—-―]+$",sentence_text[:item[1]]) if left_context: if len(left_context.group()) != len("".join(re.findall(phone, left_context.group()))): # if not re.search("(" + phone.pattern + ")$", left_context.group()): error_numStr_index.append(numStr_index) last_phone_mask = False continue right_context = re.search("^[\da-zA-Z\-—-―]+", sentence_text[item[2]:]) if right_context: if len(right_context.group()) != len("".join(re.findall(phone, right_context.group()))): # if not re.search("^(" + phone.pattern + ")", right_context.group()): error_numStr_index.append(numStr_index) last_phone_mask = False continue # if:上一个phone实体不符合条件 if not last_phone_mask: item_start = item[1] last_item_end = res_set[item_idx-1][2] if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]): error_numStr_index.append(numStr_index) last_phone_mask = False continue sentence_phone_list.append(item) last_phone_mask = True if error_numStr_index: drop_list = [] for item in sentence_phone_list: for err_index in error_numStr_index: if (item[1]>=err_index[0] and item[1]<=err_index[1]) or (item[2]>=err_index[0] and item[2]<=err_index[1]) or (item[1]<=err_index[0] and item[2]>=err_index[1]): drop_list.append(item) break for _drop_item in drop_list: sentence_phone_list.remove(_drop_item) for item in sentence_phone_list: for j in range(len(list_tokenbegin)): if list_tokenbegin[j] == item[1]: begin_index = j break elif list_tokenbegin[j] > item[1]: begin_index = j - 1 break for j in range(begin_index, len(list_tokenbegin)): if list_tokenbegin[j] >= item[2]: end_index = j - 1 break phone_text = re.sub("[-—-―]+","-",item[0]).replace("(","(").replace(")",")") _entity = Entity(_sentence.doc_id, None, phone_text, "phone", _sentence.sentence_index, begin_index, end_index, item[1], item[2],in_attachment=in_attachment) phone_entitys.append(_entity) # print('phone_set:',set([ent.entity_text for ent in phone_entitys])) def is_company(entity,text): # 判断"公司"实体是否为地址地点 if entity.label!=5 and entity.values[entity.label]>0.5: return True if ent.is_tail==True: return False entity_left = text[max(0,entity.wordOffset_begin-10):entity.wordOffset_begin] entity_left = re.sub(",()\(\)","",entity_left) entity_left = entity_left[-5:] if re.search("地址|地点|银行[::]",entity_left): return False else: return True pre_entity = [] for ent in list_entity: if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \ or (ent.entity_type=='location' and len(ent.entity_text)>5): pre_entity.append(ent) text_data,pre_data = relationExtraction_model.encode(pre_entity + phone_entitys, list_sentence) # print(pre_data) maxlen = 512 relation_list = [] if 04: break for _text_data, _pre_data in temp_data: relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data)) temp_data = [] start = start + maxlen - 120 if temp_data: deal_data += len(temp_data) if deal_data <= 4: for _text_data, _pre_data in temp_data: relation_list.extend(relationExtraction_model.predict(_text_data, _pre_data)) # print("预测数据:",len(temp_data)) # 去重结果 relation_list = list(set(relation_list)) # print([(rel[0].entity_text,rel[2].entity_text) for rel in relation_list]) right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')] linked_company = set() linked_person = set() linked_connetPerson = set() linked_phone = set() for predicate in ["rel_address","rel_phone","rel_person"]: _match_list = [] _match_combo = [] for relation in relation_list: _subject = relation[0] _object = relation[2] if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination: if _subject.in_attachment != _object.in_attachment: continue if relation[1]==predicate: distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - ( tokens_num_dict[_subject.sentence_index] + _subject.end_index) if predicate=="rel_person": if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact): continue # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人 if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]): continue # 角色为招标/代理人,排除"纪检|监察"相关的联系人 if _subject.label in [0,1] and re.search("纪检|监察",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]): continue if _object.sentence_index!=0 and _object.wordOffset_begin<=10: if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系", list_sentence[_object.sentence_index-1].sentence_text[-10:]+ list_sentence[_object.sentence_index].sentence_text[0:_object.wordOffset_begin]): continue # 角色为中标候选人,排除距离过远的联系人 if _subject.label in [2, 3, 4] and distance>=40: continue if distance>0: value = (-1 / 2 * (distance ** 2))/10000 else: distance = abs(distance) value = (-1 / 2 * (distance ** 2)) _match_list.append(Match(_subject,_object,value)) _match_combo.append((_subject,_object)) match_result = dispatch(_match_list) error_list = [] for mat in list(set(_match_combo)-set(match_result)): for temp in match_result: if mat[1]==temp[1] and mat[0]!=temp[0]: error_list.append(mat) break result = list(set(_match_combo)-set(error_list)) if predicate=='rel_person': # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接) result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: is_continue = False if not combo[0].pointer_person: combo[0].pointer_person = [] if combo[1].begin_indexcombo[0].begin_index: is_continue = True break if is_continue: continue combo[0].pointer_person.append(combo[1]) linked_company.add(combo[0]) linked_person.add(combo[1]) # print(1,combo[0].entity_text,combo[1].entity_text) if predicate=='rel_address': result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: if combo[0].pointer_address: continue combo[0].pointer_address = combo[1] # print(2,combo[0].entity_text,combo[1].entity_text) if predicate=='rel_phone': result = sorted(result,key=lambda x:x[1].begin_index,reverse=True) for combo in result: is_continue = False if not combo[0].person_phone: combo[0].person_phone = [] if combo[1].begin_indexcombo[0].begin_index: is_continue = True break if is_continue: continue combo[0].person_phone.append(combo[1]) linked_connetPerson.add(combo[0]) linked_phone.add(combo[1]) if combo[0].label in [1,2]: if PackDict.get("Project"): for i in range(len(PackDict["Project"]["roleList"])): if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \ or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'): PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text)) break # print(3,combo[0].entity_text,combo[1].entity_text) # "公司——地址" 链接规则补充 company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']] # company_lacation_EntityList = [ent for ent in pre_entity if (ent.entity_type in ['company', 'org'] and ent.label!=5) or ent.entity_type=="location"] company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index)) t_match_list = [] for ent_idx in range(len(company_lacation_EntityList)): entity = company_lacation_EntityList[ent_idx] if entity.entity_type in ['company', 'org'] and entity.label!=5: match_nums = 0 company_nums = 0 # 经过其他公司的数量 location_nums = 0 # 经过电话的数量 for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)): after_entity = company_lacation_EntityList[after_index] if after_entity.entity_type == "location": distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) location_nums += 1 if distance > 100 or location_nums >= 3: break sentence_distance = after_entity.sentence_index - entity.sentence_index value = (-1 / 2 * (distance ** 2)) / 10000 if sentence_distance == 0: if distance < 80: t_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if company_nums: break else: if distance < 50: t_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if company_nums: break else: # type:company/org company_nums += 1 if entity.label in [2, 3, 4] and after_entity.label in [0, 1]: break if entity.label in [0, 1] and after_entity.label in [2, 3, 4]: break if entity.label in [0, 1] and after_entity.label not in [0, 1]: break # km算法分配求解 # for item in t_match_list: # print("loc_rela",item.main_role.entity_text,item.attribute.entity_text) relate_location_result = dispatch(t_match_list) relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index)) for match in relate_location_result: _company = match[0] _relation = match[1] # print("loc_relation1", _company.entity_text, _relation.entity_text, ) if not _company.pointer_address: # print('loc_relation2',_company.entity_text,_relation.entity_text) _company.pointer_address = _relation # "联系人——联系电话" 链接规则补充 person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']] person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index)) t_match_list = [] for ent_idx in range(len(person_phone_EntityList)): entity = person_phone_EntityList[ent_idx] if entity.entity_type=="person": match_nums = 0 person_nums = 0 # 经过其他中联系人的数量 byNotPerson_match_nums = 0 # 跟在联系人后面的属性 phone_nums = 0 # 经过电话的数量 for after_index in range(ent_idx + 1, min(len(person_phone_EntityList), ent_idx + 8)): after_entity = person_phone_EntityList[after_index] if after_entity.entity_type == "phone": distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) phone_nums += 1 if distance>100 or phone_nums>=4: break sentence_distance = after_entity.sentence_index - entity.sentence_index value = (-1 / 2 * (distance ** 2)) / 10000 if sentence_distance == 0: if distance < 70: # value = (-1 / 2 * (distance ** 2)) / 10000 t_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if not person_nums: byNotPerson_match_nums += 1 else: break else: if distance < 40: # value = (-1 / 2 * (distance ** 2)) / 10000 t_match_list.append(Match(entity, after_entity, value)) match_nums += 1 if not person_nums: byNotPerson_match_nums += 1 else: break else: person_nums += 1 # 前向查找属性 if ent_idx != 0 and (not match_nums or not byNotPerson_match_nums): previous_entity = person_phone_EntityList[ent_idx - 1] if previous_entity.entity_type == 'phone': # if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index) if distance < 40: # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) t_match_list.append(Match(entity, previous_entity, value)) # km算法分配求解(person-phone) t_match_list = [mat for mat in t_match_list if mat.main_role not in linked_connetPerson and mat.attribute not in linked_phone] personphone_result = dispatch(t_match_list) personphone_result = sorted(personphone_result, key=lambda x: (x[0].sentence_index, x[0].begin_index)) for match in personphone_result: _person = match[0] _phone = match[1] if not _person.person_phone: _person.person_phone = [] _person.person_phone.append(_phone) # 多个招标人/代理人或者别称 for idx in range(1,len(pre_entity)): _pre_entity = pre_entity[idx] if _pre_entity in linked_company and _pre_entity.label==5: last_ent = pre_entity[idx-1] if last_ent.entity_type in ['company','org'] and last_ent.label in [0,1]: if last_ent.sentence_index==_pre_entity.sentence_index: mid_text = list_sentence[_pre_entity.sentence_index].sentence_text[last_ent.wordOffset_end:_pre_entity.wordOffset_begin] if len(mid_text)<=20 and "," not in mid_text and re.search("[、\((]",mid_text): _pre_entity.label = last_ent.label _pre_entity.values[last_ent.label] = 0.6 # 2022/01/25 固定电话可连多个联系人 temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person'] temp_person_entitys2 = [] #和固定电话相连的联系人 for entity in temp_person_entitys: if entity.person_phone: for _phone in entity.person_phone: if not re.search("^1[3-9]\d{9}$", _phone.entity_text): temp_person_entitys2.append(entity) break for index in range(len(temp_person_entitys)): entity = temp_person_entitys[index] if entity in temp_person_entitys2: last_person = entity for after_index in range(index + 1, min(len(temp_person_entitys), index + 5)): after_entity = temp_person_entitys[after_index] if after_entity.sentence_index == last_person.sentence_index and after_entity.begin_index - last_person.end_index < 3: for _phone in entity.person_phone: if not re.search("^1[3-9]\d{9}$", _phone.entity_text): if _phone not in after_entity.person_phone: after_entity.person_phone.append(_phone) last_person = after_entity else: break if index==0: continue last_person = entity for before_index in range(index-1, max(-1,index-5), -1): before_entity = temp_person_entitys[before_index] if before_entity.sentence_index == last_person.sentence_index and last_person.begin_index - before_entity.end_index < 3: for _phone in entity.person_phone: if not re.search("^1[3-9]\d{9}$", _phone.entity_text): if _phone not in before_entity.person_phone: before_entity.person_phone.append(_phone) last_person = before_entity else: break # 更新person为招标/代理联系人的联系方式 for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": for _person in person_list: if _person.label==1:#招标联系人 person_phone = [phone for phone in _person.person_phone] if _person.person_phone else [] for _p in person_phone: PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text)) if not person_phone: PackDict[k]["roleList"][i].linklist.append((_person.entity_text,"")) if PackDict[k]["roleList"][i].role_name == "agency": for _person in person_list: if _person.label==2:#代理联系人 person_phone = [phone for phone in _person.person_phone] if _person.person_phone else [] for _p in person_phone: PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text)) if not person_phone: PackDict[k]["roleList"][i].linklist.append((_person.entity_text,"")) # 更新 PackDict not_sure_linked = [] for link_p in list(linked_company): for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 0: not_sure_linked.append(link_p) continue if PackDict[k]["roleList"][i].entity_text == link_p.entity_text: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in agency_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in agency_contact and _p.entity_text not in agency_phone: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) elif PackDict[k]["roleList"][i].role_name == "agency": if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 1: not_sure_linked.append(link_p) continue if PackDict[k]["roleList"][i].entity_text == link_p.entity_text: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in tenderee_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) else: if PackDict[k]["roleList"][i].entity_text == link_p.entity_text: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) winter_contact.add(per.entity_text) continue for _p in person_phone: if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \ per.entity_text not in agency_contact and _p.entity_text not in agency_phone: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) winter_contact.add(per.entity_text) # 更新org/company实体label为0,1的链接 for link_p in not_sure_linked: for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": if link_p.label == 0: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in agency_contact and per.entity_text not in winter_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in agency_contact and _p.entity_text not in agency_phone and per.entity_text not in winter_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) elif PackDict[k]["roleList"][i].role_name == "agency": if link_p.label == 1: for per in link_p.pointer_person: person_phone = [phone for phone in per.person_phone] if per.person_phone else [] if not person_phone: if per.entity_text not in tenderee_contact and per.entity_text not in winter_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, "")) continue for _p in person_phone: if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact: PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text)) re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、") split_list = [0] * 16 split_dict = { "一、": 1, "二、": 2, "三、": 3, "四、": 4, "五、": 5, "六、": 6, "七、": 7, "八、": 8, "九、": 9, "十、": 10, "十一、": 11, "十二、": 12, "十三、": 13, "十四、": 14, "十五、": 15 } for item in re.finditer(re_split, _content): _index = split_dict.get(item.group()[1:]) if not split_list[_index]: split_list[_index] = item.span()[0] + 1 split_list = [i for i in split_list if i != 0] start = 0 new_split_list = [] for idx in split_list: new_split_list.append((start, idx)) start = idx new_split_list.append((start, len(_content))) # 实体列表按照“公告分段”分组 words_num_dict = dict() last_words_num = 0 for sentence in list_sentence: _index = sentence.sentence_index if _index == 0: words_num_dict[_index] = 0 else: words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num last_words_num = len(sentence.sentence_text) # 公司-联系人连接(km算法) re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' '\+86.?1[3-9]\d{9}|' # '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' '0[1-9]\d{1,2}[-—-―][2-9]\d{6,7}[^\d]?转\d{1,4}|' '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|' '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|' '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|' '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|' '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|' '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6,7}-?\d{,4}|' '400\d{7}转\d{1,4}|' '[2-9]\d{6,7}') key_phone = re.compile("联系方式|电话|联系人|负责人") temporary_list2 = [] for entity in list_entity: # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False: if entity.entity_type in ['org', 'company', 'person']: temporary_list2.append(entity) temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index)) new_temporary_list2 = [] for _split in new_split_list: temp_list = [] for _entity in temporary_list2: if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[ _entity.sentence_index] + _entity.wordOffset_end < _split[1]: temp_list.append(_entity) elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]: break new_temporary_list2.append(temp_list) # print(new_temporary_list2) match_list2 = [] for split_index in range(len(new_temporary_list2)): split_entitys = new_temporary_list2[split_index] if len(split_entitys)<=1: continue is_skip = False for index in range(len(split_entitys)): entity = split_entitys[index] if is_skip: is_skip = False continue else: if entity.entity_type in ['org', 'company']: if entity.label != 5 or entity.entity_text in roleSet: match_nums = 0 for after_index in range(index + 1, min(len(split_entitys), index + 4)): after_entity = split_entitys[after_index] if entity.in_attachment != after_entity.in_attachment: break if after_entity.entity_type in ['person']: distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) # 实体为中标人/候选人,联系人已确定类别【1,2】 if entity.label in [2, 3, 4] and after_entity.label in [1, 2]: break if entity.label in [2, 3, 4] and distance>=20: break # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人 if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]): break # 角色为招标/代理人,排除"纪检|监察"相关的联系人 if entity.label in [0,1] and re.search("纪检|监察",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]): break if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10: if entity.label in [2, 3, 4] and re.search("请.{0,5}联系", list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] + list_sentence[after_entity.sentence_index].sentence_text[0:after_entity.wordOffset_begin]): continue if after_entity.label in [1, 2, 3]: # distance = (tokens_num_dict[ # after_entity.sentence_index] + after_entity.begin_index) - ( # tokens_num_dict[entity.sentence_index] + entity.end_index) sentence_distance = after_entity.sentence_index - entity.sentence_index if sentence_distance == 0: if distance < 100: if entity.label in [2, 3, 4] and distance>40: break if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, after_entity, value)) match_nums += 1 else: if distance < 60: if entity.label in [2, 3, 4] and distance>20: break if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, after_entity, value)) match_nums += 1 if after_entity.entity_type in ['org', 'company']: if entity.label in [2, 3, 4] and after_entity.label in [0, 1]: break # 解决在‘地址’中识别出org/company的问题 # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]: if entity.label != 5 and after_index == index + 1 and ( after_entity.label == entity.label or after_entity.label == 5): distance = (tokens_num_dict[ after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) if distance < 20: after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0, after_entity.begin_index - 10):after_entity.begin_index] after_entity_right = list_sentence[after_entity.sentence_index].tokens[ after_entity.end_index + 1:after_entity.end_index + 6] after_entity_left = "".join(after_entity_left) if len(after_entity_left) > 20: after_entity_left = after_entity_left[-20:] after_entity_right = "".join(after_entity_right)[:10] if re.search("地,?址", after_entity_left): is_skip = True continue if re.search("\(|(", after_entity_left) and re.search("\)|)",after_entity_right): is_skip = True continue if entity.label in [0, 1] and after_entity.label in [0, 1] and entity.label == after_entity.label: break if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[ index + 1].entity_type == "person": break if entity.label in [0, 1 ,5] and after_entity.label in [2, 3, 4]: break if entity.label in [2, 3, 4] and after_entity.label in [0, 1]: break # 搜索没有联系人的电话 mid_tokens = [] is_same_sentence = False if index == len(split_entitys) - 1: for i in range(entity.sentence_index, len(list_sentence)): mid_tokens += list_sentence[i].tokens mid_tokens = mid_tokens[entity.end_index + 1:] mid_sentence = "".join(mid_tokens) have_phone = re.findall(re_phone, mid_sentence) if have_phone: if re.findall(re_phone, mid_sentence.split("。")[0]): is_same_sentence = True _phone = have_phone[0] if _phone in [ent.entity_text for ent in phone_entitys]: phone_begin = mid_sentence.find(_phone) if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \ new_split_list[split_index][1]: mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "") if re.search(key_phone, mid_sentence): if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]): pass else: distance = 1 if is_same_sentence: if phone_begin <= 200: if entity.label in [2,3,4] and phone_begin>80: break value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: if phone_begin <= 60: if entity.label in [2,3,4] and phone_begin>40: break value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: next_entity = split_entitys[index + 1] if next_entity.entity_type in ["org","company"]: _entity_left = list_sentence[next_entity.sentence_index].sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin] _entity_left2 = re.sub(",()\(\)::", "", _entity_left) _entity_left2 = _entity_left2[-5:] if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2): if index + 2<= len(split_entitys) - 1: next_entity = split_entitys[index + 2] if len(_entity_left)<=2 and re.search("[、(\(]",_entity_left): if index + 2 <= len(split_entitys) - 1: next_entity = split_entitys[index + 2] if entity.sentence_index == next_entity.sentence_index: mid_tokens += list_sentence[entity.sentence_index].tokens[ entity.end_index + 1:next_entity.begin_index] else: sentence_index = entity.sentence_index while sentence_index <= next_entity.sentence_index: mid_tokens += list_sentence[sentence_index].tokens sentence_index += 1 mid_tokens = mid_tokens[entity.end_index + 1:-(len( list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1] mid_sentence = "".join(mid_tokens) have_phone = re.findall(re_phone, mid_sentence) if have_phone: if re.findall(re_phone, mid_sentence.split("。")[0]): is_same_sentence = True _phone = have_phone[0] if _phone in [ent.entity_text for ent in phone_entitys]: phone_begin = mid_sentence.find(_phone) mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "") if re.search(key_phone, mid_sentence): p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else [] if next_entity.entity_type == 'person' and _phone in p_phone: pass elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]): pass else: distance = (tokens_num_dict[ next_entity.sentence_index] + next_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) distance = distance / 2 if is_same_sentence: if phone_begin <= 200: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 else: if phone_begin <= 60: value = (-1 / 2 * (distance ** 2)) / 10000 match_list2.append(Match(entity, (entity, _phone), value)) match_nums += 1 # 实体无匹配时,尝试前向查找匹配 if not match_nums: if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0: previous_entity = split_entitys[index - 1] if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]: if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]: continue if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[ previous_entity.sentence_index] + previous_entity.end_index) if distance < 20: # 距离相等时,前向添加处罚值 # distance += 1 # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) match_list2.append(Match(entity, previous_entity, value)) # print(match_list2) # print([(mat.main_role.entity_text,mat.attribute.entity_text if not isinstance(mat.attribute, tuple) else mat.attribute[1]) for mat in match_list2]) match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person] # print(match_list2) # print([(mat.main_role.entity_text,mat.attribute.entity_text if not isinstance(mat.attribute, tuple) else mat.attribute[1]) for mat in match_list2]) # km算法分配求解 result2 = dispatch(match_list2) # print(result2) for match in result2: entity = match[0] # print(entity.entity_text) # print(entity.label) # print(match.attribute) entity_index = list_entity.index(entity) is_update = False if isinstance(match[1], tuple): person_ = '' phone_ = match[1][1].split("/") # 分割多个号码 # print(person_,phone_) else: person_ = match[1].entity_text phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else [] for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name == "tenderee": # if not PackDict[k]["roleList"][i].linklist: if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0: if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact: if not phone_: PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: # if not person_ and len() PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True elif PackDict[k]["roleList"][i].role_name == "agency": # if not PackDict[k]["roleList"][i].linklist: if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact: if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0: if not phone_: PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True else: if PackDict[k]["roleList"][i].entity_text == entity.entity_text: # if not PackDict[k]["roleList"][i].linklist: if len([item for item in PackDict[k]["roleList"][i].linklist if item[1]])==0: # 有联系人但无联系方式(号码) if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \ person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0: if not phone_: PackDict[k]["roleList"][i].linklist.append((person_, "")) for p in phone_: PackDict[k]["roleList"][i].linklist.append((person_, p)) is_update = True if not person_: is_update = False if is_update: # 更新 list_entity if not list_entity[entity_index].pointer_person: list_entity[entity_index].pointer_person = [] list_entity[entity_index].pointer_person.append(match[1]) # print('tenderee_contact',tenderee_contact) # print('tenderee_phone',tenderee_phone) # print('agency_contact',agency_contact) # print('agency_phone',agency_phone) # print('PackDict') # for k in PackDict.keys(): # for i in range(len(PackDict[k]["roleList"])): # print(PackDict[k]["roleList"][i].role_name) # print(PackDict[k]["roleList"][i].entity_text) # print(PackDict[k]["roleList"][i].linklist) linked_person = [] linked_persons_with = [] for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]: if company_entity.pointer_person: for _person in company_entity.pointer_person: linked_person.append(_person) linked_persons_with.append(company_entity) # 一个公司对应多个联系人的补充 person_entitys = [entity for entity in list_entity if entity.entity_type=='person'] person_entitys = person_entitys[::-1] for index in range(len(person_entitys)): entity = person_entitys[index] prepare_link = [] if entity not in linked_person: prepare_link.append(entity) last_person = entity for after_index in range(index + 1, min(len(person_entitys), index + 5)): after_entity = person_entitys[after_index] if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5: if after_entity in linked_person: _index = linked_person.index(after_entity) with_company = linked_persons_with[_index] for i in range(len(PackDict["Project"]["roleList"])): if PackDict["Project"]["roleList"][i].role_name == "tenderee": if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) with_company.pointer_person.append(item) linked_person.append(item) elif PackDict["Project"]["roleList"][i].role_name == "agency": if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) with_company.pointer_person.append(item) linked_person.append(item) else: if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text: for item in prepare_link: person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else [] for _p in person_phone: PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p)) with_company.pointer_person.append(item) linked_person.append(item) break else: prepare_link.append(after_entity) last_person = after_entity continue # 统一同类角色的属性 for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): for _entity in list_entity: if _entity.entity_type in ['org','company']: is_same = False is_similar = False # entity_text相同 if _entity.entity_text==PackDict[k]["roleList"][i].entity_text: is_same = True # entity.label为【0,1】 if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict[k]["roleList"][i].role_name: is_similar = True if is_same: linked_entitys = _entity.linked_entitys if linked_entitys: for linked_entity in linked_entitys: pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else [] for _pointer_person in pointer_person: _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else [] for _p in _phone: if (_pointer_person.entity_text,_p) not in PackDict[k]["roleList"][i].linklist: PackDict[k]["roleList"][i].linklist.append((_pointer_person.entity_text,_p)) elif is_similar: pointer_person = _entity.pointer_person if _entity.pointer_person else [] for _pointer_person in pointer_person: _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else [] for _p in _phone: if (_pointer_person.entity_text, _p) not in PackDict[k]["roleList"][i].linklist: PackDict[k]["roleList"][i].linklist.append( (_pointer_person.entity_text, _p)) # "roleList"中联系人电话去重 tenderee_agency_phone = [] for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name in ['agency','tenderee']: tenderee_agency_phone.extend([person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]]) # 带有联系人的电话 with_person = [person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[0]] # 带有电话的联系人 with_phone = [person_phone[0] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]] remove_list = [] for item in PackDict[k]["roleList"][i].linklist: if not item[0]: if item[1] in with_person: # 删除重复的无联系人电话 remove_list.append(item) elif not item[1]: if item[0] in with_phone: remove_list.append(item) for _item in remove_list: PackDict[k]["roleList"][i].linklist.remove(_item) # 中标候选人联系方式异常排除 for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].role_name in ['win_tenderer', 'second_tenderer','third_tenderer']: if tenderee_agency_phone: remove_list = [] for item in PackDict[k]["roleList"][i].linklist: if item[1] and item[1] in tenderee_agency_phone: remove_list.append(item) for _item in remove_list: PackDict[k]["roleList"][i].linklist.remove(_item) # else: # # 公告中无招标代理联系方式时,可排除中标联系方式 # remove_list = [] # for _item in PackDict[k]["roleList"][i].linklist: # # 有联系方式 # if _item[1]: # remove_list.append(_item) # for _item in remove_list: # PackDict[k]["roleList"][i].linklist.remove(_item) # PackDict更新company/org地址 last_role_prob = {} for ent in pre_entity: if ent.entity_type in ['company','org']: if ent.pointer_address: for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): if PackDict[k]["roleList"][i].entity_text == ent.entity_text: if not PackDict[k]["roleList"][i].address: PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] else: if PackDict[k]["roleList"][i].role_name in ['tenderee','agency']: # 角色为招标/代理人时,取其实体概率高的链接地址作为角色address if ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] > last_role_prob[PackDict[k]["roleList"][i].role_name]: PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] else: if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address): PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text # 联系人——电子邮箱链接 temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])] temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index)) new_temporary_list3 = [] for _split in new_split_list: temp_list = [] for _entity in temporary_list3: if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[ _entity.sentence_index] + _entity.wordOffset_end < _split[1]: temp_list.append(_entity) elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]: break new_temporary_list3.append(temp_list) # print(new_temporary_list3) match_list3 = [] for split_index in range(len(new_temporary_list3)): split_entitys = new_temporary_list3[split_index] for index in range(len(split_entitys)): entity = split_entitys[index] if entity.entity_type == 'person': match_nums = 0 for after_index in range(index + 1, min(len(split_entitys), index + 4)): after_entity = split_entitys[after_index] if match_nums > 2: break if after_entity.entity_type == 'email': distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - ( tokens_num_dict[entity.sentence_index] + entity.end_index) sentence_distance = after_entity.sentence_index - entity.sentence_index if sentence_distance == 0: if distance < 100: if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list3.append(Match(entity, after_entity, value)) match_nums += 1 else: if distance < 60: if (entity.label == 0 and after_entity.label == 1) or ( entity.label == 1 and after_entity.label == 2): distance = distance / 100 value = (-1 / 2 * (distance ** 2)) / 10000 match_list3.append(Match(entity, after_entity, value)) match_nums += 1 # 前向查找匹配 # if not match_nums: if index != 0: previous_entity = split_entitys[index - 1] if previous_entity.entity_type == 'email': if previous_entity.sentence_index == entity.sentence_index: distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - ( tokens_num_dict[ previous_entity.sentence_index] + previous_entity.end_index) if distance < 30: # 距离相等时,前向添加处罚值 # distance += 1 # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) match_list3.append(Match(entity, previous_entity, value)) # print(match_list3) # km算法分配求解 result3 = dispatch(match_list3) for match in result3: match_person = match[0] match_email = match[1] match_person.pointer_email = match_email # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。 # temp_ent_list = [] # 临时列表,记录0,1角色及3联系人 # other_person = [] # 阈值以上的联系人列表 # link_person = [] # 有电话没联系上角色的person列表 # other_ent = [] # link_ent = [] # found_person = False # ent_list = [] # for entity in list_entity: # if entity.entity_type in ['org','company','person']: # ent_list.append(entity) # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']] # #for list_index in range(len(ent_list)): # #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2on_value_person: # if str(entity.label)=="1": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="tenderee": # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # link_person.append(entity.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # # add pointer_person # for _entity in list_entity: # if dict_role_id.get(str(_entity.label))=="tenderee": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee": # _entity.pointer_person = entity # elif str(entity.label)=="2": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="agency": # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone)) # link_person.append(entity.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # # add pointer_person # for _entity in list_entity: # if dict_role_id.get(str(_entity.label))=="agency": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency": # _entity.pointer_person = entity # elif str(entity.label)=="3": # if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人 # continue # #not_link_person.append((entity_after.entity_text,entity_after.person_phone)) # other_person.append(entity.entity_text) # temp_ent_list.append((entity.entity_text,entity.person_phone,entity)) # # #if entity.entity_text in roleSet: # if entity.entity_text in roleSet: # if entity.label in [0,1]: # other_ent.append(entity.entity_text) # temp_ent_list.append((entity.entity_text, entity.label,entity)) # for behind_index in range(index+1, len(ent_list)): # entity_after = ent_list[behind_index] # if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人 # break # if entity_after.values is not None: # if entity_after.entity_type=="person": # if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找 # break # if entity_after.values[entity_after.label]>on_value_person: # if str(entity_after.label)=="1": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="tenderee": # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # elif str(entity_after.label)=="2": # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name=="agency": # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # link_ent.append(PackDict["Project"]["roleList"][i].entity_text) # elif str(entity_after.label)=="3": # if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找 # break # elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止 # break # for pack in PackDict.keys(): # for i in range(len(PackDict[pack]["roleList"])): # if PackDict[pack]["roleList"][i].entity_text==entity.entity_text: # #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0: # #break # PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone)) # link_person.append(entity_after.entity_text) # #add pointer_person # entity.pointer_person = entity_after # # not_link_person = [person for person in other_person if person not in link_person] # not_link_ent = [ent for ent in other_ent if ent not in link_ent] # if len(not_link_person) > 0 and len(not_link_ent) > 0 : # item = temp_ent_list # for i in range(len(item)): # if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item): # if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person: # item[i+1], item[i+2] = item[i+2], item[i+1] # for i in range(len(item)-1, -1, -1): # if item[i][0] in not_link_ent: # for pack in PackDict.keys(): # for role in PackDict[pack]["roleList"]: # if role.entity_text == item[i][0] and len(role.linklist) < 1: # for j in range(i+1, len(item)): # if item[j][0] in not_link_person: # role.linklist.append(item[j][:2]) # #add pointer_person # item[i][2].pointer_person = item[j][2] # break # else: # break # # 电话没有联系人的处理 # role_with_no_phone = [] # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]: # if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人 # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text) # else: # phone_nums = 0 # for link in PackDict["Project"]["roleList"][i].linklist: # if link[1]: # phone_nums += 1 # break # if not phone_nums: # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text) # if role_with_no_phone: # phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"] # # phone_with_person = [phone for phone in phone_with_person if phone] # # dict_index_sentence = {} # for _sentence in list_sentence: # dict_index_sentence[_sentence.sentence_index] = _sentence # new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']] # for index in range(len(new_entity_list)): # entity = new_entity_list[index] # if entity.entity_text in role_with_no_phone: # e_sentence = dict_index_sentence[entity.sentence_index] # entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40] # entity_right = "".join(entity_right) # if index+1-1: # entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)] # have_phone = re.findall(phone,entity_right) # if have_phone: # _phone = have_phone[0] # phone_begin = entity_right.find(_phone) # if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]): # # entity.person_phone = _phone # for i in range(len(PackDict["Project"]["roleList"])): # if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text: # PackDict["Project"]["roleList"][i].linklist.append(('', _phone)) #寻找多标段招标金额 p_entity = len(list_entity)-1 set_tenderer_money = set() list_tenderer_money = [] #2021/7/16 新增列表,倒序保存所有中标金额 unit_list = [] #2021/8/17 新增,保存金额单位 #遍历所有实体 max_prob = 0 # 保存招标金额最大概率 while(p_entity>=0): entity = list_entity[p_entity] if entity.entity_type=="money": # 2021/12/03 添加成本警戒线、保证金 if entity.notes in ['保证金', '成本警戒线']: packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index, "money-" + str(entity.label), MAX_DIS=2, DIRECT="L") if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text if packageName == "Project": # if PackDict["Project"]["tendereeMoney"]=on_value: if str(entity.label)=="1" and entity.notes != '单价': set_tenderer_money.add(float(entity.entity_text)) list_tenderer_money.append(float(entity.entity_text)) # 2021/7/16 新增列表,倒序保存所有中标金额 unit_list.append(entity.money_unit) # if str(entity.label)=="0": if str(entity.label)=="0" and (entity.notes!='总投资' or float(entity.entity_text)<100000000): ''' if p_entity>0: p_before = list_entity[p_entity-1] if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2: p_entity -= 1 continue ''' packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L") if packagePointer is None: packageName = "Project" else: packageName = packagePointer.entity_text if packageName=="Project": # if PackDict["Project"]["tendereeMoney"]on_value: if entity.values[entity.label]>max_prob-0.005: # 选择最大概率招标金额 2024/05/23 相差0.005尽量选前面的 if entity.notes == '单价': PackDict["Project"]["unit_tendereeMoney"] = str(Decimal(entity.entity_text)) else: PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text)) PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit max_prob = entity.values[entity.label] else: if entity.notes == '单价': PackDict[packageName]["unit_tendereeMoney"] = str(Decimal(entity.entity_text)) else: PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text)) PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit #add pointer_tendereeMoney packagePointer.pointer_tendereeMoney = entity p_entity -= 1 '''标段链接包名包号''' pk_name_l = [] pk_code_l = [] count_dic = { 'package': set(), 'name': set(), 'code': set() } def get_sort_dist(l, max_sent_dist=2): ''' 计算标段与其他要素距离,并按距离排序返回字典 :param l: [(entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end)] :param max_sent_dist: 最大句子距离 :return: ''' l.sort(key=lambda x: [x[2],x[3],x[4]]) # 20241204 多个字段排序 修复 561998414 第一标段西铭矿清水泵采购 标段和包名开始位置一样的情况 link_dic = {} i = 1 while i < len(l): ty1, ent1, s1, b1, e1, in_att1 = l[i - 1] ty2, ent2, s2, b2, e2, in_att2 = l[i] if ty1 != ty2 and in_att1 == in_att2 and s2 - s1 <= max_sent_dist: if ty1 == 'package': if ent1 not in link_dic: link_dic[ent1] = [] if s1 == s2: dist = abs(b2 - e1) if b2 > e1 else 0 else: dist = len(list_sentence[s1].sentence_text) - e1 for id in range(s1+1, s2): dist += len(list_sentence[id].sentence_text) dist += b2 if in_att1: dist += 100 # 附件的距离加100 link_dic[ent1].append((s2 - s1, dist, ent2)) elif ty2 == 'package': if ent2 not in link_dic: link_dic[ent2] = [] if s1 == s2: dist = abs(b2 - e1) if b2 > e1 else 0 else: dist = len(list_sentence[s1].sentence_text) - e1 for id in range(s1+1, s2): dist += len(list_sentence[id].sentence_text) dist += b2 if in_att1: dist += 100 # 附件的距离加100 if s1!=s2 or e1!=e2: dist += 30 # 包号在实体后面距离再加30 link_dic[ent2].append((s2 - s1, dist, ent1)) i += 1 return link_dic for entity in list_entity: if entity.entity_type == 'package': pk_name_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment)) pk_code_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment)) count_dic['package'].add(entity.entity_text) elif entity.entity_type == 'name': pk_name_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment)) count_dic['name'].add(entity.entity_text) elif entity.entity_type == 'code': pk_code_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment)) count_dic['code'].add(entity.entity_text) if len(count_dic['package']) > 0: if len(count_dic['name'])>0: link_dic = get_sort_dist(pk_name_l) for k, v in link_dic.items(): v.sort(key=lambda x: [x[0], x[1]]) if v[0][0] < 2 and v[0][1] < 200: # 标段号与包名句子数小于2,字距离小于200的才添加 PackDict[k]["name"] = v[0][2] if len(count_dic['code'])>0: link_dic = get_sort_dist(pk_code_l) for k, v in link_dic.items(): v.sort(key=lambda x: [x[0], x[1]]) if v[0][0] < 2 and v[0][1] < 200: PackDict[k]["code"] = v[0][2] #删除一个机构有多个角色的数据 #删除重复人、概率不回传 final_roleList = [] list_pop = [] set_tenderer_role = set() dict_pack_tenderer_money = dict() for pack in PackDict.keys(): #删除无效包 if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0: list_pop.append(pack) for i in range(len(PackDict[pack]["roleList"])): if PackDict[pack]["roleList"][i].role_name=="win_tenderer": if PackDict[pack]["roleList"][i].money==0: set_tenderer_role.add(PackDict[pack]["roleList"][i]) dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()] #找到包的中投标金额 for _index in range(len(PackageList)): if "hit" in PackageList[_index]: for _hit in list(PackageList[_index]["hit"]): if len(_hit.split("-"))==3: _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None # 补充金额前新增负号‘-’导致错误的规则 elif len(_hit.split("-"))==4: _money = float(_hit.split("-")[2]) if _hit.split("-")[0] == "money" else None else: _money = None if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None: dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money) #只找到一个中标人和中标金额 if len(set_tenderer_money)==1 and len(set_tenderer_role)==1: list(set_tenderer_role)[0].money = list(set_tenderer_money)[0] list(set_tenderer_role)[0].money_unit = unit_list[0] # print('一个中标人一个金额:', list(set_tenderer_money)[0]) #找到一个中标人和多个招标金额 if len(set_tenderer_money)>1 and len(set_tenderer_role)==1: _maxMoney = list(set_tenderer_money)[0] _sumMoney = 0 for _m in list(set_tenderer_money): _sumMoney += _m if _m>_maxMoney: _maxMoney = _m if _sumMoney/_maxMoney==2: list(set_tenderer_role)[0].money = _maxMoney # print('一人多金额分项合计 取最大金额:', _maxMoney) else: # list(set_tenderer_role)[0].money = _maxMoney if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000: list(set_tenderer_role)[0].money = min(list_tenderer_money) list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))] # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money)) else: list(set_tenderer_role)[0].money = list_tenderer_money[-1] # 2021/7/16 修改 不是单价合计方式取第一个中标金额 list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位 # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1]) #每个包都只找到一个金额 _flag_pack_money = True for k,v in dict_pack_tenderer_money.items(): if len(v[1])!=1: _flag_pack_money = False if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()): for k,v in dict_pack_tenderer_money.items(): if float(v[0].unit_price) < float(list(v[1])[0]): # 20241128 金额大于单价时才作链接金额 v[0].money = list(v[1])[0] # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑 for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): if float(PackDict[pack]["tendereeMoney"]) > 0: # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money)) if float(PackDict[pack]["roleList"][i].money) >10000000 and \ float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000: PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000 # print('招标金额校正中标金额') # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别) for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): if float(PackDict[pack]["tendereeMoney"]) > 0 and float(PackDict[pack]["roleList"][i].money) > 0.: if float(PackDict[pack]["roleList"][i].money) < 1000 and \ float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \ float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000: PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) * 10000 # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额 for pack in PackDict.keys(): tmp_moneys = [] for i in range(len(PackDict[pack]["roleList"])): if float(PackDict[pack]["roleList"][i].money) >100000: tmp_moneys.append(float(PackDict[pack]["roleList"][i].money)) if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000: for i in range(len(PackDict[pack]["roleList"])): if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000: PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000 # print('通过其他中标人投标金额校正中标金额') for item in list_pop: PackDict.pop(item) # 公告中只有"招标人"且无"联系人"链接时 if len(PackDict)==1: k = list(PackDict.keys())[0] tenderee_agency_role = [role for role in PackDict[k]["roleList"] if role.role_name in ['tenderee','agency']] if len(tenderee_agency_role)==1: exist_person = [] exist_phone = [] for role in PackDict[k]["roleList"]: for group in role.linklist: if group[0]: exist_person.append(group[0]) if group[1]: exist_phone.append(group[1]) if tenderee_agency_role[0].role_name == "tenderee": if not tenderee_agency_role[0].linklist: get_contacts = False if not get_contacts: # 根据大纲Outline类召回联系人 for outline in list_outline: if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary): for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]: if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[ t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end: if t_person.person_phone: _phone = [p.entity_text for p in t_person.person_phone] for _p in _phone: if t_person.entity_text not in exist_person and _p not in exist_phone: tenderee_agency_role[0].linklist.append((t_person.entity_text, _p)) get_contacts = True break elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \ words_num_dict[outline.sentence_end_index] + outline.wordOffset_end: break if not get_contacts: sentence_phone = phone.findall(outline.outline_text) if sentence_phone: if sentence_phone[0] not in exist_phone: tenderee_agency_role[0].linklist.append(("", sentence_phone[0])) get_contacts = True break if not get_contacts: # 直接取文中倒数第一个联系人 for _entity in temporary_list2[::-1]: if _entity.entity_type=='person' and _entity.label==3: if _entity.person_phone: _phone = [p.entity_text for p in _entity.person_phone] for _p in _phone: if _entity.entity_text not in exist_person and _p not in exist_phone: tenderee_agency_role[0].linklist.append((_entity.entity_text, _p)) get_contacts = True break if not get_contacts: # 如果文中只有一个“phone”实体,则直接取为联系人电话 if len(phone_entitys) == 1: if phone_entitys[0].entity_text not in exist_phone: tenderee_agency_role[0].linklist.append(("", phone_entitys[0].entity_text)) get_contacts = True if not get_contacts: # 通过大纲Outline类直接取电话 if len(new_split_list) > 1: for _start, _end in new_split_list: temp_sentence = _content[_start:_end] sentence_outline = temp_sentence.split(",::")[0] if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline): sentence_phone = phone.findall(temp_sentence) if sentence_phone: if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in exist_phone: tenderee_agency_role[0].linklist.append(("", sentence_phone[0])) get_contacts = True break if not get_contacts: # 通过正则提取句子段落进行提取电话 contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?" tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}" contact_pattern_list = [tenderee_pattern + contacts_person, "(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person, "(?:项目|采购)[^。,]{0,4}" + contacts_person, "(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}" + contacts_person, ] for _pattern in contact_pattern_list: get_tenderee_contacts = False for regular_match in re.finditer(_pattern, _content): match_text = _content[regular_match.end():regular_match.end() + 40] match_text = match_text.split("。")[0] sentence_phone = phone.findall(match_text) if sentence_phone: if sentence_phone[0] not in exist_phone: tenderee_agency_role[0].linklist.append(("", sentence_phone[0])) get_tenderee_contacts = True break if get_tenderee_contacts: break # 如果同一个电话连到了不同的单位就直接去掉(2024-09-03 新增) get_phone_dict = dict() for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): for item in PackDict[k]["roleList"][i].linklist: if item[1]: if item[1] not in get_phone_dict: get_phone_dict[item[1]] = set() get_phone_dict[item[1]].add(PackDict[k]["roleList"][i].entity_text) # print(get_phone_dict) remove_phone = [] for phone,role_list in get_phone_dict.items(): if len(role_list)>1: remove_phone.append(phone) for k in PackDict.keys(): for i in range(len(PackDict[k]["roleList"])): remove_list = [] for item in PackDict[k]["roleList"][i].linklist: if item[1] and item[1] in remove_phone: remove_list.append(item) for _item in remove_list: PackDict[k]["roleList"][i].linklist.remove(_item) for pack in PackDict.keys(): for i in range(len(PackDict[pack]["roleList"])): PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString() return PackDict def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set, main_body_pack): ''' @summary: 根据拿到的roleList和packageSet初始化接口返回的数据 ''' packDict = dict() packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''} for item in list(PackageSet): packDict[item] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''} packDict[item]['in_attachment'] = False if item in main_body_pack else True for item in RoleList: if packDict[item.packageName]["code"] =="": packDict[item.packageName]["code"] = item.packageCode # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位) packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[],set(item.multi_winner)-win_tenderer_set-tenderee_or_agency_set)) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,多中标人) return packDict def getPackageRoleMoney(list_sentence,list_entity,list_outline): ''' @param: list_sentence:文章的句子list list_entity:文章的实体list @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话 ''' # print("=1") theRole = getRoleList(list_sentence,list_entity) if not theRole: return [] # RoleList,RoleSet,PackageList,PackageSet = theRole RoleList,RoleSet,PackageList,PackageSet,win_tenderer_set,tenderee_or_agency_set,main_body_pack = theRole ''' for item in PackageList: # print(item) ''' # PackDict = initPackageAttr(RoleList, PackageSet) PackDict = initPackageAttr(RoleList, PackageSet, win_tenderer_set,tenderee_or_agency_set,main_body_pack) PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline) return PackDict def turnBidWay(bidway): if bidway in ("邀请招标","采购方式:邀请"): return "邀请招标" elif bidway in ("询价","询单","询比","采购方式:询价"): return "询价" elif bidway in ("竞谈","竞争性谈判","公开竞谈"): return "竞争性谈判" elif bidway in ("竞争性磋商","磋商"): return "竞争性磋商" elif bidway in ("竞价","竞标","电子竞价","以电子竞价","电子书面竞投"): return "竞价" elif bidway in ("公开招标","网上电子投标","网上招标","采购方式:公开","招标为其他"): return "公开招标" elif bidway in ("单一来源"): return "单一来源" elif bidway in ("比选"): return "比选" else: return "其他" def turnMoneySource(moneysource): result_list = [] if re.search("自筹|业主筹集|筹资|自有",moneysource): result_list.append("自筹") if re.search("财政",moneysource) and not re.search("非财政",moneysource): result_list.append("财政资金") if re.search("拨款|补助|划拨|拨付|国拨|上级资金",moneysource): result_list.append("上级拨款") if re.search("社会资本|社会资金",moneysource): result_list.append("社会资本") if re.search("贷款|借款|借贷",moneysource): result_list.append("贷款资金") if re.search("债券|债|国债",moneysource): result_list.append("债券资金") if re.search("专项|项目资金",moneysource): result_list.append("项目专项资金") if re.search("配套",moneysource): result_list.append("配套资金") if re.search("外资",moneysource): result_list.append("外资") if re.search("国有资金|国企资金|国资|国家投资",moneysource): result_list.append("国有资金") if re.search("投资|融资",moneysource): result_list.append("投资资金") if re.search("预算(?0 and len(result_list)<5: return ",".join(result_list) else: return "其他资金" my_time_format_pattern = re.compile("((?P20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P\d{1,2}|[一二三四五六七八九十]{1,3}))") from BiddingKG.dl.ratio.re_ratio import getUnifyNum def my_timeFormat(_time,page_time): if page_time: current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp()))) else: current_year = time.strftime("%Y",time.localtime()) all_match = re.finditer(my_time_format_pattern,_time) time_list = [] for _match in all_match: if len(_match.group())>0: legal = True year = "" month = "" day = "" for k,v in _match.groupdict().items(): if k=="year": year = v if k=="month": month = v if k=="day": day = v if year!="": if re.search("^\d+$", year): if len(year) == 2: year = "20" + year if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1: legal = False else: if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1: legal = False else: _year = "" for word in year: if word == '0': _year += word else: _year += str(getDigitsDic(word)) year = _year else: legal = False if month!="": if re.search("^\d+$", month): if int(month) > 12: legal = False else: month = int(getUnifyNum(month)) if month >= 1 and month <= 12: month = str(month) else: legal = False else: legal = False if day!="": if re.search("^\d+$", day): if int(day) > 31: legal = False else: day = int(getUnifyNum(day)) if day >= 1 and day <= 31: day = str(day) else: legal = False else: legal = False if not isValidDate(int(year),int(month),int(day)): legal = False if legal: # 数字字符格式化 year = str(int(year)) month = str(int(month)) day = str(int(day)) time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))) return time_list def getTimeAttributes(list_entity,list_sentence,page_time): # from BiddingKG.dl.interface.htmlparser import get_childs # document_tree = parse_document.tree # new_document_tree = [] # _data_i = -1 # while _data_i < len(document_tree) - 1: # _data_i += 1 # _data = document_tree[_data_i] # _type = _data["type"] # if _type == "sentence": # if _data["sentence_title"] is not None: # new_document_tree.append(_data) # document_tree = new_document_tree time_entitys = [i for i in list_entity if i.entity_type=='time'] time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index)) list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index) dict_time = { "time_release": [], # 1 发布时间 "time_bidopen": [], # 2 开标时间 "time_bidclose": [], # 3 截标时间 'time_bidstart': [], # 12 投标(开始)时间、响应文件接收(开始)时间 'time_publicityStart': [], # 4 公示开始时间(公示时间、公示期) 'time_publicityEnd': [], # 5 公示截止时间 'time_getFileStart': [], # 6 文件获取开始时间(文件获取时间) 'time_getFileEnd': [], # 7 文件获取截止时间 'time_registrationStart': [], # 8 报名开始时间(报名时间) 'time_registrationEnd': [], # 9 报名截止时间 'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间) 'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间 'time_commencement':[] , #13 开工日期 'time_completion': [], # 14 竣工日期 'time_listingStart': [], # 15 挂牌开始日期(挂牌时间) 'time_listingEnd': [], # 16 挂牌结束日期、挂牌截止日期 'time_signContract': [], # 17 合同签订时间 'time_contractStart': [], # 18 合同开始时间 'time_contractEnd': [] # 19 合同结束时间 } dict_time2label = { "time_release": 1, # 1 发布时间 "time_bidopen": 2, # 2 开标时间 "time_bidclose": 3, # 3 截标时间 'time_bidstart': 12, # 12 投标(开始)时间、响应文件接收(开始)时间 'time_publicityStart': 4, # 4 公示开始时间(公示时间、公示期) 'time_publicityEnd': 5, # 5 公示截止时间 'time_getFileStart': 6, # 6 文件获取开始时间(文件获取时间) 'time_getFileEnd': 7, # 7 文件获取截止时间 'time_registrationStart': 8, # 8 报名开始时间(报名时间) 'time_registrationEnd': 9, # 9 报名截止时间 'time_earnestMoneyStart': 10, # 10 保证金递交开始时间(保证金递交时间) 'time_earnestMoneyEnd': 11, # 11 保证金递交截止时间 'time_commencement': 13, # 13 开工日期 'time_completion': 14, # 14 竣工日期 'time_listingStart': 15, # 15 挂牌开始日期(挂牌时间) 'time_listingEnd': 16, # 16 挂牌结束日期、挂牌截止日期 'time_signContract': 17, # 17 合同签订时间 'time_contractStart': 18, # 18 合同开始时间 'time_contractEnd': 19 # 19 合同结束时间 } last_sentence_index = 0 last_time_type = "" last_time_index = { 'time_bidstart':"time_bidclose", 'time_publicityStart':"time_publicityEnd", 'time_getFileStart':"time_getFileEnd", 'time_registrationStart':"time_registrationEnd", 'time_earnestMoneyStart':"time_earnestMoneyEnd", 'time_commencement':"time_completion", 'time_listingStart':"time_listingEnd", 'time_contractStart':"time_contractEnd" } time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys] time_entitys = [item for item in time_entitys if item[1]] # print(time_entitys) for entity_idx in range(len(time_entitys)): entity = time_entitys[entity_idx][0] extract_time = time_entitys[entity_idx][1] sentence_text = list_sentence[entity.sentence_index].sentence_text previous_entity = time_entitys[entity_idx-1][0] if entity_idx!=0 else None previous_extract_time = time_entitys[entity_idx-1][1] if entity_idx!=0 else None next_entity = time_entitys[entity_idx+1][0] if entity_idx!=len(time_entitys)-1 else None next_extract_time = time_entitys[entity_idx+1][1] if entity_idx!=len(time_entitys)-1 else None # 实体有效上下文 entity_context_begin = previous_entity.wordOffset_end if previous_entity and previous_entity.sentence_index==entity.sentence_index else 0 entity_context_end = next_entity.wordOffset_begin if next_entity and next_entity.sentence_index==entity.sentence_index else len(sentence_text) if entity.sentence_index!=last_sentence_index: # sentence_index 不同句子重置last_time_type last_time_type = "" entity_left = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 2):entity.wordOffset_begin] entity_left2 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin] entity_left3 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin] entity_right = sentence_text[entity.wordOffset_end:min(entity.wordOffset_end + 3,entity_context_end)] entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end] entity_right2 = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",'',entity_right2)[:60] # 去除网址 # print(entity.entity_text,entity_right2) label_prob = entity.values[entity.label] entity_text = entity.entity_text in_attachment = entity.in_attachment # extract_time = my_timeFormat(entity_text,page_time) # print(entity_text,entity_left2) if extract_time: definite_time_list = [] t = re.compile("(北京时间)?(?P下午|上午|早上)?(?P\d{1,2})[::时点](?P半)?(?P\d{1,2})?[::分]?(?P\d{2})?秒?") _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text) _entity_text_len = len(_entity_text) _entity_text = _entity_text + sentence_text[entity.wordOffset_end:entity.wordOffset_end+20] t_in_word_num = len(re.findall(t,_entity_text)) # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:])) begin_index = 0 definite_time_idx_list = [] for _num in range(t_in_word_num): if begin_index> _entity_text_len + 8: break t_in_word = re.search(t, _entity_text[begin_index:]) # print(_entity_text[begin_index:]) if t_in_word: if _num==0 and t_in_word.start() > _entity_text_len + 8: break begin_index += t_in_word.end() # print('t_in_word',entity_text,t_in_word.groupdict()) day = t_in_word.groupdict().get('day',"") hour = t_in_word.groupdict().get('hour',"") half_hour = t_in_word.groupdict().get('half_hour',"") minute = t_in_word.groupdict().get('minute',"") second = t_in_word.groupdict().get('second',"") if hour: if day=='下午' and int(hour)<12: hour = str(int(hour)+12) if int(hour)>24: continue else: hour = "00" if not minute: if half_hour: minute = "30" else: minute = "00" if int(minute)>60: continue if not second: second = "00" if int(second)>60: continue definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0")) # print(definite_time) definite_time_list.append(definite_time) definite_time_idx_list.append([begin_index-len(t_in_word.group()),begin_index]) if len(extract_time)==1 and len(definite_time_list)>=2: # 实体只包含一个时间,"2024-12-09 09:00~16:00" 考虑单个时间对应两个详细时间段的识别 # 前两个详细时间的间隔 distance = definite_time_idx_list[1][0] - definite_time_idx_list[0][1] if distance<=8 and int(definite_time_list[1][:2])>=int(definite_time_list[0][:2]): # 判断详细时间都‘小时’顺序从小到大 new_extract_time = [] for d_time in definite_time_list[:2]: if d_time == "24:00:00": # 修正不规范时间表述 d_time = "23:59:59" new_extract_time.append(extract_time[0] + " " + d_time) extract_time = new_extract_time else: if definite_time_list[0] == "24:00:00": # 修正不规范时间表述 definite_time_list[0] = "23:59:59" if definite_time_list[0] != "00:00:00": extract_time[0] = extract_time[0] + " " + definite_time_list[0] else: min_len = min(len(extract_time),len(definite_time_list)) for i in range(min_len): if definite_time_list[i] == "24:00:00": # 修正不规范时间表述 definite_time_list[i] = "23:59:59" if definite_time_list[i] != "00:00:00": extract_time[i] = extract_time[i] + " " + definite_time_list[i] if extract_time: # 时间变更prob优化 if re.search("原",entity_left2): last_index = 0 for item in re.finditer("原",entity_left2): last_index = item.start() + 1 label_prob = label_prob - 0.2 * last_index / len(entity_left2) # print('prob优化',label_prob,extract_time) elif re.search("改正|更正|修正|更改|延期",entity_left2): new_label = dict_time2label.get(last_time_type,None) if new_label and entity.label==0: entity.label = new_label label_prob = 1 # 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间 if entity.label in [2,3,9]: if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3): dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment)) if entity.label==3 and re.search("开标|(评审|比选).{,2}(?:开始)?(时间|日期)|选取.{,2}(时间|日期)",entity_left3): dict_time['time_bidopen'].append((extract_time[0], label_prob-0.1, in_attachment)) if entity.label==3 and re.search("报名",entity_left3): dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment)) if entity.label==3 and re.search("获取",entity_left3[-20:]): dict_time['time_getFileEnd'].append((extract_time[0], 0.45, in_attachment)) if entity.label==9 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3): dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment)) if entity.label in [11, 3]: if entity.label==11 and re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3): dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment)) if entity.label==3 and re.search("保证金.{,2}(接受|收取)|(接受|收取).{,2}保证金",entity_left3): dict_time['time_earnestMoneyEnd'].append((extract_time[0], 0.5, in_attachment)) if entity.label in [6, 7]: if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3): dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment)) if entity.label==0: if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3): if len(extract_time)>=2: dict_time['time_bidstart'].append((extract_time[0], 0.45, in_attachment)) dict_time['time_bidclose'].append((extract_time[1], 0.45, in_attachment)) else: dict_time['time_bidclose'].append((extract_time[0], 0.45, in_attachment)) if entity.label==6: # "文件获取时间"和"报名时间"并列 if re.search("报名",entity_left3): if len(extract_time)==1: dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) else: dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment)) # 获取文件/报名/报价 时间补充(上下文表达过长无法通过模型识别) # if entity.label == 0: # if re.search("(获取|领取|售卖|出售|购买|下载).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_left3): # if len(extract_time)==2: # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment)) # dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment)) # else: # if next_entity and next_entity.sentence_index==entity.sentence_index: # mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin] # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1: # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment)) # dict_time['time_getFileEnd'].append((next_extract_time[0], 0.51, in_attachment)) # if not dict_time['time_getFileEnd']: # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]): # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment)) # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]): # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment)) # if re.search("(进行|在线|线下|线上|网上).{,2}报名|报名.{,2}(开始)?(时间|日期)", entity_left3): # if len(extract_time)==2: # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) # dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment)) # else: # if next_entity and next_entity.sentence_index==entity.sentence_index: # mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin] # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1: # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) # dict_time['time_registrationEnd'].append((next_extract_time[0], 0.51, in_attachment)) # if not dict_time['time_registrationEnd']: # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]): # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment)) # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]): # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) # # if re.search("(获取|售卖|出售|购买).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_right2): # if len(extract_time)==2: # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment)) # dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment)) # else: # if previous_entity and previous_entity.sentence_index==entity.sentence_index: # mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin] # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1: # dict_time['time_getFileStart'].append((previous_extract_time[0], 0.51, in_attachment)) # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment)) # if not dict_time['time_getFileEnd']: # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]): # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment)) # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]): # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment)) # if re.search("(进行|在线|线下).{,2}报名", entity_right2): # if len(extract_time) == 2: # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) # dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment)) # else: # if previous_entity and previous_entity.sentence_index==entity.sentence_index: # mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin] # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1: # dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment)) # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment)) # if not dict_time['time_registrationEnd']: # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]): # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment)) # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]): # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) # if re.search("(进行|开始).{,4}(报价|投标|竞价)", entity_right2): # if len(extract_time) == 2: # dict_time['time_bidstart'].append((extract_time[0], 0.51, in_attachment)) # # dict_time['time_bidclose'].append((extract_time[1], 0.51, in_attachment)) # 补充公告末尾处的发布时间 if entity.label==0: if entity.is_tail: entity.label = 1 entity.values[1] = 0.5 dict_time['time_release'].append((extract_time[0], 0.5, in_attachment)) # 2022/12/12 新增挂牌时间正则 if re.search("挂牌.{,4}(?:时间|日期)",entity_left2): if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2: if len(extract_time) == 1: if re.search("挂牌.?(开始|起始).?(?:时间|日期)",entity_left2): dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment)) last_time_type = 'time_listingStart' elif re.search("挂牌.?(截[止至]|结束).?(?:时间|日期)",entity_left2): dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment)) last_time_type = 'time_listingEnd' elif re.search("挂牌.?(?:时间|日期)",entity_left2): if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment)) last_time_type = 'time_listingEnd' else: dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment)) last_time_type = 'time_listingStart' else: dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment)) dict_time['time_listingEnd'].append((extract_time[1], 0.5, in_attachment)) last_time_type = '' last_sentence_index = entity.sentence_index continue # 2023/9/13 新增合同相关时间 if re.search("合同|服务|履[约行]", entity_left3[-15:]): if len(extract_time) == 1: if re.search("(合同.{,2}签[订定署].{,2}|签[订定署].{,2}合同.{,2})(?:时间|日期)|合同签[订定署].{,1}$", entity_left2): dict_time['time_signContract'].append((extract_time[0], 0.5, in_attachment)) last_time_type = 'time_signContract' last_sentence_index = entity.sentence_index continue elif re.search("(合同|服务|履约|(合同|服务)履行).{,4}(?:起始|开始)(?:时间|日期)", entity_left3[-15:]): dict_time['time_contractStart'].append((extract_time[0], 0.55, in_attachment)) last_time_type = 'time_contractStart' last_sentence_index = entity.sentence_index continue elif re.search("(合同|服务|履约).{,2}(?:完成|截止|结束)(?:时间|日期|时限)", entity_left2): dict_time['time_contractEnd'].append((extract_time[0], 0.55, in_attachment)) last_time_type = 'time_contractEnd' last_sentence_index = entity.sentence_index continue elif re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2): if re.search("到|至|截[至止]",entity_left) or re.search("前|止|截止",entity_right) or re.search("前",entity_text[-2:]): dict_time['time_contractEnd'].append((extract_time[0], 0.5, in_attachment)) last_time_type = 'time_contractEnd' else: dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment)) last_time_type = 'time_contractStart' last_sentence_index = entity.sentence_index continue else: if re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2): # 排除开始和借宿时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日” if extract_time[0]!=extract_time[1]: dict_time['time_contractStart'].append((extract_time[0], 0.6, in_attachment)) dict_time['time_contractEnd'].append((extract_time[1], 0.6, in_attachment)) last_time_type = '' last_sentence_index = entity.sentence_index continue # 服务期限表达补充 if entity.label==0: re_service = '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \ '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \ '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)(时间|日期)|交付\(服务、完工\)(时间|日期)' \ '|交货(时间|日期)|工期承诺|(服务|合同|施工|实施|工程|设计)的?(年限|期限|周期|期:)' \ '|服务期限为|计划工期|工期要求|服务期限|服务期' \ '|投标工期|设计工期|合格服务周期|总工期|服务(时间|日期)(范围)?|流转期限|维护期限|服务时限|交货期' \ '|完成(时间|日期)|服务期限|中标工期|项目周期|期限要求|供货期|合同履行日期|计划的?周期' \ '|履约期限|合同约定完成时限|合同完成日期|承诺完成日期' \ '|合同起始日起|合同履约期|履约截止日期|承包期限|合同完成日期' \ '|服务期间|服务履行期|委托(管理)?期限|履约期限、地点等简要信息' if len(extract_time)==2: if re.search(re_service,entity_left2) or re.search("履约期限、地点等简要信息",entity_left3[-20:]): dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment)) dict_time['time_contractEnd'].append((extract_time[1], 0.5, in_attachment)) last_time_type = '' # 报价/投标时间补充(规则补充) if entity.label == 0: if re.search("[报竞]价.{,2}(开始|起始).{,2}(时间|日期)",entity_left2): entity.label = 12 label_prob = 0.8 elif re.search("[报竞]价.{,2}起止.{,2}(时间|日期)",entity_left2): entity.label = 12 label_prob = 0.6 elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)[::]",entity_left2): entity.label = 3 label_prob = 0.501 elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)",entity_left2) and not re.search("截[止至]",entity_left2): entity.label = 12 label_prob = 0.51 elif re.search("[报竞]价.{,2}截[止至].{,2}(时间|日期)",entity_left2): entity.label = 3 label_prob = 0.8 elif re.search("(竞价|报价).?(时间|日期)",entity_left2): entity.label = 12 label_prob = 0.51 elif re.search("(竞价|报价).?(时间|日期)",entity_left3) and re.search("参与|报价|有意",entity_left2): entity.label = 12 label_prob = 0.501 # 文档结构补充 # if entity.label == 0: # re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|" # "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)") # _data_i = -1 # while _data_i < len(document_tree) - 1: # _data_i += 1 # _data = document_tree[_data_i] # _type = _data["type"] # _text = _data["text"].strip() # childs = get_childs([_data]) # last_child = childs[-1] # if entity.sentence_index>=_data.sentence_index and entity.wordOffset_begin>=_data.wordOffset_begin and # (): # if re.search(re_registration, re.split("[::;;,]", _text)[0][:20]) is not None: # # content_text = "" # for c in childs: # content_text += c["text"] + "" # print('concat_text', content_text) if re.search(",(完成|截止|结束)(时间|日期)", entity_left2[-8:]) and entity.label==0: if entity.sentence_index == last_sentence_index: time_type = last_time_index.get(last_time_type) if time_type: dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment)) last_time_type = "" last_sentence_index = entity.sentence_index continue if re.search("至|到|[日\d][-—]$|[~~]", entity_left): if entity.sentence_index == last_sentence_index: time_type = last_time_index.get(last_time_type) if time_type: dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment)) last_time_type = "" last_sentence_index = entity.sentence_index continue if entity.label!=0: if entity.label==1 and label_prob>0.5: dict_time['time_release'].append((extract_time[0],label_prob,in_attachment)) last_time_type = 'time_release' elif entity.label==2 and label_prob>0.5: dict_time['time_bidopen'].append((extract_time[0],label_prob,in_attachment)) last_time_type = 'time_bidopen' elif entity.label==3 and label_prob>0.5: if len(extract_time)==1: dict_time['time_bidclose'].append((extract_time[0],label_prob,in_attachment)) last_time_type = 'time_bidclose' elif len(extract_time)==2: dict_time['time_bidstart'].append((extract_time[0], 0.6, in_attachment)) dict_time['time_bidclose'].append((extract_time[1], label_prob, in_attachment)) last_time_type = 'time_bidclose' elif entity.label==12 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_bidclose'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_bidclose' else: dict_time['time_bidstart'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_bidstart' else: dict_time['time_bidstart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_bidclose'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==4 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_publicityEnd'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_publicityEnd' else: dict_time['time_publicityStart'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_publicityStart' else: dict_time['time_publicityStart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_publicityEnd'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==5 and label_prob>0.5: if len(extract_time)==1: dict_time['time_publicityEnd'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_publicityEnd' else: dict_time['time_publicityStart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_publicityEnd'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==6 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_getFileEnd'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_getFileEnd' else: dict_time['time_getFileStart'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_getFileStart' else: dict_time['time_getFileStart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_getFileEnd'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==7 and label_prob>0.5: if len(extract_time)==1: dict_time['time_getFileEnd'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_getFileEnd' else: dict_time['time_getFileStart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_getFileEnd'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==8 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_registrationEnd'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_registrationEnd' else: dict_time['time_registrationStart'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_registrationStart' else: dict_time['time_registrationStart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_registrationEnd'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==9 and label_prob>0.5: if len(extract_time)==1: dict_time['time_registrationEnd'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_registrationEnd' else: dict_time['time_registrationStart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_registrationEnd'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==10 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_earnestMoneyEnd' else: dict_time['time_earnestMoneyStart'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_earnestMoneyStart' else: dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==11 and label_prob>0.5: if len(extract_time)==1: dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_earnestMoneyEnd' else: dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==13 and label_prob>0.5: if len(extract_time)==1: if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]): dict_time['time_completion'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_completion' else: dict_time['time_commencement'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_commencement' else: dict_time['time_commencement'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_completion'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' elif entity.label==14 and label_prob>0.5: if len(extract_time)==1: dict_time['time_completion'].append((extract_time[0], label_prob,in_attachment)) last_time_type = 'time_completion' else: dict_time['time_commencement'].append((extract_time[0],label_prob,in_attachment)) dict_time['time_completion'].append((extract_time[1],label_prob,in_attachment)) last_time_type = '' else: last_time_type = "" else: last_time_type = "" else: last_time_type = "" last_sentence_index = entity.sentence_index # 通过文档分析树形结构补充部分时间实体 def add_time_by_parseDocument(dict_time,parse_document): from BiddingKG.dl.interface.htmlparser import get_childs document_tree = parse_document.tree # if not dict_time['time_getFileStart'] or not dict_time['time_getFileEnd']: # time_pattern = re.compile("") concat_text_list = [] if not dict_time['time_registrationStart'] or not dict_time['time_registrationEnd']: re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|" "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)") _data_i = -1 while _data_i < len(document_tree) - 1: _data_i += 1 _data = document_tree[_data_i] _type = _data["type"] _text = _data["text"].strip() # print(_data.keys()) if _type == "sentence": print('_text:',_text,_data["sentence_title"]) if _data["sentence_title"] is not None: print("aptitude_pattern", _text) print(_data['sentence_index'],_data['wordOffset_begin'],_data['wordOffset_end']) if re.search(re_registration, re.split("[::;;。]",_text)[0][:15]) is not None: childs = get_childs([_data]) concat_text = "" for c in childs: concat_text += c["text"] + "" print('concat_text',concat_text) concat_text_list.append(concat_text) _data_i += len(childs)-1 # if _type == "table": # list_table = _data["list_table"] # parent_title = _data["parent_title"] # if list_table is not None: # for line in list_table[:2]: # for cell_i in range(len(line)): # cell = line[cell_i] # cell_text = cell[0] # if len(cell_text) > 120 and re.search(re_registration, cell_text) is not None: # concat_text += cell_text + "\n" print('_text',concat_text_list) for text in concat_text_list: time_list = re.finditer(my_time_format_pattern,text) time_list = [(i,my_timeFormat(i.group(),page_time)) for i in time_list] for time_idx in range(len(time_list)): _time = time_list[time_idx][0] extract_time = time_list[time_idx][1] entity_left = text[:_time.start()] entity_left = re.split("[。;;!??]",entity_left)[-1] # entity_left2 = sentence_text[ # max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin] # entity_left3 = sentence_text[ # max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin] entity_right = text[_time.end():] entity_right = re.split("[。;;!??]",entity_right)[0] # entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end] entity_right2 = re.sub(r"(http[s]?://)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F])){6,}", '', entity_right)[:60] # 去除网址 print('entity_right2',entity_right2) if re.search("(进行|在线|线下).{,2}报名", entity_right2): print('报名text',entity_right2) if len(extract_time) == 2: dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment)) else: if previous_entity and previous_entity.sentence_index==entity.sentence_index: mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin] if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1: dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment)) dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment)) if not dict_time['time_registrationEnd']: if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]): dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment)) elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]): dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment)) return dict_time # dict_time = add_time_by_parseDocument(dict_time,parse_document) # print(dict_time) result_dict = dict((key,"") for key in dict_time.keys()) for time_type,value in dict_time.items(): list_time = dict_time[time_type] if list_time: for in_attachment in [False,True]: _list_time = [_time for _time in list_time if _time[2]==in_attachment] if _list_time: _list_time.sort(key=lambda x:(x[1],len(x[0])),reverse=True) # sort_key: label_prob,时间文本长度(优先有具体时分秒的) if in_attachment==True and len(result_dict[time_type])>0: break result_dict[time_type] = _list_time[0][0] # result_dict 纠错 if not result_dict['time_bidclose']: if result_dict['time_bidstart']: # 无截标时间,投标开始和开标时间一样 if result_dict['time_bidstart'][:10] in result_dict['time_bidopen']: result_dict['time_bidstart'] = "" result_dict['time_bidclose'] = result_dict['time_bidopen'] if not result_dict['time_bidclose']: if result_dict['time_getFileEnd']: # 无截标时间,获取文件截止时间和开标时间一样 if result_dict['time_getFileEnd'][:10] in result_dict['time_bidopen']: result_dict['time_bidclose'] = result_dict['time_bidopen'] else: if result_dict['time_bidopen']: # 截标时间 和 开标时间 时分秒互补 if len(result_dict['time_bidclose'])len(result_dict['time_bidopen']) and result_dict['time_bidopen'] in result_dict['time_bidclose']: result_dict['time_bidopen'] = result_dict['time_bidclose'] return result_dict def get_days_between(day1,day2,get_abs=0): ''' :param day1: 较小日期 :param day2: 较大日期 :param get_abs: 是否取绝对值 :return: 天数差 ''' # 将日期字符串转换为datetime对象 date1 = datetime.strptime(day1, '%Y-%m-%d') date2 = datetime.strptime(day2, '%Y-%m-%d') # 计算日期差 delta = date2 - date1 # 获取天数差 days_difference = delta.days if get_abs: return abs(days_difference) else: return days_difference def extract_serviceTime(service_time,page_time): pattern1 = re.compile("\d{4}[年\-./]\d{1,2}[月\-./]\d{1,2}日?") pattern2 = re.compile("\d+(?:\.\d+)?[((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])") pattern3 = re.compile("\d{4}[年\-./]\d{1,2}月?") pattern4 = re.compile("(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?") DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9, "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9, "两":2, '貮': 2} def get_month_days(year, month): # calendar.monthrange(year, month)返回一个元组,其中第一个元素是月份的第一天是星期几(0-6为星期一到星期日), # 第二个元素是该月的天数。 _, last_day = calendar.monthrange(year, month) return last_day def get_num(text): CN_UNIT = {'十': 10,'拾': 10,'百': 100, '佰': 100,'千': 1000,'仟': 1000} regex = re.compile(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+') text = regex.search(text) if text: text = text.group() else: return "" result = 0 result_list = [] unit = 0 control = 0 for i, d in enumerate(text): if d in '零百佰千仟' and i == 0: return "" if d in DigitsDic: result += DigitsDic[d] elif d in CN_UNIT: if unit == 0: unit_1 = CN_UNIT[d] # 这里的处理主要是考虑到类似于二十三亿五千万这种数 if result == 0: result = CN_UNIT[d] else: result *= CN_UNIT[d] unit = CN_UNIT[d] result_1 = result elif unit > CN_UNIT[d]: result -= DigitsDic[text[i - 1]] result += DigitsDic[text[i - 1]] * CN_UNIT[d] unit = CN_UNIT[d] elif unit <= CN_UNIT[d]: if (CN_UNIT[d] < unit_1) and (len(result_list) == control): result_list.append(result_1) result = (result - result_1) * CN_UNIT[d] control += 1 else: result *= CN_UNIT[d] unit = CN_UNIT[d] if len(result_list) == control: unit_1 = unit result_1 = result else: return "" return sum(result_list) + result serviceTime_dict = {"service_start": "", "service_end": "", "service_days": 0} re_num = re.findall(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+',service_time) for _num in re_num: if not re.search("[十拾百佰千仟]",_num): num = "" for word in _num: num += str(DigitsDic.get(word,word)) service_time = service_time.replace(_num,num,1) else: num = str(get_num(_num)) service_time = service_time.replace(_num,num,1) end_time = "" service_days = 0 re_page_time = re.search("20\d{2}-\d{2}-\d{2}", page_time) page_time = re_page_time.group() if re_page_time else "2000-01-01" # page_time为空时默认值为2000-01-01 if re.search(pattern1,service_time): # end_time = re.findall(pattern1,service_time)[-1] time_list = [] for _time in re.findall(pattern1,service_time): _time = re.sub("日","",_time) _time = re.sub("[年月./]","-",_time) _year,_month,_day = _time.split("-") _month = int(_month) _day = int(_day) _year = int(_year) if _year>2050 or _year<=2000 or _month>12 or _month<=0 or _day<=0 or _day>31: service_days = 0 else: if isValidDate(_year,_month,_day): _time = str(_year)+'-'+str(_month)+'-'+str(_day) _time = _time.split("-")[0] + '-' + _time.split("-")[1].rjust(2,"0") + '-' + _time.split("-")[2].rjust(2,"0") time_list.append(_time) if len(time_list)>=2: if get_days_between(page_time,time_list[1])>1 and get_days_between(time_list[0],time_list[1])>0: serviceTime_dict['service_end'] = time_list[1] serviceTime_dict['service_start'] = time_list[0] elif len(time_list)==1: if get_days_between(page_time, time_list[0]) > 1: serviceTime_dict['service_end'] = time_list[0] # service_days = (time.mktime(time.strptime(end_time,"%Y-%m-%d"))-page_timestamp)/(24*60*60) elif re.search(pattern3,service_time): time_list = [] # end_time = re.findall(pattern3,service_time)[-1] for _time in re.findall(pattern3,service_time): _time = re.sub("月","",_time) _time = re.sub("[年./]","-",_time) _year,_month = _time.split("-") _day = 0 _month = int(_month) _year = int(_year) if _year>2050 or _year<=2000 or _month>12 or _month<=0: service_days = 0 else: _day = get_month_days(_year,_month) if isValidDate(_year, _month, _day): _time = str(_year)+'-'+str(_month)+'-'+str(_day) _time = _time.split("-")[0] + '-' + _time.split("-")[1].rjust(2,"0") + '-' + _time.split("-")[2].rjust(2,"0") time_list.append(_time) if len(time_list) >= 2: if get_days_between(page_time, time_list[1]) > 1 and get_days_between(time_list[0], time_list[1]) > 0: serviceTime_dict['service_end'] = time_list[1] serviceTime_dict['service_start'] = time_list[0] elif len(time_list)==1: if get_days_between(page_time, time_list[0]) > 1: serviceTime_dict['service_end'] = time_list[0] # service_days = (time.mktime(time.strptime(end_time,"%Y-%m-%d"))-page_timestamp)/(24*60*60) elif re.search(pattern2,service_time) or re.search(pattern4,service_time): for pattern in [pattern2,pattern4]: unit = 1 match = re.findall(pattern,service_time) if len(set(match))==1: match_text = match[0] if "月" in match_text: unit = 30 elif "年" in match_text: unit = 365 elif "周" in match_text or "星期" in match_text: unit = 7 match_num = float(re.search("\d+",match_text).group()) # 数字能被365整除,单位更正为天 if int(match_num)%365==0: unit = 1 if unit==365: if match_num>10:#单位为'年'时,排除数字过大的 match_num = 0 elif unit==30: if match_num>60:#单位为'月'时,排除数字过大的 match_num = 0 elif unit==1: if match_num>4000:#单位为'日'时,排除数字过大的 match_num = 0 service_days = int(match_num * unit) if service_days % 360==0: service_days = service_days / 360 * 365 elif service_days % 180==0 and service_days % 360!=0: service_days = service_days // 360 * 365 + 180 service_days = int(service_days) if service_days <= 1 and service_days > 4000: service_days = 0 if service_days>3: # service_days = str(service_days) + "天" serviceTime_dict['service_days'] = service_days break elif "半年" in service_time: service_days = 180 # service_days = str(service_days) + "天" serviceTime_dict['service_days'] = service_days if serviceTime_dict['service_start'] and serviceTime_dict['service_end']: service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end']) serviceTime_dict['service_days'] = service_days return serviceTime_dict def getServiceTime(): pass def getOtherAttributes(list_entity,page_time,prem,channel_dic): dict_other = {"moneysource":"", "person_review":[], "serviceTime":"", "product":[], "total_tendereeMoney":0, "total_tendereeMoneyUnit":''} list_serviceTime = [] last_moneysource_prob = 0 for entity in list_entity: if entity.entity_type == 'bidway': dict_other["bidway"] = turnBidWay(entity.entity_text) elif entity.entity_type=='moneysource': if dict_other["moneysource"] and entity.in_attachment: continue if not dict_other["moneysource"]: dict_other["moneysource"] = entity.entity_text last_moneysource_prob = entity.prob elif entity.prob>last_moneysource_prob: dict_other["moneysource"] = entity.entity_text last_moneysource_prob = entity.prob elif entity.entity_type=='serviceTime': # print(entity.entity_text) # if list_serviceTime and entity.in_attachment: # continue if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[-./]\d{1,2}", entity.entity_text): list_serviceTime.append(entity) elif entity.entity_type=="person" and entity.label ==4 and entity.entity_text not in dict_other["person_review"]: # 20240624评审专家去重 dict_other["person_review"].append(entity.entity_text) elif entity.entity_type=='product' and entity.entity_text not in dict_other["product"]: #顺序去重保留 dict_other["product"].append(entity.entity_text) elif entity.entity_type=='money' and entity.notes=='总投资' and float(dict_other["total_tendereeMoney"])0: serviceTime_dict['service_start'] = time_contractStart # print([i.entity_text for i in list_serviceTime]) if list_serviceTime and not serviceTime_dict['service_end']: list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1] list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0] error_serviceTime = [] for list_time in [list_serviceTime,list_serviceTime_inAtt]: # if not dict_other["serviceTime"]: if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']: list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True) for _serviceTime in list_time: # 优先取具体时间(20XX年x月x日-20XX年x月x日) if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text): _extract_time = my_timeFormat(_serviceTime.entity_text,page_time) if _extract_time and len(_extract_time)==2: # 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日” if _extract_time[0]!=_extract_time[1]: # dict_other["serviceTime"] = _serviceTime.entity_text # extract_time = extract_serviceTime(_serviceTime.entity_text) # if extract_time['service_end']: serviceTime_dict['service_start'] = _extract_time[0] serviceTime_dict['service_end'] = _extract_time[1] break else: error_serviceTime.append(_serviceTime.entity_text) # if not dict_other["serviceTime"]: if not serviceTime_dict['service_end']: for _serviceTime in list_time: # 优先取具体时间(20XX年x月-20XX年x月) if re.search("20\d{2}[年/.\-]\d{1,2}月?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,3}20\d{2}[年/.\-]\d{1,2}月?", _serviceTime.entity_text): # dict_other["serviceTime"] = _serviceTime.entity_text extract_time = extract_serviceTime(_serviceTime.entity_text,page_time) if extract_time['service_end']: serviceTime_dict = extract_time break # if not dict_other["serviceTime"]: if not serviceTime_dict['service_end']: for _serviceTime in list_time: # 优先取具体时间(20XX年x月x日) if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text): if _serviceTime.entity_text not in error_serviceTime: # dict_other["serviceTime"] = _serviceTime.entity_text extract_time = extract_serviceTime(_serviceTime.entity_text,page_time) if extract_time['service_end']: serviceTime_dict = extract_time break # if not dict_other["serviceTime"]: if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']: for _serviceTime in list_time: if _serviceTime.entity_text not in error_serviceTime: # dict_other["serviceTime"] = _serviceTime.entity_text extract_time = extract_serviceTime(_serviceTime.entity_text,page_time) if extract_time['service_end'] or extract_time['service_days']: serviceTime_dict = extract_time break if serviceTime_dict['service_start'] and serviceTime_dict['service_end']: service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end']) serviceTime_dict['service_days'] = service_days dict_other["serviceTime"] = serviceTime_dict if not time_contractEnd and channel_dic['docchannel']['docchannel']=='合同公告': # 用serviceTime补充合同开始结束时间,公告类型为合同公告 if serviceTime_dict['service_start'] and serviceTime_dict['service_end']: prem[0]["time_contractStart"] = serviceTime_dict['service_start'] prem[0]["time_contractEnd"] = serviceTime_dict['service_end'] if dict_other['moneysource']: dict_other['moneysource'] = turnMoneySource(dict_other['moneysource']) # dict_other["product"] = list(set(dict_other["product"])) # 已在添加时 顺序去重保留 return dict_other def getMoneyRange(RoleList): pass def getProjectContacts(list_entity, list_sentence): # project_contacts "项目联系人"提取 temp_person_entitys = [ent for ent in list_entity if ent.entity_type=='person' and ent.label in [1,2,3]] temp_person_entitys = sorted(temp_person_entitys,key=lambda x:(x.sentence_index,x.wordOffset_begin)) project_contacts_patterns = ['项目.?联系[人方]','项目.?联系.?方式', '项目.?负责人'] project_contacts_patterns_prob = [0.9, 0.85, 0.8] project_contacts_patterns_res = [] for ent in temp_person_entitys: sent_idx = ent.sentence_index word_begin = ent.wordOffset_begin # word_end = ent.wordOffset_end in_att = ent.in_attachment if word_begin >= 5: # > len('项目联系人') left_text = list_sentence[sent_idx].sentence_text[max(0, word_begin - 15):word_begin] # print('left_text', left_text) for pattern, prob in zip(project_contacts_patterns, project_contacts_patterns_prob): if re.search(pattern, left_text): project_contacts_patterns_res.append([ent, sent_idx, word_begin, prob if not in_att else prob / 2]) project_contacts_patterns_res = sorted(project_contacts_patterns_res, key=lambda x: (x[3], -x[1], -x[2]), reverse=True) # print('project_contacts_patterns_res', project_contacts_patterns_res) project_contacts_list = [] phone_set = set() have_in_text = False if project_contacts_patterns_res: for item in project_contacts_patterns_res: in_att = item[0].in_attachment contacts_person = item[0].entity_text contacts_phone = item[0].person_phone[0].entity_text if item[0].person_phone else "" if contacts_phone: if not in_att: have_in_text = True if in_att and have_in_text: # 正文已提取,则排除附件的 break if contacts_phone not in phone_set: phone_set.add(contacts_phone) project_contacts_list.append([contacts_person,contacts_phone]) return {'project_contacts':project_contacts_list} def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time): ''' @param: list_sentence:所有文章的句子list list_entity:所有文章的实体list @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话 ''' result = [] for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines): RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline) result.append(dict({"prem": RoleList, "docid": list_article.doc_id}, **getTimeAttributes(list_entity, list_sentence,page_time), **getProjectContacts(list_entity, list_sentence), **{"fingerprint": list_article.fingerprint, "match_enterprise": list_article.match_enterprise, "match_enterprise_type": list_article.match_enterprise_type, "process_time": getCurrent_date(), "attachmentTypes": list_article.attachmentTypes, "bidway": list_article.bidway})) # result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence), # **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise, # "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(), # "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway})) return result def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修改为 中标金额小于表格单价数量合计总金额十分之一时替换 ''' 最后根据表格提取的单价数量合计对比更新中标金额,或中标金额为0全文只有一个总价或合计时,作为中标金额 :param prem: 列表 :param total_product_money: 表格统计金额 :param list_articles: 文章对象 :return: ''' if '##attachment##' in list_articles[0].content: content, attachment = list_articles[0].content.split('##attachment##') if len(content) < 200: content += attachment else: content = list_articles[0].content if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同|投标))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的 if total_product_money>0 and total_product_money<5000000000: for value in prem[0]['prem'].values(): ree_money = float(value['tendereeMoney']) for l in value['roleList']: try: # if l[0] == 'win_tenderer' and float(l[2])合计((万?元))?:)(?P[\d,.]+(万?元)?)', content) if len(re.findall('合计', content)) == 1 else re.search('(?P
总价((万?元))?:)(?P[\d,.]+(万?元)?)', content) if ser: money_text = ser.group('money') header = ser.group('header') money, money_unit = money_process(money_text, header) if 100 10000000000: # 工期小于180天且金额大于百亿的,错误 l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000) # print('工期纠正百亿以上金额 ') elif float(l["role_money"]['money']) > maximum_amount: flag = 1 for money in moneys: if float(l["role_money"]['money'])/money == 10000 and l['role_money']['money_unit'] == '万元': l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000) # print('万倍关系纠正连接金额') flag = 0 break if flag and l["role_money"]['money_unit'] == '万元' or re.search('^\d{11,}(\.0)?$', str(l["role_money"]['money'])): l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000) # print('行业限额纠正连接金额') elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]: l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000) # elif flag and l["role_money"]['money_unit'] == '元': # l["role_money"]['money'] = 0 elif 0 maximum_amount: flag = 1 for money in moneys: if float(value['tendereeMoney'])/money == 10000 and l['role_money']['money_unit'] == '万元': value['tendereeMoney'] = str(Decimal(value['tendereeMoney'])/10000) # print('万倍关系纠正连接金额') flag = 0 break if (flag and value['tendereeMoneyUnit'] == '万元' or re.search('^\d{11,}(\.0)?$', str(value['tendereeMoney']))) and float(value['tendereeMoney']) > maximum_amount*100: #2024/5/23 改为单位万元且超过限额100倍才除一万,避免不合理纠正 比如 174255856 项目(系统)一亿变一万 value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000) # print('行业限额纠正连接金额') elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]: value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000) # elif flag and value['tendereeMoneyUnit'] == '元': # value['tendereeMoney'] = 0 elif 0 maximum_amount: if indu in ['餐饮业', '物业管理']: l["role_money"]['money'] = str(float(l["role_money"]['money'])/10000) elif l["role_money"]['money_unit'] == '万元': l["role_money"]['money'] = str(float(l["role_money"]['money'])/10000) if float(value['tendereeMoney']) > maximum_amount: if indu in ['餐饮业', '物业管理']: value['tendereeMoney'] = float(value['tendereeMoney'])/10000 elif value['tendereeMoneyUnit'] == '万元': value['tendereeMoney'] = float(value['tendereeMoney']) / 10000 except Exception as e: print('行业分类限制最高金额抛出异常:%s' % e) def get_win_joint(prem, list_entitys, list_sentences, list_articles): ''' 获取联合体信息, 添加到prem :param prem: :param list_entitys: :param list_sentences: :param list_articles: :return: ''' try: if 'win_tenderer' in str(prem[0]['prem']) and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]|与[^,。]{6,100}联合体', list_articles[0].content): sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index) for v in prem[0]['prem'].values(): for d in v['roleList']: if d.get('role_name', '') == 'win_tenderer': winner = d.get('role_text') join_l = [winner] for list_entity in list_entitys: for i in range(len(list_entity)-1): _entity = list_entity[i] b = _entity.wordOffset_begin e = _entity.wordOffset_end if _entity.entity_type in ['org', 'company'] and _entity.label==2\ and _entity.entity_text==winner: s = sentences[_entity.sentence_index].sentence_text find_joint = 0 # 是否包含联合体 for j in range(i+1, len(list_entity)): behind_entity = list_entity[j] b2 = behind_entity.wordOffset_begin e2 = behind_entity.wordOffset_end if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \ and b2-e<13 and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[e:b2]) or \ re.search('(联合(体|方|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]: join_l.append(behind_entity.entity_text) b = b2 e = e2 find_joint = 1 elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。', ')'] or s[e2:e2+3]=='联合体'): join_l.append(behind_entity.entity_text) b = b2 e = e2 elif e == e2: # 修复重复实体导致中断情况 continue else: break if len(join_l)>1: d['win_tenderer_joint'] = ','.join(set(join_l)) # behind_entity = list_entity[i + 1] # if _entity.sentence_index== behind_entity.sentence_index and _entity.entity_type in ['org', 'company'] and _entity.label==2\ # and _entity.entity_text==winner and behind_entity.entity_type in ['org', 'company'] and behind_entity.label==5: # s = sentences[_entity.sentence_index].sentence_text # b = _entity.wordOffset_begin # e = _entity.wordOffset_end # b2 = behind_entity.wordOffset_begin # e2 = behind_entity.wordOffset_end # if re.search('(联合体)', s[e2:e2+6]) and b2-e<3: # print('联合体:', s[max(0, b-10):e2+10]) # d['win_tenderer_joint'] = '%s,%s'%(_entity.entity_text, behind_entity.entity_text) # break # elif re.search('(联合体((牵头|主办)(人|方|单位)|主体)|牵头(人|方|单位))|(联合体)?成员:|特殊普通合伙:', s[e:b2]) and b2-e<10: # d['win_tenderer_joint'] = '%s,%s' % (_entity.entity_text, behind_entity.entity_text) # print('联合体:', s[max(0, b - 10):e2 + 10]) # break except Exception as e: print('获取联合体抛出异常', e) def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, all_winner=False): ''' 获取多中标人及正文、附件所有金额,多中标人multi_winner写入prem,返回金额列表 :param channel_dic: :param prem: :param list_entitys: :param list_sentences: :return: ''' def add_multi_winner(pack_l, winner_l): if len(prem[0]['prem']) > 1 and len(set([it[0] for it in pack_l])) > 1: # 多标段多中标人处理 pk_dic = {} for ent in winner_l: for i in range(len(pack_l)): pk, s1, b1, _ = pack_l[i] if ent[1] < s1 or ent[1] == s1 and ent[2] < b1: break elif (ent[1] > s1 or ent[1] == s1 and ent[2] > b1): if i < len(pack_l) - 1: pk2, s2, b2, _ = pack_l[i + 1] if (ent[1] < s2 or ent[1] == s2 and ent[2] < b2): if pk not in pk_dic: pk_dic[pk] = set() pk_dic[pk].add(ent[0]) else: continue else: if pk not in pk_dic: pk_dic[pk] = set() pk_dic[pk].add(ent[0]) else: continue for pk, multi_winner in pk_dic.items(): multi_winner = multi_winner - tenderee_or_agency if len(multi_winner) < 2: continue for k, v in prem[0]['prem'].items(): if pk == k: for d in v['roleList']: if d.get('role_name', '') == 'win_tenderer': if d.get('role_text', '') in multi_winner and 'multi_winner' not in d: d['multi_winner'] = ','.join(set(multi_winner)) elif 0 < len(prem[0]['prem']) < 3: # 修复 单包多中标人 例:285780273 multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency if len(multi_winner) > 1: for v in prem[0]['prem'].values(): for d in v['roleList']: if d.get('role_name', '') == 'win_tenderer': if d.get('role_text', '') in multi_winner and 'multi_winner' not in d: d['multi_winner'] = ','.join(set(multi_winner)) break moneys = [] moneys_attachment = [] if channel_dic['docchannel']['life_docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem): sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index) finalists = [] # 入围供应商 multi_winner_l = [] # 保存中标人名称列表 tenderee_or_agency = set() package_l = [] i = 0 while i < len(list_entitys[0])-1: ent = list_entitys[0][i] b_idx_fr = ent.wordOffset_begin e_idx_fr = ent.wordOffset_end i += 1 if ent.entity_type in ['money']: money = float(ent.entity_text) if ent.in_attachment: moneys_attachment.append(money) else: moneys.append(money) elif ent.entity_type in ['package']: package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment)) elif ent.entity_type in ['org', 'company']: sentence_text = sentences[ent.sentence_index].sentence_text pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr] if ent.label in [0,1] and ent.values[ent.label] > 0.8: tenderee_or_agency.add(ent.entity_text) elif ent.label == 2 and (ent.values[ent.label] > 0.8 or all_winner): multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment)) for j in range(i, len(list_entitys[0])): ent_bh = list_entitys[0][j] b_idx_bh = ent_bh.wordOffset_begin e_idx_bh = ent_bh.wordOffset_end if ent_bh.entity_type in ['org','company'] and ent_bh.label in [2,5] and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]: if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and ( len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']): # 修复多中标人刚好在文末index超出报错,例子 407126558 multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment)) e_idx_fr = e_idx_bh i = j + 1 else: break elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr: # 两实体间没符号分割情况 multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment)) e_idx_fr = e_idx_bh i = j + 1 elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and e_idx_fr == e_idx_bh: # 处理 514603520 中国邮政储蓄银行股份有限公司淄博市临淄区支行 实体由于字典匹配重复两次情况 i = j + 1 else: break if re.search('入围', pre_text) and re.search('未入围', pre_text)==None: finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment)) elif all_winner==1 and ent.label in [3,4,5] and re.search('第[一二三四五六七八九十0-9]+名|候选(人|单位)|入围(单位|供应商)|投标银行', pre_text) and re.search('未', pre_text)==None: multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment)) if len(multi_winner_l)>=2: winner_main = [it for it in multi_winner_l if not it[3]] winner_attn = [it for it in multi_winner_l if it[3]] pack_main = [it for it in package_l if not it[3]] pack_attn = [it for it in package_l if it[3]] if len(set([it[0] for it in winner_main]))>=2: # 有两个及以上多中标人及多标段 例:441612746 add_multi_winner(pack_main, winner_main) elif len(set([it[0] for it in winner_attn]))>=2: add_multi_winner(pack_attn, winner_attn) if len(finalists)>=2: # 多入围候选人 winner_main = [it for it in finalists if not it[3]] winner_attn = [it for it in finalists if it[3]] pack_main = [it for it in package_l if not it[3]] pack_attn = [it for it in package_l if it[3]] if len(set([it[0] for it in winner_main]))>=2: # 有两个及以上多中标人及多标段 例:276326152 add_multi_winner(pack_main, winner_main) elif len(set([it[0] for it in winner_attn]))>=2: add_multi_winner(pack_attn, winner_attn) else: for i in range(len(list_entitys[0])): ent = list_entitys[0][i] if ent.entity_type in ['money']: money = float(ent.entity_text) if ent.in_attachment: moneys_attachment.append(money) else: moneys.append(money) return {'moneys': list(set(moneys)), 'moneys_attachment': list(set(moneys_attachment))} def update_prem(old_prem, new_prem, in_attachment=False): ''' 根据新旧对比,更新数据 :param old_prem: :param new_prem: 表格提取的要素 :return: ''' if len(new_prem) >= 1 : '''如果表格提取的包大于2,原来的包比表格提取的包多则删除原来多余的包,以表格的为准''' if len(new_prem) >= 2 and (len(new_prem) len(new_prem) and len(new_prem)>1 and in_attachment==False: # 如果表格有提取,非表格包数比表格提取多,去掉非表格在附件里提取的包 del_k = [] for k in old_prem: if 'in_attachment' in old_prem[k] and old_prem[k]['in_attachment'] and k not in new_prem and k != 'Project': del_k.append(k) for k in del_k: old_prem.pop(k) if in_attachment: # 附件表格提取的,原来提取有中标人,停止替换 for v in old_prem.values(): for d in v['roleList']: if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']: return 0 # if len(new_prem) > len(old_prem) and [k for k in new_prem if '自增' not in k] == []: # 如果表格提取包号都为自增编号且包数大于非表格提取,不进行更新 例 244355092 281854766 # return None if len(old_prem) == 2 and len(new_prem) == 1 and ('Project' in new_prem or set(new_prem)&set(old_prem)==set()): # 如果表格提取包为Project,非表格提取两个包且一个包为Project,把表格提取合并到非Project包 k = list(old_prem.keys()-set(['Project']))[0] k_new = list(new_prem.keys())[0] new_prem[k] = new_prem.pop(k_new) elif len(old_prem) == 1 and len(new_prem) == 1 and 'Project' not in old_prem and set(new_prem)&set(old_prem)==set(): # 如果表格提取包与非表格提取都是一个包且不同,把表格提取包名替换为非表格包名 k = list(old_prem.keys()-set(['Project']))[0] k_new = list(new_prem.keys())[0] new_prem[k] = new_prem.pop(k_new) if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem: # 如果表格提取到包号,非表格没提取到,合并到Project k = list(new_prem.keys())[0] new_prem['Project'] = new_prem[k] multi_tendereeMoney = [] # 多包招标金额 for k, v in new_prem.items(): if k == 'Project': if 'Project' in old_prem: tmp_l = [] # 保存新旧同时包含的角色 if v.get('code', "") != "": old_prem['Project']['code'] = v.get('code', "") if v.get('name', "") != "": old_prem['Project']['name'] = v.get('name', "") for d in old_prem['Project']['roleList']: for d2 in v['roleList']: if d['role_name'] == d2['role_name']: # 同时包含的角色用表格的替换 tmp_l.append(d2) if d2['role_text'] != "": d['role_text'] = d2['role_text'] if d2['serviceTime'] != "": d['serviceTime'] = d2['serviceTime'] if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换 d['role_money']['money'] = d2['role_money']['money'] d['role_money']['money_unit'] = d2['role_money']['money_unit'] for k in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等 if d2[k]: d[k] = d2[k] for d2 in v['roleList']: if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去 old_prem['Project']['roleList'].append(d2) if float(new_prem['Project']['tendereeMoney'])!=0: old_prem['Project']['tendereeMoney'] = new_prem['Project']['tendereeMoney'] # 20240508 修复 464187225 表格提取纠正招标金额错误 else: old_prem[k] = v else: if v['tendereeMoney'] != 0: multi_tendereeMoney.append(v['tendereeMoney']) if k.startswith('自增'): # 表格提取的没找到包号 按行数添加包号,前面加自增,例 自增1 k = k[2:] if k not in old_prem: # 新有旧没有的包直接添加 old_prem[k] = v else: tmp_l = [] # 保存新旧同时包含的角色 if v.get('code', "") != "": old_prem[k]['code'] = v.get('code', "") if v.get('name', "") != "": old_prem[k]['name'] = v.get('name', "") for d in old_prem[k]['roleList']: for d2 in v['roleList']: if d['role_name'] == d2['role_name']: tmp_l.append(d2) if d2['role_text'] != "": d['role_text'] = d2['role_text'] if d2['serviceTime'] != "": d['serviceTime'] = d2['serviceTime'] if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换 d['role_money']['money'] = d2['role_money']['money'] d['role_money']['money_unit'] = d2['role_money']['money_unit'] for k2 in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等 if d2[k2]: d[k2] = d2[k2] for d2 in v['roleList']: if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去 old_prem[k]['roleList'].append(d2) if v['tendereeMoney'] != 0: old_prem[k]['tendereeMoney'] = v['tendereeMoney'] # 2024/05/24 使用表格招标金额 if multi_tendereeMoney and 'Project' in old_prem and float(old_prem['Project']['tendereeMoney'])!=0: # 表格提取到多标段招标金额,去掉Project包招标金额 old_prem['Project']['tendereeMoney'] = 0 # return old_prem def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0): ''' 规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包; :param prem: prem 字段字典 :return: ''' if len(prem) > 1: # 表格提取到中标人的,去掉project包中标人 pro_winner = set() other_winner = set() other_winner_prob = 0 pro_winner_prob = 0 empty_roleList = [] for k in prem: prem[k]['uuid'] = str(uuid.uuid4()) # 20240627 每个包都添加uuid if prem[k]['roleList'] == []: empty_roleList.append(k) for d in prem[k]['roleList']: if d['role_name'] in ['win_tenderer', 'pre_win_tenderer', 'second_tenderer','third_tenderer']: if k == 'Project': pro_winner.add(d['role_text']) if 'win_tenderer_joint' in d: pro_winner.update(set(d['win_tenderer_joint'].split(','))) if 'multi_winner' in d: pro_winner.update(set(d['multi_winner'].split(','))) if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6: pro_winner_prob = d.get('role_prob', 0) else: other_winner.add(d['role_text']) if 'win_tenderer_joint' in d: other_winner.update(set(d['win_tenderer_joint'].split(','))) if 'multi_winner' in d: other_winner.update(set(d['multi_winner'].split(','))) if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6: other_winner_prob = d.get('role_prob', 0) if pro_winner!=set() and (pro_winner & other_winner != set() or other_winner_prob>pro_winner_prob): # 如果默认包与其他包中标人重复或其他包中标人概率比默认包大,删除默认包中标人 prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if d['role_name'] not in ['win_tenderer', 'second_tenderer', 'third_tenderer']] elif other_winner_prob2 and len(product_list)>2: ent_l = [] for entity in list_entity: if entity.entity_type in ['product', 'package']: ent_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment)) ent_l.sort(key=lambda x: [x[2],x[3]]) i = 0 pk_dic = {} while i < len(ent_l)-1: ty1, ent1, s1, b1, e1, in_att1 = ent_l[i] ty2, ent2, s2, b2, e2, in_att2 = ent_l[i+1] if in_att1 == in_att2 and ty1 == 'package' and ty2 == 'product' and s1 == s2 and 0 1: for k, v in prem.items(): if k in pk_dic and v.get('name', '') == '': v['name'] = pk_dic[k] elif name != '' and len(prem)<=2: # 20241129 小于等于两个包且无包名称,取项目名称 for k in prem: if prem[k].get('name', '') == '': prem[k]['name'] = name def fix_single_source(prem, channel_dic, original_docchannel): if prem.get('bidway', '') == '单一来源' and channel_dic['docchannel']['docchannel'] == '招标公告' and original_docchannel==52: for l in prem['prem'].values(): for d in l['roleList']: if d['role_name'] == "win_tenderer": d['role_name'] = 'pre_win_tenderer' if __name__=="__main__": ''' conn = getConnection() cursor = conn.cursor() #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200" sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id " result = [] cursor.execute(sql) rows = cursor.fetchall() count = 0 for row in rows: count += 1 # print(count) doc_id = row[0] roleList = getPackageRoleMoney(doc_id) result.append([doc_id,str(roleList),row[1]]) '''''' with codecs.open("getAttribute.html","w",encoding="utf8") as f: f.write('\ \ \ \ \ \ \ \ ') for item in result: f.write(""+""+""+""+"") f.write("
doc_id角色
"+item[0]+""+item[1]+""+item[2]+"
") '''