12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152 |
- from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
- from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
- from decimal import Decimal
- import re
- import copy
- import math
- import pandas as pd
- import os
- from scipy.optimize import linear_sum_assignment
- from BiddingKG.dl.interface.Entitys import Match
- import numpy as np
def getTheRole(entity, role_list):
    """Return the index of the first role whose container holds ``entity``.

    @param:
        entity: entity name to look for
        role_list: sequence of containers (anything supporting ``in``),
            one per role
    @return: index of the first element of ``role_list`` that contains
        ``entity``, or None when no element contains it
    """
    return next(
        (role_index for role_index, role in enumerate(role_list) if entity in role),
        None,
    )
# Role label (as a string) -> role name. The position of each name in the
# tuple is its label.
dict_role_id = {
    str(_label): _role_name
    for _label, _role_name in enumerate(
        ("tenderee", "agency", "win_tenderer", "second_tenderer", "third_tenderer")
    )
}
def getPackage(packageList, sentence_index, begin_index, roleid, MAX_DIS=None, DIRECT=None):
    """Find the package whose scope covers the given entity position.

    Every package whose scope contains (sentence_index, begin_index) is a
    candidate. Candidates are scanned front to back and the first one that
    has not yet been hit by ``roleid`` is returned (and marked as hit). When
    every candidate was already hit by ``roleid`` the first candidate is
    returned. When there is no candidate, ``(None, False)`` is returned.

    @param:
        packageList: list of dicts, each with the keys
            "sentence_index"    - sentence the package mention is in
            "offsetWords_begin" - offset of the mention inside that sentence
            "scope"             - [[start_sentence, start_offset],
                                   [end_sentence, end_offset]], both inclusive
            "hit"               - set of role ids already assigned (mutated here)
            "pointer"           - object returned to the caller for this package
        sentence_index: sentence index of the entity
        begin_index: start offset of the entity inside its sentence
        roleid: role identifier recorded in the package's "hit" set
        MAX_DIS: optional maximum sentence distance between the entity and
            the package mention; None disables the check
        DIRECT: "L" keeps only packages mentioned at or before the entity,
            "R" keeps only packages mentioned at or after it, anything else
            applies no direction filter
    @return: tuple (pointer, flag). ``flag`` is True only when candidates
        exist and all of them had already been hit by ``roleid``.
    """
    if len(packageList) == 0:
        return None, False

    list_legalPack = []
    for pack_index, pack in enumerate(packageList):
        pack_sentence = pack["sentence_index"]

        # Direction filter. The mention offset is only read when the package
        # sits in the same sentence as the entity.
        if DIRECT == "L" and (
            pack_sentence > sentence_index
            or (pack_sentence == sentence_index and pack["offsetWords_begin"] > begin_index)
        ):
            continue
        # Uses the same "offsetWords_begin" key as the "L" branch; the
        # lower-case spelling "offsetwords_begin" is not a key of the package
        # dict and raised KeyError.
        if DIRECT == "R" and (
            pack_sentence < sentence_index
            or (pack_sentence == sentence_index and pack["offsetWords_begin"] < begin_index)
        ):
            continue

        # Scope check: scope start <= entity position <= scope end, comparing
        # (sentence, offset) pairs lexicographically.
        scope_begin, scope_end = pack["scope"][0], pack["scope"][1]
        position = (sentence_index, begin_index)
        in_scope = (
            (scope_begin[0], scope_begin[1]) <= position <= (scope_end[0], scope_end[1])
        )
        if not in_scope:
            continue

        if MAX_DIS is not None and abs(sentence_index - pack_sentence) > MAX_DIS:
            continue
        list_legalPack.append(pack_index)

    for _index in list_legalPack:
        hit_roles = packageList[_index]["hit"]
        if roleid not in hit_roles:
            hit_roles.add(roleid)
            return packageList[_index]["pointer"], False

    if list_legalPack:
        # Every candidate already carries this role: fall back to the first
        # *candidate* (not the first package of the whole article, which may
        # not even cover the entity).
        return packageList[list_legalPack[0]]["pointer"], True
    return None, False
# Build the legal role combinations
def get_legal_comba(list_entity, dict_role_combination):
    """Enumerate the non-conflicting role assignments over all packages.

    @param:
        list_entity: entities of the article; only forwarded to
            get_dict_entity_prob to score combinations
        dict_role_combination: {packageName: {role_label: set(entity_text)}};
            the empty string inside a set stands for "role not filled"
    @return: list of dicts mapping "packageName$$role_label" -> entity_text.
        When an entity appears both in the "Project" package and in another
        package of the same combination, the combination is split into two
        dicts, one keeping the Project assignment and one keeping the other.
    """

    def circle_package(_dict_legal_combination):
        """Return the candidate {role: entity} dicts of a single package."""
        list_dict_role_first = []
        for _role in _dict_legal_combination:
            if len(list_dict_role_first) == 0:
                # Seed with one single-role dict per non-empty entity.
                for _entity in _dict_legal_combination[_role]:
                    if _entity != "":
                        list_dict_role_first.append({_role: _entity})
                continue

            list_dict_role_after = []
            for _entity in _dict_legal_combination[_role]:
                if _entity == "":
                    continue
                for _dict in list_dict_role_first:
                    _flag = True
                    for _key1 in _dict:
                        if _entity == _dict[_key1]:
                            # The tenderee ("0") and the agency ("1") may be
                            # the same entity; any other reuse is a conflict.
                            _flag = (
                                str(_key1) in ["0", "1"] and str(_role) in ["0", "1"]
                            )
                    # Hard cap on the number of generated combinations.
                    if len(list_dict_role_after) > 100000:
                        break
                    if _flag:
                        _new_dict = copy.copy(_dict)
                        _new_dict[_role] = _entity
                        list_dict_role_after.append(_new_dict)
                    else:
                        # 2021/5/25 update: same entity text, different role.
                        # One stand-alone {role: entity} candidate is added
                        # per existing assignment of this entity.
                        for _other in list_dict_role_first:
                            for _assigned in _other.values():
                                if _entity == _assigned:
                                    list_dict_role_after.append({_role: _entity})
            if list_dict_role_after:
                list_dict_role_first.extend(list_dict_role_after)
        return list_dict_role_first

    def circle_pageages(_dict_legal_combination):
        """Combine the per-package candidates into whole-article dicts.

        Keys become "package$$role". Packages are merged pairwise as a
        cartesian product; while the accumulated list is empty it is simply
        replaced by the next package's candidates.
        """
        list_all_selution = []
        for _key_pack in _dict_legal_combination.keys():
            list_key_selution = [
                {_key_pack + "$$" + _key_role: item[_key_role] for _key_role in item.keys()}
                for item in _dict_legal_combination[_key_pack]
            ]
            if len(list_all_selution) == 0:
                list_all_selution = list_key_selution
            else:
                list_all_selution = [
                    dict(item_1, **item_2)
                    for item_1 in list_all_selution
                    for item_2 in list_key_selution
                ]
        return list_all_selution

    # Candidates of every package, with strict subsets removed.
    _dict_legal_combination = {}
    for packageName in dict_role_combination.keys():
        _list_all_selution = circle_package(dict_role_combination[packageName])
        _list_set_all_selution = [set(item.items()) for item in _list_all_selution]
        # Too many candidates: skip the quadratic subset pruning.
        if len(_list_set_all_selution) > 1000:
            _dict_legal_combination[packageName] = _list_all_selution
            continue
        # Drop a candidate when it is a proper subset of another candidate.
        _dict_legal_combination[packageName] = [
            _list_all_selution[i]
            for i, set_i in enumerate(_list_set_all_selution)
            if not any(set_i < set_j for set_j in _list_set_all_selution)
        ]

    # Size of the cartesian product over all packages.
    _comba_count = 1
    for _key in _dict_legal_combination.keys():
        _comba_count *= len(_dict_legal_combination[_key])

    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    if _comba_count > 250:
        # Too many combinations: keep only the best-scoring candidate of each
        # package. max() keeps the first candidate on ties and never discards
        # a package just because all of its scores are very low.
        def _pack_expectation(_key_pack, item):
            _dict = {str(_key_pack) + "$$" + str(_key): item[_key] for _key in item.keys()}
            return getSumExpectation(dict_pack_entity_prob, _dict)

        new_dict_legal_combination = dict()
        for _key_pack, list_comba in _dict_legal_combination.items():
            if not list_comba:
                continue
            best_item = max(
                list_comba, key=lambda item, _pack=_key_pack: _pack_expectation(_pack, item)
            )
            new_dict_legal_combination[_key_pack] = [best_item]
        _dict_legal_combination = new_dict_legal_combination

    _list_final_comba = circle_pageages(_dict_legal_combination)

    # Only the "Project" package (tenderee / agency) can clash with the other
    # packages. If an entity shows up in both, split the combination.
    _list_real_comba = []
    for dict_item in _list_final_comba:
        set_project = set()
        set_other = set()
        for _key, _value in dict_item.items():
            if _key.split("$$")[0] == "Project":
                set_project.add(_value)
            else:
                set_other.add(_value)
        set_common = set_project & set_other
        if len(set_common) == 0:
            _list_real_comba.append(dict_item)
            continue

        dict_project = {}
        dict_not_project = {}
        for _key, _value in dict_item.items():
            if _value not in set_common:
                dict_project[_key] = _value
                dict_not_project[_key] = _value
            elif str(_key.split("$$")[0]) == "Project":
                dict_project[_key] = _value
            else:
                dict_not_project[_key] = _value
        _list_real_comba.append(dict_project)
        _list_real_comba.append(dict_not_project)
    return _list_real_comba
def get_dict_entity_prob(list_entity, on_value=0.5):
    """Collect the best role probability per (package, role, entity text).

    Only entities of type 'org' or 'company' whose probability for their own
    label is at least ``on_value`` and whose label is not "5" are kept.

    @param:
        list_entity: entities exposing entity_type, values, label,
            packageName and entity_text
        on_value: probability threshold
    @return: dict mapping "packageName$$label$text$entity_text" to
        [entity_text, highest role probability seen for that key]
    """
    dict_pack_entity_prob = {}
    for entity in list_entity:
        if entity.entity_type not in ['org', 'company']:
            continue
        label = str(entity.label)
        role_prob = float(entity.values[int(entity.label)])
        if not (role_prob >= on_value and label != "5"):
            continue
        # The key is built only for entities that pass the filter, so
        # packageName is never read on entities that are discarded anyway.
        _key_prob = entity.packageName + "$$" + label + "$text$" + entity.entity_text
        best = dict_pack_entity_prob.get(_key_prob)
        if best is None or role_prob > best[1]:
            dict_pack_entity_prob[_key_prob] = [entity.entity_text, role_prob]
    return dict_pack_entity_prob
# Compute the summed expectation
def getSumExpectation(dict_pack_entity_prob, combination, on_value=0.5):
    """Score a role combination against the per-entity role probabilities.

    Every entry of ``dict_pack_entity_prob`` contributes the square of its
    probability: positively when ``combination`` assigns exactly that entity
    text to that "package$$role" key, negatively otherwise.

    @param:
        dict_pack_entity_prob: {"package$$role$text$entity_text":
            [entity_text, role_prob]}
        combination: {"package$$role": entity_text}
        on_value: unused; kept for backward compatibility of the signature
    @return: the summed expectation (0 when there are no entries)
    """
    expect = 0
    for _key_pack_entity, (entity_text, role_prob) in dict_pack_entity_prob.items():
        _key_pack = _key_pack_entity.split("$text$")[0]
        chosen = _key_pack in combination and combination[_key_pack] == entity_text
        # Each key is visited exactly once, so the signed probability can be
        # folded into the total directly.
        signed_prob = role_prob if chosen else -role_prob
        symbol = 1 if signed_prob > 0 else -1
        expect += symbol * math.pow(signed_prob, 2)
    return expect
def getRoleList(list_sentence, list_entity, on_value=0.5):
    """Pick the role combination with the highest summed expectation.

    All non-conflicting role combinations are generated and the one with the
    largest value of getSumExpectation is turned into the result.

    Side effects on ``list_entity``: entities that pass the filters get
    ``packageCode``, ``roleName`` and ``packageName`` set, and
    ``pointer_pack`` set when a package was found for them; afterwards
    ``pointer_pack`` is reset to None on every entity whose
    (package, entity text) pair is not part of the chosen combination.

    @param:
        list_sentence: all sentences of the article
        list_entity: all entities of the article
        on_value: probability threshold
    @return: None when getPackagesFromArticle returns None, otherwise the
        tuple (RoleList, RoleSet, PackageList, PackageSet) where RoleList is
        a list of PREM objects and RoleSet the set of chosen entity texts
    """
    pack = getPackagesFromArticle(list_sentence, list_entity)
    if pack is None:
        return None
    PackageList, PackageSet, dict_PackageCode = pack

    # {packageName: {role_label: set(entity_text)}}
    dict_role_combination = {}
    for entity in list_entity:
        if entity.entity_type not in ['org', 'company']:
            continue
        # Skip entities whose text is 3 characters or shorter.
        if len(entity.entity_text) <= 3:
            continue
        label = str(entity.label)
        role_prob = float(entity.values[int(entity.label)])
        if not (role_prob >= on_value and label != "5"):
            continue

        # Tenderee ("0") and agency ("1") always belong to "Project"; other
        # roles fall back to "Project" when no package can be located.
        packageName = "Project"
        if label not in ["0", "1"] and len(PackageSet) > 0:
            packagePointer, _ = getPackage(
                PackageList, entity.sentence_index, entity.begin_index, "role-" + label
            )
            if packagePointer is not None:
                entity.pointer_pack = packagePointer
                packageName = packagePointer.entity_text

        entity.packageCode = dict_PackageCode.get(packageName, "")
        entity.roleName = dict_role_id.get(label)
        entity.packageName = packageName

        if packageName not in dict_role_combination:
            # Every role starts with the empty placeholder.
            dict_role_combination[packageName] = {
                str(_roleId): set([""]) for _roleId in [0, 1, 2, 3, 4]
            }
        dict_role_combination[packageName].setdefault(label, set()).add(entity.entity_text)

    list_real_comba = get_legal_comba(list_entity, dict_role_combination)

    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    RoleList = []
    RoleSet = set()
    if len(list_real_comba) > 0:
        # max() returns the first best combination and has no lower bound, so
        # the true maximum is selected even when every score is very negative.
        best_comba = max(
            list_real_comba,
            key=lambda item_combination: getSumExpectation(dict_pack_entity_prob, item_combination),
        )
        for _key, entity_text in best_comba.items():
            key_parts = _key.split("$$")
            packageName = key_parts[0]
            role_name = dict_role_id.get(str(key_parts[1]))
            packagecode = dict_PackageCode.get(packageName, "")
            RoleList.append(PREM(packageName, packagecode, role_name, entity_text, 0, 0, 0.0, []))
            RoleSet.add(entity_text)

    # Align the entity -> package links in list_entity with the chosen result.
    for _entity in list_entity:
        if _entity.pointer_pack is None:
            continue
        _pack_name = _entity.pointer_pack.entity_text
        _find_flag = any(
            _prem.packageName == _pack_name and _prem.entity_text == _entity.entity_text
            for _prem in RoleList
        )
        if not _find_flag:
            _entity.pointer_pack = None
    return RoleList, RoleSet, PackageList, PackageSet
def getPackageScopePattern():
    """Build the regex source that marks where a package scope ends.

    The keywords are read from the "list_word" column of ``end.xls`` located
    next to this module. The characters ``( ) . [ ] -`` inside a keyword are
    backslash-escaped; everything else is inserted as is.

    @return: pattern string of the form
        "(kw1|kw2|...)[::是为]|业绩.{,30}标段[...]{0,3}"; when the sheet holds
        no keywords only the second alternative is returned
    """
    # os.path.join keeps the path relative when dirname(__file__) is empty,
    # whereas plain concatenation would point at "/end.xls".
    df = pd.read_excel(os.path.join(os.path.dirname(__file__), "end.xls"))

    # One-pass escaping; the escaped characters never produce a character that
    # is itself in the table, so this equals the chained replace() calls.
    escape_table = str.maketrans({char: "\\" + char for char in "().[]-"})
    keywords = [str(item).translate(escape_table) for item in df["list_word"]]

    fallback = "业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
    if not keywords:
        return fallback
    return "(" + "|".join(keywords) + ")[::是为]|" + fallback


# Compiled lazily by the re module at each use; built once at import time.
pattern_packageScope = getPackageScopePattern()
- def getPackagesFromArticle(list_sentence,list_entity):
- '''
- @param:
- list_sentence:文章的句子list
- @summary: 将包的信息插入list_entity中
- @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
- '''
-
- if len(list_sentence)==0:
- return None
- list_sentence.sort(key=lambda x:x.sentence_index)
- PackageList = []
- PackageList_scope = []
- PackageSet = set()
- dict_packageCode = dict()
-
- package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
- package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
- package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
- # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
- other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
- win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整
- model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整
- number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
- package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
- # 纯数字类型的包号统一,例如:'01','1'
- re_digital = re.compile("^\d+$")
- def changeIndexFromWordToWords(tokens,word_index):
- '''
- @summary:转换某个字的字偏移为词偏移
- '''
- before_index = 0
- after_index = 0
- for i in range(len(tokens)):
- after_index = after_index+len(tokens[i])
- if before_index<=word_index and after_index>=word_index:
- return i
- before_index = after_index
- package_names = []
-
- def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
- '''
- @summary:抽取包附近的标段号
- @param:
- tokens:包所在句子的分词
- word_index:包所在字偏移
- size:左右各取多少个词
- pattern:提取标段号的正则
- @return: type:string,meaning:标段号
- '''
- index = changeIndexFromWordToWords(tokens,word_index)
- if index<size:
- begin = index
- else:
- begin = index-size
- if index+size>len(tokens):
- end = len(tokens)
- else:
- end = index+size
- #拿到左右两边的词语组成短语
- text = "".join(tokens[begin:end])
- #在短语中的字偏移
- new_word_index = word_index-len("".join(tokens[:begin]))
- min_distance = len(text)
- packageCode = None
- for the_iter in re.finditer(pattern,text):
- #算出最小距离
- distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
- if distance<min_distance:
- min_distance = distance
- packageCode = the_iter.group(1)
- return packageCode
- #从标段介绍表格中提取包名和包号
- for i in range(len(list_sentence)):
- content = list_sentence[i].sentence_text
- names = re.findall(package_name_pattern,content)
- if names == []:
- names = re.findall(other_package_pattern, content)
- N_names = re.findall(package_N_name_pattern,content)
- if len(names)==1 and len(N_names)==1:
- package_names.append([names[0][-1],N_names[0][-1]])
- for i in range(len(list_sentence)):
- PackageList_item = []
- PackageList_item_scope = []
- content = list_sentence[i].sentence_text
- tokens = list_sentence[i].tokens
- _names = []
- # 2021/6/23 包名称去重
- for name in package_names:
- if name not in _names:
- _names.append(name)
- # for name in package_names[:20]:
- for name in _names[:20]:
- for index in findAllIndex(name[0],content):
- temp_package_number = re.findall(number_pattern,name[1])[0]
- if re.search(re_digital,temp_package_number):
- temp_package_number = str(int(temp_package_number))
- PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
- # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
- code = extractPackageCode(tokens, index)
- if code is not None:
- dict_packageCode[temp_package_number] = code
- PackageSet.add(temp_package_number)
- for iter in re.finditer(package_number_pattern,content):
- temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
- if re.search(re_digital, temp_package_number):
- temp_package_number = str(int(temp_package_number))
- PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
- # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
- code = extractPackageCode(tokens, iter.span()[0])
- if code is not None:
- dict_packageCode[temp_package_number] = code
- PackageSet.add(temp_package_number)
-
- #识别packageScope
- for iter in re.finditer(pattern_packageScope,content):
- PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
- # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
- PackageList_item_scope = PackageList_item +PackageList_item_scope
- PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
- PackageList_scope = PackageList_scope+PackageList_item_scope
- PackageList_item.sort(key=lambda x:x["sentence_index"])
- #PackageList = PackageList+PackageList_item
- #不作为包
- # if len(PackageSet)==0:
- # for i in range(len(list_sentence)):
- # PackageList_item = []
- # PackageList_item_scope = []
- # content = list_sentence[i].sentence_text
- # tokens = list_sentence[i].tokens
- # for iter in re.finditer(other_package_pattern,content):
- # temp_package_number = iter.group(2)
- # PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
- # # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
- # code = extractPackageCode(tokens, iter.span()[0])
- # if code is not None:
- # dict_packageCode[temp_package_number] = code
- # PackageSet.add(temp_package_number)
- # #识别packageScope
- # for iter in re.finditer(pattern_packageScope,content):
- # PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
- # # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
- # PackageList_item_scope = PackageList_item +PackageList_item_scope
- # PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
- # PackageList_scope = PackageList_scope+PackageList_item_scope
- # PackageList_item.sort(key=lambda x:x["sentence_index"])
- # 2020/11/23 大网站规则 调整
- if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
- for i in range(len(list_sentence)):
- PackageList_item = []
- PackageList_item_scope = []
- content = list_sentence[i].sentence_text
- tokens = list_sentence[i].tokens
- names = re.findall(other_package_pattern, content)
- N_names = re.findall(win_tenderer_pattern, content)
- if len(names) != 1 or len(N_names) != 1:
- continue
- for iter in re.finditer(other_package_pattern,content):
- temp_package_number = iter.group(4)
- xinghao = re.search(model_pattern, content)
- if xinghao:
- temp_package_number = temp_package_number + '+' + xinghao.group(2)
- # print('新正则采购包名补充',temp_package_number)
- if re.search(re_digital,temp_package_number):
- temp_package_number = str(int(temp_package_number))
- PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
- # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
- code = extractPackageCode(tokens, iter.span()[0])
- if code is not None:
- dict_packageCode[temp_package_number] = code
- PackageSet.add(temp_package_number)
- #识别packageScope
- for iter in re.finditer(pattern_packageScope,content):
- PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
- # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
- PackageList_item_scope = PackageList_item +PackageList_item_scope
- PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
- PackageList_scope = PackageList_scope+PackageList_item_scope
- PackageList_item.sort(key=lambda x:x["sentence_index"])
- pattern_punctuation = "[::()\(\),,。;;]"
- # print("===packageList_scope",PackageList_scope)
- for i in range(len(list_sentence)):
- for j in range(len(PackageList_scope)):
- if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
- _flag = False
- left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
- right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
- _left_find = re.findall(pattern_punctuation,left_str)
- _right_find = re.findall(pattern_punctuation,right_str)
- #print(left_str)
- if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
- continue
- if re.search("划分",right_str[:10]) is not None:
- continue
- if len(_left_find)>0 and _left_find[-1] in [":",":"]:
- _flag = True
- if len(_right_find)>0 and _right_find[0] in [":",":"]:
- _flag = True
- if _flag:
- scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
- else:
- if j==0:
- scope_begin = [0,0]
- else:
- scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
- if j==len(PackageList_scope)-1:
- scope_end = [list_sentence[-1].sentence_index,changeIndexFromWordToWords(list_sentence[-1].tokens, len(list_sentence[-1].sentence_text))]
- else:
- scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
- if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
- continue
- #add package to entity
- _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"])
- list_entity.append(_pack_entity)
- copy_pack = copy.copy(PackageList_scope[j])
- copy_pack["scope"] = [scope_begin,scope_end]
- copy_pack["hit"] = set()
- copy_pack["pointer"] = _pack_entity
- PackageList.append(copy_pack)
- return PackageList,PackageSet,dict_packageCode
- from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
- relationExtraction_model = Model_relation_extraction()
- def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,list_sentence,on_value = 0.5,on_value_person=0.5,sentence_len=4):
- '''
- @param:
- PackDict:文章包dict
- roleSet:文章所有角色的公司名称
- PackageList:文章的包信息
- PackageSet:文章所有包的名称
- list_entity:文章所有经过模型处理的实体
- on_value:金额模型的阈值
- on_value_person:联系人模型的阈值
- sentence_len:公司和属性间隔句子的最大长度
- @return:添加了属性信息的角色list
- '''
-
- #根据roleid添加金额到rolelist中
def addMoneyByRoleid(packDict, packageName, roleid, money, money_prob):
    """Store ``money`` on the roles of one package that match a role id.

    The role id is translated to a role name through the module-level
    ``dict_role_id`` mapping (looked up with ``str(roleid)``).  Every role in
    ``packDict[packageName]["roleList"]`` carrying that role name receives the
    new amount, but only when ``money_prob`` is strictly greater than the
    probability already stored on the role.

    @param packDict: package dict; each value holds a "roleList"
    @param packageName: key of the package to update
    @param roleid: role id, converted with ``str`` before the lookup
    @param money: amount to store on the role
    @param money_prob: probability attached to the amount
    @return: the same ``packDict`` object, updated in place
    """
    for role in packDict[packageName]["roleList"]:
        if role.role_name != dict_role_id.get(str(roleid)):
            continue
        # Only a strictly more confident amount replaces the stored one.
        if money_prob > role.money_prob:
            role.money = money
            role.money_prob = money_prob
    return packDict
-
- #根据实体名称添加金额到rolelist中
def addMoneyByEntity(packDict, packageName, entity, money, money_prob):
    """Store a money entity on the roles of one package matching an entity name.

    For every role in ``packDict[packageName]["roleList"]`` whose
    ``entity_text`` equals ``entity``, the role's ``money``, ``money_prob`` and
    ``money_unit`` are overwritten when any of the following holds:

    * the role has no amount yet (``money_prob == 0``) -- first assignment;
    * ``money_prob`` exceeds the stored probability by more than 0.2;
    * ``money.notes`` is '大写' (2021/7/20 change: amounts written in
      capital numerals are preferred).

    @param packDict: package dict; each value holds a "roleList"
    @param packageName: key of the package to update
    @param entity: entity text used to select the roles
    @param money: money entity; ``entity_text``, ``money_unit`` and ``notes`` are read
    @param money_prob: probability attached to the amount
    @return: the same ``packDict`` object, updated in place
    """
    for role in packDict[packageName]["roleList"]:
        if role.entity_text != entity:
            continue
        # Short-circuit order matters: ``money.notes`` is only consulted when
        # the two probability checks both fail.
        should_replace = (
            role.money_prob == 0
            or money_prob > role.money_prob + 0.2
            or money.notes in ['大写']
        )
        if should_replace:
            role.money = money.entity_text
            role.money_prob = money_prob
            role.money_unit = money.money_unit
    return packDict
-
- #根据实体名称得到角色
def getRoleWithText(packDict, entity_text):
    """Look up the role name attached to an entity text.

    Packages are scanned in dict order and, inside each package, roles in
    list order; the ``role_name`` of the first role whose ``entity_text``
    equals the argument is returned.

    @param packDict: package dict; each value holds a "roleList"
    @param entity_text: entity text to search for
    @return: the matching role name, or None when no role matches
    """
    for package in packDict.values():
        for role in package["roleList"]:
            if role.entity_text == entity_text:
                return role.role_name
    return None
-
def doesEntityOrLinkedEntity_inRoleSet(entity, RoleSet):
    """Tell whether an entity, or any entity linked to it, is a known role.

    The check is done on ``entity_text``: the entity itself comes first,
    followed by every element of ``entity.linked_entitys``.

    @param entity: entity exposing ``entity_text`` and a list ``linked_entitys``
    @param RoleSet: container of role entity texts (membership is tested with ``in``)
    @return: True when at least one text is in ``RoleSet``, otherwise False
             (previously the function fell off the end and returned None)
    """
    candidates = [entity] + entity.linked_entitys
    return any(candidate.entity_text in RoleSet for candidate in candidates)
-
- p_entity = 0
- # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
- money_list = [it for it in list_entity if it.entity_type=="money"]
- for i in range(len(money_list)-1):
- for j in range(1, len(money_list)):
- if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
- Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
- money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
- # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
-
- #遍历所有实体
- while(p_entity<len(list_entity)):
- entity = list_entity[p_entity]
- '''
- #招标金额从后往前找
- if entity.entity_type=="money":
- if entity.values[entity.label]>=on_value:
- if str(entity.label)=="0":
- packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
- if packagePointer is None:
- packageName = "Project"
- else:
- packageName = packagePointer.entity_text
- addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
- '''
- ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
- if entity.entity_type=="person":
- if entity.values[entity.label]>=on_value_person:
- if str(entity.label)=="1":
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].role_name=="tenderee":
- PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
- # add pointer_person
- for _entity in list_entity:
- if dict_role_id.get(str(_entity.label))=="tenderee":
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
- _entity.pointer_person = entity
- elif str(entity.label)=="2":
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].role_name=="agency":
- PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
- # add pointer_person
- for _entity in list_entity:
- if dict_role_id.get(str(_entity.label))=="agency":
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
- _entity.pointer_person = entity
- '''
- # #金额往前找实体
- # if entity.entity_type=="money":
- # if entity.values[entity.label]>=on_value:
- # p_entity_money= p_entity
- # entity_money = list_entity[p_entity_money]
- # if len(PackageSet)>0:
- # packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
- # if packagePointer is None:
- # packageName_entity = "Project"
- # else:
- # packageName_entity = packagePointer.entity_text
- # else:
- # packageName_entity = "Project"
- # while(p_entity_money>0):
- # entity_before = list_entity[p_entity_money]
- # if entity_before.entity_type in ['org','company']:
- # if str(entity_before.label)=="1":
- # addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
- # #add pointer_money
- # entity_before.pointer_money = entity_money
- # break
- # p_entity_money -= 1
- #如果实体属于角色集合,则往后找属性
- if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
-
- p_entity += 1
- #循环查找符合的属性
- while(p_entity<len(list_entity)):
-
- entity_after = list_entity[p_entity]
- if entity_after.sentence_index-entity.sentence_index>=sentence_len:
- p_entity -= 1
- break
- #若是遇到公司实体,则跳出循环
- if entity_after.entity_type in ['org','company']:
- p_entity -= 1
- break
- if entity_after.values is not None:
- if entity_after.entity_type=="money":
- if entity_after.values[entity_after.label]>=on_value:
- '''
- #招标金额从后往前找
- if str(entity_after.label)=="0":
- packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
- if packagePointer is None:
- packageName = "Project"
- else:
- packageName = packagePointer.entity_text
- addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
- '''
- if str(entity_after.label)=="1":
- #print(entity_after.entity_text,entity.entity_text)
- _list_entitys = [entity]+entity.linked_entitys
- if len(PackageSet)>0:
- packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
- if packagePointer is None:
- packageName_entity = "Project"
- else:
- packageName_entity = packagePointer.entity_text
- else:
- packageName_entity = "Project"
- if str(entity.label) in ["2","3","4"]:
- # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
- if entity_after.notes == '单价':
- addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
- 0.5)
- entity.pointer_money = entity_after
- # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
- else:
- addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
- entity_after.values[entity_after.label])
- entity.pointer_money = entity_after
- # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
- break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
- #add pointer_money
- # entity.pointer_money = entity_after
- # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
- # if entity_after.notes!='单价':
- # break # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
- '''
- if entity_after.entity_type=="person":
- if entity_after.values[entity_after.label]>=on_value_person:
- if str(entity_after.label)=="1":
- for i in range(len(roleList)):
- if roleList[i].role_name=="tenderee":
- roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- elif str(entity_after.label)=="2":
- for i in range(len(roleList)):
- if roleList[i].role_name=="agency":
- roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- elif str(entity_after.label)=="3":
- _list_entitys = [entity]+entity.linked_entitys
- for _entity in _list_entitys:
- for i in range(len(roleList)):
- if roleList[i].entity_text==_entity.entity_text:
- if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
- break
- roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- '''
-
- p_entity += 1
-
- p_entity += 1
-
- ''''''
- # 通过模型分类的招标/代理联系人
- list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
- person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
- tenderee_contact = set()
- tenderee_phone = set()
- agency_contact = set()
- agency_phone = set()
- winter_contact = set()
- for _person in person_list:
- if _person.label == 1:
- tenderee_contact.add(_person.entity_text)
- if _person.label == 2:
- agency_contact.add(_person.entity_text)
- # 正则匹配无 '主体/联系人' 的电话
- # 例:"采购人联系方式:0833-5226788,"
- re_tenderee_phone = re.compile(
- "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
- # 电话号码
- "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
- # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
- re_tenderee_phone2 = re.compile(
- "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
- # 电话号码
- "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
- re_agent_phone = re.compile(
- "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
- # 电话号码
- "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
- re_agent_phone2 = re.compile(
- "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
- # 电话号码
- "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
- content = ""
- for _sentence in list_sentence:
- content += "".join(_sentence.tokens)
- _content = copy.deepcopy(content)
- while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content):
- content_words = list(content)
- for i in re.finditer("(.)(,)([^0-9])", content):
- content_words[i.span(2)[0]] = ""
- for i in re.finditer("([^0-9])(,)(.)", content):
- content_words[i.span(2)[0]] = ""
- content = "".join(content_words)
- content = re.sub("[::]|[\((]|[\))]", "", content)
- _tenderee_phone = re.findall(re_tenderee_phone, content)
- # 更新正则确定的角色属性
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].role_name == "tenderee":
- _tenderee_phone = re.findall(re_tenderee_phone, content)
- if _tenderee_phone:
- for _phone in _tenderee_phone:
- PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
- tenderee_phone.add(_phone)
- _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
- if _tenderee_phone2:
- for _phone in _tenderee_phone2:
- PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
- tenderee_phone.add(_phone)
- if PackDict["Project"]["roleList"][i].role_name == "agency":
- _agent_phone = re.findall(re_agent_phone, content)
- if _agent_phone:
- for _phone in _agent_phone:
- PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
- agency_phone.add(_phone)
- _agent_phone2 = re.findall(re_agent_phone2, content)
- if _agent_phone2:
- for _phone in _agent_phone2:
- PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
- agency_phone.add(_phone)
- # km配对方法
def dispatch(match_list):
    """Pair main roles with attributes through an optimal one-to-one assignment.

    A score matrix is built with one row per distinct ``main_role`` and one
    column per distinct ``attribute``; each match writes ``value + 10000``
    into its cell (presumably so that a real candidate outranks an
    unconnected cell, which stays at 0 -- confirm against the value ranges
    produced by callers).  The negated matrix is handed to
    ``linear_sum_assignment`` (the KM matching step), and assigned cells that
    are still zero, i.e. pairs that never appeared in ``match_list``, are
    dropped from the result.

    Rows and columns follow first-seen order, so ties are resolved the same
    way on every run and the result is ordered by first appearance of the
    main role.  Cell positions come from dict lookups rather than repeated
    ``list.index`` scans.

    @param match_list: iterable of objects exposing ``main_role``,
                       ``attribute`` (both hashable) and a numeric ``value``
    @return: list of ``(main_role, attribute)`` tuples chosen by the assignment
    """
    match_list = list(match_list)
    # dict.fromkeys de-duplicates while preserving insertion order.
    main_roles = list(dict.fromkeys(match.main_role for match in match_list))
    attributes = list(dict.fromkeys(match.attribute for match in match_list))
    role_pos = {role: pos for pos, role in enumerate(main_roles)}
    attr_pos = {attr: pos for pos, attr in enumerate(attributes)}

    label = np.zeros(shape=(len(main_roles), len(attributes)))
    for match in match_list:
        label[role_pos[match.main_role], attr_pos[match.attribute]] = match.value + 10000

    # linear_sum_assignment minimises cost, so negate the scores to maximise.
    cost = -label
    rows, cols = linear_sum_assignment(cost)
    # A zero cost means the assigned pair was never a candidate: skip it.
    return [(main_roles[r], attributes[c]) for r, c in zip(rows, cols) if cost[r, c]]
- # 正则提取电话号码实体
- key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
- phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
- '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
- '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
- '0[^0]\d{1,2}[-—-―]\d{7,8}转\d{1,4}|'
- '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
- '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
- '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
- '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
- '[\(|\(]0[^0]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
- '[2-9]\d{6,7}')
- phone_entitys = []
- for _sentence in list_sentence:
- sentence_text = _sentence.sentence_text
- list_tokenbegin = []
- begin = 0
- for i in range(0, len(_sentence.tokens)):
- list_tokenbegin.append(begin)
- begin += len(str(_sentence.tokens[i]))
- list_tokenbegin.append(begin + 1)
- res_set = set()
- for i in re.finditer(phone, sentence_text):
- res_set.add((i.group(), i.start(), i.end()))
- # for i in re.finditer(key_word, sentence_text):
- # res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
- for item in list(res_set):
- phone_left = sentence_text[max(0, item[1] - 10):item[1]]
- phone_right = sentence_text[item[2]:item[2] + 8]
- # 排除“传真号”和其它错误项
- if re.search("传,?真|信,?箱|邮,?箱", phone_left):
- if not re.search("电,?话", phone_left):
- continue
- if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
- continue
- if re.search("[.,]\d{2,}", phone_right):
- continue
- for j in range(len(list_tokenbegin)):
- if list_tokenbegin[j] == item[1]:
- begin_index = j
- break
- elif list_tokenbegin[j] > item[1]:
- begin_index = j - 1
- break
- for j in range(begin_index, len(list_tokenbegin)):
- if list_tokenbegin[j] >= item[2]:
- end_index = j - 1
- break
- _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
- item[2])
- phone_entitys.append(_entity)
def is_company(entity, text):
    """Decide whether a "company" entity is a real company and not an address/place.

    Rules, in order:

    1. An entity whose label is not 5 and whose score for that label is
       above 0.5 is accepted.
    2. An entity flagged ``is_tail`` is rejected.
    3. Otherwise the up-to-10 characters to the left of the entity are
       taken, commas / parentheses / colons are stripped, and the last 5
       remaining characters are inspected: if they contain "地址" or "地点"
       the entity is rejected, else accepted.

    Fixes over the previous version: the ``is_tail`` check now reads the
    ``entity`` parameter instead of an outer loop variable, and the
    punctuation pattern is a real character class (it used to match only one
    exact literal sequence, so nothing was ever stripped).

    @param entity: entity exposing ``label``, ``values``, ``is_tail`` and ``wordOffset_begin``
    @param text: text of the sentence containing the entity
    @return: True when the entity should be kept as a company, False otherwise
    """
    if entity.label != 5 and entity.values[entity.label] > 0.5:
        return True
    if entity.is_tail == True:  # noqa: E712 - keep the original strict comparison
        return False
    left_context = text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
    # Drop half- and full-width commas, parentheses and colons so they do not
    # push the "地址"/"地点" keyword out of the 5-character window.
    left_context = re.sub(r"[,,()()::]", "", left_context)[-5:]
    return re.search("地址|地点", left_context) is None
- pre_entity = []
- for ent in list_entity:
- if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \
- or (ent.entity_type=='location' and len(ent.entity_text)>5):
- pre_entity.append(ent)
- text_data,pre_data = relationExtraction_model.encode(pre_entity + phone_entitys, list_sentence)
- # print(pre_data)
- maxlen = 512
- relation_list = []
- if 0<len(text_data)<=maxlen:
- relation_list = relationExtraction_model.predict(text_data, pre_data)
- else:
- # 公告大于maxlen时,分段预测
- start = 0
- while start<len(pre_data):
- _pre_data = pre_data[start:start+maxlen]
- _text_data = text_data[start:start+maxlen]
- relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
- start = start + maxlen - 100
- # 去重结果
- relation_list = list(set(relation_list))
- # print(relation_list)
- tokens_num_dict = dict()
- last_tokens_num = 0
- for sentence in list_sentence:
- _index = sentence.sentence_index
- if _index == 0:
- tokens_num_dict[_index] = 0
- else:
- tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
- last_tokens_num = len(sentence.tokens)
- right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
- linked_company = set()
- linked_person = set()
- for predicate in ["rel_address","rel_phone","rel_person"]:
- _match_list = []
- _match_combo = []
- for relation in relation_list:
- _subject = relation[0]
- _object = relation[2]
- if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
- if relation[1]==predicate:
- if predicate=="rel_person":
- if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
- continue
- distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
- tokens_num_dict[_subject.sentence_index] + _subject.end_index)
- if distance>0:
- value = (-1 / 2 * (distance ** 2))/10000
- else:
- distance = abs(distance)
- value = (-1 / 2 * (distance ** 2))
- _match_list.append(Match(_subject,_object,value))
- _match_combo.append((_subject,_object))
- match_result = dispatch(_match_list)
- error_list = []
- for mat in list(set(_match_combo)-set(match_result)):
- for temp in match_result:
- if mat[1]==temp[1] and mat[0]!=temp[0]:
- error_list.append(mat)
- break
- result = list(set(_match_combo)-set(error_list))
- if predicate=='rel_person':
- # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接)
- result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
- for combo in result:
- is_continue = False
- if not combo[0].pointer_person:
- combo[0].pointer_person = []
- if combo[1].begin_index<combo[0].begin_index:
- if combo[0].pointer_person:
- for temp in combo[0].pointer_person:
- if temp.begin_index>combo[0].begin_index:
- is_continue = True
- break
- if is_continue: continue
- combo[0].pointer_person.append(combo[1])
- linked_company.add(combo[0])
- linked_person.add(combo[1])
- # print(1,combo[0].entity_text,combo[1].entity_text)
- if predicate=='rel_address':
- result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
- for combo in result:
- if combo[0].pointer_address:
- continue
- combo[0].pointer_address = combo[1]
- # print(2,combo[0].entity_text,combo[1].entity_text)
- if predicate=='rel_phone':
- result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
- for combo in result:
- is_continue = False
- if not combo[0].person_phone:
- combo[0].person_phone = []
- if combo[1].begin_index<combo[0].begin_index:
- if combo[0].person_phone:
- for temp in combo[0].person_phone:
- if temp.begin_index>combo[0].begin_index:
- is_continue = True
- break
- if is_continue: continue
- combo[0].person_phone.append(combo[1])
- if combo[0].label in [1,2]:
- if PackDict.get("Project"):
- for i in range(len(PackDict["Project"]["roleList"])):
- if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \
- or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'):
- PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
- break
- # print(3,combo[0].entity_text,combo[1].entity_text)
- # 更新 PackDict
- not_sure_linked = []
- for link_p in list(linked_company):
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name == "tenderee":
- if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 0:
- not_sure_linked.append(link_p)
- continue
- if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in agency_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- continue
- for _p in person_phone:
- if per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- elif PackDict[k]["roleList"][i].role_name == "agency":
- if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 1:
- not_sure_linked.append(link_p)
- continue
- if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in tenderee_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- continue
- for _p in person_phone:
- if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- else:
- if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- winter_contact.add(per.entity_text)
- continue
- for _p in person_phone:
- if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
- per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- winter_contact.add(per.entity_text)
- # 更新org/company实体label为0,1的链接
- for link_p in not_sure_linked:
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name == "tenderee":
- if link_p.label == 0:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in agency_contact and per.entity_text not in winter_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- continue
- for _p in person_phone:
- if per.entity_text not in agency_contact and _p.entity_text not in agency_phone and per.entity_text not in winter_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- elif PackDict[k]["roleList"][i].role_name == "agency":
- if link_p.label == 1:
- for per in link_p.pointer_person:
- person_phone = [phone for phone in per.person_phone] if per.person_phone else []
- if not person_phone:
- if per.entity_text not in tenderee_contact and per.entity_text not in winter_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
- continue
- for _p in person_phone:
- if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
- re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
- split_list = [0] * 16
- split_dict = {
- "一、": 1,
- "二、": 2,
- "三、": 3,
- "四、": 4,
- "五、": 5,
- "六、": 6,
- "七、": 7,
- "八、": 8,
- "九、": 9,
- "十、": 10,
- "十一、": 11,
- "十二、": 12,
- "十三、": 13,
- "十四、": 14,
- "十五、": 15
- }
- for item in re.finditer(re_split, _content):
- _index = split_dict.get(item.group()[1:])
- if not split_list[_index]:
- split_list[_index] = item.span()[0] + 1
- split_list = [i for i in split_list if i != 0]
- start = 0
- new_split_list = []
- for idx in split_list:
- new_split_list.append((start, idx))
- start = idx
- new_split_list.append((start, len(_content)))
- # 实体列表按照“公告分段”分组
- words_num_dict = dict()
- last_words_num = 0
- for sentence in list_sentence:
- _index = sentence.sentence_index
- if _index == 0:
- words_num_dict[_index] = 0
- else:
- words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
- last_words_num = len(sentence.sentence_text)
- # 公司-联系人连接(km算法)
- re_phone = re.compile('1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|'
- '0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,10}|'
- '0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|'
- '0\d{2,3}[-—-]?[1-9]\d{6,7}|'
- '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
- '[1-9]\d{6,7}')
- key_phone = re.compile("联系方式|电话|联系人|负责人")
- temporary_list2 = []
- for entity in list_entity:
- # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False:
- if entity.entity_type in ['org', 'company', 'person']:
- temporary_list2.append(entity)
- temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index))
- new_temporary_list2 = []
- for _split in new_split_list:
- temp_list = []
- for _entity in temporary_list2:
- if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
- _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
- temp_list.append(_entity)
- elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
- break
- new_temporary_list2.append(temp_list)
- # print(new_temporary_list2)
- match_list2 = []
- for split_index in range(len(new_temporary_list2)):
- split_entitys = new_temporary_list2[split_index]
- is_skip = False
- for index in range(len(split_entitys)):
- entity = split_entitys[index]
- if is_skip:
- is_skip = False
- continue
- else:
- if entity.entity_type in ['org', 'company']:
- if entity.label != 5 or entity.entity_text in roleSet:
- match_nums = 0
- for after_index in range(index + 1, min(len(split_entitys), index + 4)):
- after_entity = split_entitys[after_index]
- if after_entity.entity_type in ['person']:
- # 实体为中标人/候选人,联系人已确定类别【1,2】
- if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
- break
- if after_entity.label in [1, 2, 3]:
- distance = (tokens_num_dict[
- after_entity.sentence_index] + after_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- sentence_distance = after_entity.sentence_index - entity.sentence_index
- if sentence_distance == 0:
- if distance < 100:
- if (entity.label == 0 and after_entity.label == 1) or (
- entity.label == 1 and after_entity.label == 2):
- distance = distance / 100
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, after_entity, value))
- match_nums += 1
- else:
- if distance < 60:
- if (entity.label == 0 and after_entity.label == 1) or (
- entity.label == 1 and after_entity.label == 2):
- distance = distance / 100
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, after_entity, value))
- match_nums += 1
- if after_entity.entity_type in ['org', 'company']:
- # 解决在‘地址’中识别出org/company的问题
- # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]:
- if entity.label != 5 and after_index == index + 1 and (
- after_entity.label == entity.label or after_entity.label == 5):
- distance = (tokens_num_dict[
- after_entity.sentence_index] + after_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- if distance < 20:
- after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0,
- after_entity.begin_index - 10):after_entity.begin_index]
- after_entity_right = list_sentence[after_entity.sentence_index].tokens[
- after_entity.end_index + 1:after_entity.end_index + 6]
- after_entity_left = "".join(after_entity_left)
- if len(after_entity_left) > 20:
- after_entity_left = after_entity_left[-20:]
- after_entity_right = "".join(after_entity_right)[:10]
- if re.search("地,?址", after_entity_left):
- is_skip = True
- continue
- if re.search("\(|(", after_entity_left) and re.search("\)|)",
- after_entity_right):
- is_skip = True
- continue
- if entity.label in [0, 1] and after_entity.label in [0,
- 1] and entity.label == after_entity.label:
- break
- if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
- index + 1].entity_type == "person":
- break
- if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
- break
- if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
- break
- # 搜索没有联系人的电话
- mid_tokens = []
- is_same_sentence = False
- if index == len(split_entitys) - 1:
- for i in range(entity.sentence_index, len(list_sentence)):
- mid_tokens += list_sentence[i].tokens
- mid_tokens = mid_tokens[entity.end_index + 1:]
- mid_sentence = "".join(mid_tokens)
- have_phone = re.findall(re_phone, mid_sentence)
- if have_phone:
- if re.findall(re_phone, mid_sentence.split("。")[0]):
- is_same_sentence = True
- _phone = have_phone[0]
- phone_begin = mid_sentence.find(_phone)
- if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \
- new_split_list[split_index][1]:
- mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
- if re.search(key_phone, mid_sentence):
- distance = 1
- if is_same_sentence:
- if phone_begin <= 200:
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, (entity, _phone), value))
- match_nums += 1
- else:
- if phone_begin <= 60:
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, (entity, _phone), value))
- match_nums += 1
- else:
- next_entity = split_entitys[index + 1]
- if entity.sentence_index == next_entity.sentence_index:
- mid_tokens += list_sentence[entity.sentence_index].tokens[
- entity.end_index + 1:next_entity.begin_index]
- else:
- sentence_index = entity.sentence_index
- while sentence_index <= next_entity.sentence_index:
- mid_tokens += list_sentence[sentence_index].tokens
- sentence_index += 1
- mid_tokens = mid_tokens[entity.end_index + 1:-(len(
- list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1]
- mid_sentence = "".join(mid_tokens)
- have_phone = re.findall(re_phone, mid_sentence)
- if have_phone:
- if re.findall(re_phone, mid_sentence.split("。")[0]):
- is_same_sentence = True
- _phone = have_phone[0]
- phone_begin = mid_sentence.find(_phone)
- mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
- if re.search(key_phone, mid_sentence):
- p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
- if next_entity.entity_type == 'person' and _phone in p_phone:
- pass
- else:
- distance = (tokens_num_dict[
- next_entity.sentence_index] + next_entity.begin_index) - (
- tokens_num_dict[entity.sentence_index] + entity.end_index)
- distance = distance / 2
- if is_same_sentence:
- if phone_begin <= 200:
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, (entity, _phone), value))
- match_nums += 1
- else:
- if phone_begin <= 60:
- value = (-1 / 2 * (distance ** 2)) / 10000
- match_list2.append(Match(entity, (entity, _phone), value))
- match_nums += 1
- # 实体无匹配时,尝试前向查找匹配
- if not match_nums:
- if entity.label != 5 and entity.values[entity.label] > 0.5 and index != 0:
- previous_entity = split_entitys[index - 1]
- if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
- if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
- continue
- if previous_entity.sentence_index == entity.sentence_index:
- distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
- tokens_num_dict[
- previous_entity.sentence_index] + previous_entity.end_index)
- if distance < 20:
- # 距离相等时,前向添加处罚值
- # distance += 1
- # 前向 没有 /10000
- value = (-1 / 2 * (distance ** 2))
- match_list2.append(Match(entity, previous_entity, value))
- # print(match_list2)
- match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person]
- # print(match_list2)
- # km算法分配求解
- result2 = dispatch(match_list2)
- # print(result2)
- linked_person = []
- linked_persons_with = []
- for match in result2:
- entity = match[0]
- # print(entity.entity_text)
- # print(match.attribute)
- entity_index = list_entity.index(entity)
- is_update = False
- if isinstance(match[1], tuple):
- person_ = ''
- phone_ = [match[1][1]]
- else:
- person_ = match[1].entity_text
- phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
- for k in PackDict.keys():
- for i in range(len(PackDict[k]["roleList"])):
- if PackDict[k]["roleList"][i].role_name == "tenderee":
- if not PackDict[k]["roleList"][i].linklist:
- if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
- if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact:
- if not phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, ""))
- for p in phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, p))
- is_update = True
- elif PackDict[k]["roleList"][i].role_name == "agency":
- if not PackDict[k]["roleList"][i].linklist:
- if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact:
- if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
- if not phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, ""))
- for p in phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, p))
- is_update = True
- else:
- if PackDict[k]["roleList"][i].entity_text == entity.entity_text:
- if not PackDict[k]["roleList"][i].linklist:
- if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \
- person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0:
- if not phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, ""))
- for p in phone_:
- PackDict[k]["roleList"][i].linklist.append((person_, p))
- is_update = True
- if not person_:
- is_update = False
- if is_update:
- # 更新 list_entity
- if not list_entity[entity_index].pointer_person:
- list_entity[entity_index].pointer_person = []
- list_entity[entity_index].pointer_person.append(match[1])
- linked_person.append(match[1])
- linked_persons_with.append(entity)
- # 一个公司对应多个联系人的补充
- person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
- person_entitys = person_entitys[::-1]
- for index in range(len(person_entitys)):
- entity = person_entitys[index]
- prepare_link = []
- if entity not in linked_person:
- prepare_link.append(entity)
- last_person = entity
- for after_index in range(index + 1, min(len(person_entitys), index + 5)):
- after_entity = person_entitys[after_index]
- if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5:
- if after_entity in linked_person:
- _index = linked_person.index(after_entity)
- with_company = linked_persons_with[_index]
- for i in range(len(PackDict["Project"]["roleList"])):
- if PackDict["Project"]["roleList"][i].role_name == "tenderee":
- if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0:
- for item in prepare_link:
- person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
- for _p in person_phone:
- PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
- with_company.pointer_person.append(item)
- linked_person.append(item)
- elif PackDict["Project"]["roleList"][i].role_name == "agency":
- if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1:
- for item in prepare_link:
- person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
- for _p in person_phone:
- PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
- with_company.pointer_person.append(item)
- linked_person.append(item)
- else:
- if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text:
- for item in prepare_link:
- person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
- for _p in person_phone:
- PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
- with_company.pointer_person.append(item)
- linked_person.append(item)
- break
- else:
- prepare_link.append(after_entity)
- last_person = after_entity
- continue
- # 统一同类角色的属性
- if PackDict.get("Project"):
- for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
- for _entity in list_entity:
- if _entity.entity_type in ['org','company']:
- is_similar = False
- # entity_text相同
- if _entity.entity_text==PackDict["Project"]["roleList"][i].entity_text:
- is_similar = True
- # entity.label为【0,1】
- if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict["Project"]["roleList"][i].role_name:
- is_similar = True
- if is_similar:
- linked_entitys = _entity.linked_entitys
- if linked_entitys:
- for linked_entity in linked_entitys:
- pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else []
- for _pointer_person in pointer_person:
- _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
- for _p in _phone:
- if (_pointer_person.entity_text,_p) not in PackDict["Project"]["roleList"][i].linklist:
- PackDict["Project"]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))
- # "roleList"中联系人电话去重
- for i in range(len(PackDict["Project"]["roleList"])):
- # print(123, PackDict["Project"]["roleList"][i].linklist)
- # 带有联系人的电话
- with_person = [person_phone[1] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[0]]
- # 带有电话的联系人
- with_phone = [person_phone[0] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[1]]
- remove_list = []
- for item in PackDict["Project"]["roleList"][i].linklist:
- if not item[0]:
- if item[1] in with_person:
- # 删除重复的无联系人电话
- remove_list.append(item)
- elif not item[1]:
- if item[0] in with_phone:
- remove_list.append(item)
- for _item in remove_list:
- PackDict["Project"]["roleList"][i].linklist.remove(_item)
- # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
- # temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
- # other_person = [] # 阈值以上的联系人列表
- # link_person = [] # 有电话没联系上角色的person列表
- # other_ent = []
- # link_ent = []
- # found_person = False
- # ent_list = []
- # for entity in list_entity:
- # if entity.entity_type in ['org','company','person']:
- # ent_list.append(entity)
- # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
- # #for list_index in range(len(ent_list)):
- # #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
- # #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
- # #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
- # # 2020/11/25增加确定角色联系人判断
- # sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
- # # 招标/代理在同一句中交叉情况的处理
- # for index in range(len(ent_list)):
- # entity = ent_list[index]
- # if entity.entity_text in roleSet and entity.label in [0, 1] and index+3<len(ent_list):
- # if entity.sentence_index==ent_list[index+1].sentence_index==ent_list[index+2].sentence_index==ent_list[index+3].sentence_index:
- # if ent_list[index+1].begin_index - entity.end_index < 30:
- # if ent_list[index+1].entity_text in roleSet and ent_list[index+1].label in [0, 1] and entity.label!=ent_list[index+1].label:
- # if ent_list[index+2].entity_type=="person" and ent_list[index+3].entity_type=="person" and \
- # ent_list[index+2].label==3 and ent_list[index+3].label==3:
- # ent_list[index + 1], ent_list[index + 2] = ent_list[index + 2], ent_list[index + 1]
- #
- #
- # for index in range(len(ent_list)):
- # entity = ent_list[index]
- # if entity.entity_type=="person":
- # if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
- # continue
- # if entity.values[entity.label]>on_value_person:
- # if str(entity.label)=="1":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
- # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
- # link_person.append(entity.entity_text)
- # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
- # # add pointer_person
- # for _entity in list_entity:
- # if dict_role_id.get(str(_entity.label))=="tenderee":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
- # _entity.pointer_person = entity
- # elif str(entity.label)=="2":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name=="agency":
- # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
- # link_person.append(entity.entity_text)
- # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
- # # add pointer_person
- # for _entity in list_entity:
- # if dict_role_id.get(str(_entity.label))=="agency":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
- # _entity.pointer_person = entity
- # elif str(entity.label)=="3":
- # if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
- # continue
- # #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
- # other_person.append(entity.entity_text)
- # temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
- #
- # #if entity.entity_text in roleSet:
- # if entity.entity_text in roleSet:
- # if entity.label in [0,1]:
- # other_ent.append(entity.entity_text)
- # temp_ent_list.append((entity.entity_text, entity.label,entity))
- # for behind_index in range(index+1, len(ent_list)):
- # entity_after = ent_list[behind_index]
- # if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
- # break
- # if entity_after.values is not None:
- # if entity_after.entity_type=="person":
- # if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
- # break
- # if entity_after.values[entity_after.label]>on_value_person:
- # if str(entity_after.label)=="1":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
- # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # link_person.append(entity_after.entity_text)
- # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
- # elif str(entity_after.label)=="2":
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name=="agency":
- # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # link_person.append(entity_after.entity_text)
- # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
- # elif str(entity_after.label)=="3":
- # if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
- # break
- # elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
- # break
- # for pack in PackDict.keys():
- # for i in range(len(PackDict[pack]["roleList"])):
- # if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
- # #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
- # #break
- # PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
- # link_person.append(entity_after.entity_text)
- # #add pointer_person
- # entity.pointer_person = entity_after
- #
- # not_link_person = [person for person in other_person if person not in link_person]
- # not_link_ent = [ent for ent in other_ent if ent not in link_ent]
- # if len(not_link_person) > 0 and len(not_link_ent) > 0 :
- # item = temp_ent_list
- # for i in range(len(item)):
- # if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
- # if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
- # item[i+1], item[i+2] = item[i+2], item[i+1]
- # for i in range(len(item)-1, -1, -1):
- # if item[i][0] in not_link_ent:
- # for pack in PackDict.keys():
- # for role in PackDict[pack]["roleList"]:
- # if role.entity_text == item[i][0] and len(role.linklist) < 1:
- # for j in range(i+1, len(item)):
- # if item[j][0] in not_link_person:
- # role.linklist.append(item[j][:2])
- # #add pointer_person
- # item[i][2].pointer_person = item[j][2]
- # break
- # else:
- # break
- # # 电话没有联系人的处理
- # role_with_no_phone = []
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
- # if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人
- # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
- # else:
- # phone_nums = 0
- # for link in PackDict["Project"]["roleList"][i].linklist:
- # if link[1]:
- # phone_nums += 1
- # break
- # if not phone_nums:
- # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
- # if role_with_no_phone:
- # phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"]
- # # phone_with_person = [phone for phone in phone_with_person if phone]
- #
- # dict_index_sentence = {}
- # for _sentence in list_sentence:
- # dict_index_sentence[_sentence.sentence_index] = _sentence
- # new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
- # for index in range(len(new_entity_list)):
- # entity = new_entity_list[index]
- # if entity.entity_text in role_with_no_phone:
- # e_sentence = dict_index_sentence[entity.sentence_index]
- # entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40]
- # entity_right = "".join(entity_right)
- # if index+1<len(new_entity_list) and entity_right.find(new_entity_list[index+1].entity_text)>-1:
- # entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)]
- # have_phone = re.findall(phone,entity_right)
- # if have_phone:
- # _phone = have_phone[0]
- # phone_begin = entity_right.find(_phone)
- # if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]):
- # # entity.person_phone = _phone
- # for i in range(len(PackDict["Project"]["roleList"])):
- # if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text:
- # PackDict["Project"]["roleList"][i].linklist.append(('', _phone))
-
- #寻找多标段招标金额
- p_entity = len(list_entity)-1
- set_tenderer_money = set()
- list_tenderer_money = [] #2021/7/16 新增列表,倒序保存所有中标金额
- unit_list = [] #2021/8/17 新增,保存金额单位
- #遍历所有实体
- while(p_entity>=0):
- entity = list_entity[p_entity]
- if entity.entity_type=="money":
- if entity.values[entity.label]>=on_value:
- if str(entity.label)=="1":
- set_tenderer_money.add(float(entity.entity_text))
- list_tenderer_money.append(float(entity.entity_text)) # 2021/7/16 新增列表,倒序保存所有中标金额
- unit_list.append(entity.money_unit)
- # if str(entity.label)=="0":
- if str(entity.label)=="0" and entity.notes!='总投资':
- '''
- if p_entity>0:
- p_before = list_entity[p_entity-1]
- if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
- p_entity -= 1
- continue
- '''
- packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
- if packagePointer is None:
- packageName = "Project"
- else:
- packageName = packagePointer.entity_text
-
- if packageName=="Project":
- # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
- # PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
- if entity.values[entity.label]>on_value:
- PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
- PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
- else:
- PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
- PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
- #add pointer_tendereeMoney
- packagePointer.pointer_tendereeMoney = entity
- p_entity -= 1
-
-
- #删除一个机构有多个角色的数据
- #删除重复人、概率不回传
- final_roleList = []
- list_pop = []
- set_tenderer_role = set()
- dict_pack_tenderer_money = dict()
- for pack in PackDict.keys():
- #删除无效包
- if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
- list_pop.append(pack)
- for i in range(len(PackDict[pack]["roleList"])):
- if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
- if PackDict[pack]["roleList"][i].money==0:
- set_tenderer_role.add(PackDict[pack]["roleList"][i])
- dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
- #找到包的中投标金额
- for _index in range(len(PackageList)):
- if "hit" in PackageList[_index]:
- for _hit in list(PackageList[_index]["hit"]):
- _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
- if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
- dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
- #只找到一个中标人和中标金额
- if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
- list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
- list(set_tenderer_role)[0].money_unit = unit_list[0]
- # print('一个中标人一个金额:', list(set_tenderer_money)[0])
- #找到一个中标人和多个招标金额
- if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
- _maxMoney = 0
- _sumMoney = 0
- for _m in list(set_tenderer_money):
- _sumMoney += _m
- if _m>_maxMoney:
- _maxMoney = _m
- if _sumMoney/_maxMoney==2:
- list(set_tenderer_role)[0].money = _maxMoney
- # print('一人多金额分项合计 取最大金额:', _maxMoney)
- else:
- # list(set_tenderer_role)[0].money = _maxMoney
- if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
- list(set_tenderer_role)[0].money = min(list_tenderer_money)
- list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))]
- # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money))
- else:
- list(set_tenderer_role)[0].money = list_tenderer_money[-1] # 2021/7/16 修改 不是单价合计方式取第一个中标金额
- list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位
- # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1])
- #每个包都只找到一个金额
- _flag_pack_money = True
- for k,v in dict_pack_tenderer_money.items():
- if len(v[1])!=1:
- _flag_pack_money = False
- if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
- for k,v in dict_pack_tenderer_money.items():
- v[0].money = list(v[1])[0]
- # print('k,v in dict_pack_tenderer_money.items', k, v)
- # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
- for pack in PackDict.keys():
- for i in range(len(PackDict[pack]["roleList"])):
- if PackDict[pack]["tendereeMoney"] > 0:
- # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
- if float(PackDict[pack]["roleList"][i].money) >10000000 and \
- float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
- PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
- # print('招标金额校正中标金额')
- # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
- for pack in PackDict.keys():
- tmp_moneys = []
- for i in range(len(PackDict[pack]["roleList"])):
- if float(PackDict[pack]["roleList"][i].money) >100000:
- tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
- if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
- for i in range(len(PackDict[pack]["roleList"])):
- if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
- PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
- # print('通过其他中标人投标金额校正中标金额')
- for pack in PackDict.keys():
- for i in range(len(PackDict[pack]["roleList"])):
- PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
- for item in list_pop:
- PackDict.pop(item)
-
- return PackDict
def initPackageAttr(RoleList, PackageSet):
    """Build the initial per-package result structure.

    Args:
        RoleList: iterable of role records; each one is read for
            ``packageName``, ``packageCode``, ``role_name`` and
            ``entity_text``.
        PackageSet: iterable of package names found in the article.

    Returns:
        dict keyed by package name (always including ``"Project"``); every
        value holds ``code``, ``tendereeMoney``, ``roleList`` and
        ``tendereeMoneyUnit``.
    """
    def _blank_package():
        # A fresh dict per package so the role lists are never shared.
        return {"code": "", "tendereeMoney": 0, "roleList": [], 'tendereeMoneyUnit': ''}

    packages = {"Project": _blank_package()}
    packages.update((name, _blank_package()) for name in list(PackageSet))

    for role in RoleList:
        slot = packages[role.packageName]
        # The first role that reaches a package supplies its code.
        if slot["code"] == "":
            slot["code"] = role.packageCode
        # Role(role name, entity name, role threshold, money, money threshold, link list)
        slot["roleList"].append(Role(role.role_name, role.entity_text, 0, 0, 0.0, []))
    return packages
-
def getPackageRoleMoney(list_sentence, list_entity):
    """Extract package / role / money / contact information for one article.

    Args:
        list_sentence: sentence list of the article.
        list_entity: entity list of the article.

    Returns:
        ``[]`` when ``getRoleList`` yields a falsy result; otherwise the
        package dict created by ``initPackageAttr`` and then processed by
        ``findAttributeAfterEntity``.
    """
    role_info = getRoleList(list_sentence, list_entity)
    if not role_info:
        return []

    role_list, role_set, package_list, package_set = role_info
    pack_dict = initPackageAttr(role_list, package_set)
    return findAttributeAfterEntity(
        pack_dict, role_set, package_list, package_set, list_entity, list_sentence
    )
# Exact-match lookup from every recognised raw spelling to its canonical
# bid-way category.
_BIDWAY_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        ("邀请招标", ("邀请招标", "采购方式:邀请")),
        ("询价", ("询价", "询单", "询比", "采购方式:询价")),
        ("竞争性谈判", ("竞谈", "竞争性谈判", "公开竞谈")),
        ("竞争性磋商", ("竞争性磋商", "磋商")),
        ("竞价", ("竞价", "竞标", "电子竞价", "以电子竞价", "电子书面竞投")),
        ("公开招标", ("公开招标", "网上电子投标", "网上招标", "采购方式:公开", "招标为其他")),
        # One-element tuples need the trailing comma: ("单一来源") is just a
        # string, and `x in "单一来源"` is a substring test.
        ("单一来源", ("单一来源",)),
        ("比选", ("比选",)),
    )
    for alias in aliases
}


def turnBidWay(bidway):
    """Normalise a raw bid-way string to its canonical category name.

    Args:
        bidway: raw bid-way text.

    Returns:
        One of "邀请招标", "询价", "竞争性谈判", "竞争性磋商", "竞价", "公开招标",
        "单一来源", "比选"; "其他" for anything that is not an exact match
        (including the empty string and ``None``).
    """
    if isinstance(bidway, str):
        # Accept the full-width colon spelling as well as the ASCII one.
        bidway = bidway.replace("\uff1a", ":")
    return _BIDWAY_ALIASES.get(bidway, "其他")
def getOtherAttributes(list_entity):
    """Collect article-level attributes that are not tied to a package.

    Args:
        list_entity: entities of one article; each is read for
            ``entity_type``, ``entity_text``, ``label`` and, for money
            entities, ``notes`` and ``money_unit``.

    Returns:
        dict with ``moneysource``, ``person_review``, ``time_release``,
        ``time_bidopen``, ``time_bidclose``, ``serviceTime``, ``product``,
        ``total_tendereeMoney`` and ``total_tendereeMoneyUnit``. A
        ``bidway`` key is present only when a 'bidway' entity was seen.
        ``product`` is de-duplicated in first-seen order.
    """
    dict_other = {"moneysource": "",
                  "person_review": [],
                  "time_release": "",
                  "time_bidopen": "",
                  "time_bidclose": "",
                  "serviceTime": "",
                  "product": [],
                  "total_tendereeMoney": 0,
                  "total_tendereeMoneyUnit": ''}
    for entity in list_entity:
        entity_type = entity.entity_type
        if entity_type == 'bidway':
            dict_other["bidway"] = turnBidWay(entity.entity_text)
        elif entity_type == 'moneysource':
            dict_other["moneysource"] = entity.entity_text
        elif entity_type == 'serviceTime':
            dict_other["serviceTime"] = entity.entity_text
        elif entity_type == 'time' and entity.label == 1:
            dict_other["time_release"] = timeFormat(entity.entity_text)
        elif entity_type == 'time' and entity.label == 2:
            dict_other["time_bidopen"] = timeFormat(entity.entity_text)
        elif entity_type == 'time' and entity.label == 3:
            dict_other["time_bidclose"] = timeFormat(entity.entity_text)
        elif entity_type == "person" and entity.label == 4:
            dict_other["person_review"].append(entity.entity_text)
        elif entity_type == 'product':
            dict_other["product"].append(entity.entity_text)
        elif entity_type == 'money' and entity.notes == '总投资':
            # Keep the largest amount flagged with the '总投资' note.
            amount = float(entity.entity_text)
            if dict_other["total_tendereeMoney"] < amount:
                dict_other["total_tendereeMoney"] = amount
                dict_other["total_tendereeMoneyUnit"] = entity.money_unit
    # dict.fromkeys removes duplicates while keeping first-seen order;
    # list(set(...)) produced an order that changed between interpreter runs.
    dict_other["product"] = list(dict.fromkeys(dict_other["product"]))
    return dict_other
def getMoneyRange(RoleList):
    """Unimplemented placeholder: ignores ``RoleList`` and returns ``None``."""
    return None
def getPREMs(list_sentences, list_entitys, list_articles):
    """Build one result record per article.

    Args:
        list_sentences: per-article sentence lists.
        list_entitys: per-article entity lists.
        list_articles: article objects; each is read for ``doc_id``,
            ``fingerprint``, ``match_enterprise``, ``match_enterprise_type``,
            ``attachmentTypes`` and ``bidway``.

    Returns:
        list of dicts, one per article (the three inputs are zipped, so the
        shortest one decides the length). Each dict holds ``prem`` (result
        of ``getPackageRoleMoney``), ``docid``, every key returned by
        ``getOtherAttributes``, and the article-level fields listed above
        plus ``process_time``.
    """
    result = []
    for list_sentence, list_entity, list_article in zip(list_sentences, list_entitys, list_articles):
        record = {"prem": getPackageRoleMoney(list_sentence, list_entity),
                  "docid": list_article.doc_id}
        record.update(getOtherAttributes(list_entity))
        # Merged with update() rather than dict(..., **a, **b): the latter
        # raises TypeError when getOtherAttributes also returns "bidway".
        # The article-level value below takes precedence in that case.
        record.update({"fingerprint": list_article.fingerprint,
                       "match_enterprise": list_article.match_enterprise,
                       "match_enterprise_type": list_article.match_enterprise_type,
                       "process_time": getCurrent_date(),
                       "attachmentTypes": list_article.attachmentTypes,
                       "bidway": list_article.bidway})
        result.append(record)
    return result
if __name__ == "__main__":
    # Running this module as a script does nothing. The legacy evaluation
    # driver below had been disabled by wrapping it in a string literal; it
    # is preserved here as comments for reference only.
    #
    # conn = getConnection()
    # cursor = conn.cursor()
    # #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
    # sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
    #
    # result = []
    #
    # cursor.execute(sql)
    # rows = cursor.fetchall()
    # count = 0
    # for row in rows:
    #
    #     count += 1
    #     # print(count)
    #     doc_id = row[0]
    #
    #     roleList = getPackageRoleMoney(doc_id)
    #     result.append([doc_id,str(roleList),row[1]])
    #
    # with codecs.open("getAttribute.html","w",encoding="utf8") as f:
    #     f.write('<html><head>\
    #     <meta http-equiv="Content-Type"\
    #     content="text/html; charset=UTF-8">\
    #     </head>\
    #     <body bgcolor="#FFFFFF">\
    #     <table border="1">\
    #     <tr>\
    #     <td>doc_id</td>\
    #     <td>角色</td>\
    #     </tr>')
    #     for item in result:
    #         f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
    #     f.write("</table></body>")
    pass
|