getAttributes.py 168 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914
  1. from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
  2. from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
  3. from decimal import Decimal
  4. import re
  5. import copy
  6. import math
  7. import pandas as pd
  8. import os
  9. from scipy.optimize import linear_sum_assignment
  10. from BiddingKG.dl.interface.Entitys import Match
  11. import numpy as np
  12. def getTheRole(entity,role_list):
  13. '''
  14. @summary:根据实体名称拿到index
  15. @param:
  16. entity:实体名称
  17. role_list:角色list
  18. @return:该实体所在下标
  19. '''
  20. for role_index in range(len(role_list)):
  21. if entity in role_list[role_index]:
  22. return role_index
  23. return None
# Role label id (as a string) -> role name.
# Looked up with dict_role_id.get(str(label)), so unknown labels yield None.
dict_role_id = {"0":"tenderee",
                "1":"agency",
                "2":"win_tenderer",
                "3":"second_tenderer",
                "4":"third_tenderer"}
  29. def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
  30. '''
  31. @param:
  32. packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
  33. sentence_index:实体所在的句子
  34. begin_index:实体所在句子的起始位置
  35. @return:公司实体所属的包
  36. @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
  37. '''
  38. '''
  39. if len(packageList)==0:
  40. return None
  41. before_index = None
  42. after_index = None
  43. equal_index = None
  44. equal_count = 0
  45. for pack_index in range(len(packageList)):
  46. if packageList[pack_index][1]>sentence_index and after_index is None:
  47. after_index = pack_index
  48. if packageList[pack_index][1]<sentence_index:
  49. before_index = pack_index
  50. if packageList[pack_index][1]==sentence_index and equal_index is None:
  51. equal_index = pack_index
  52. #当前句子和之前句子未找到包
  53. if before_index is None and equal_index is None:
  54. return None
  55. else:
  56. if after_index is None:
  57. end_index = len(packageList)
  58. else:
  59. end_index = after_index
  60. #只在当前句子找到一个包号
  61. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  62. return packageList[end_index-1][0]
  63. else:
  64. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  65. if packageList[i][2]>int(begin_index):
  66. if packageList[i-1][4]:
  67. return packageList[i-1][0]
  68. else:
  69. if packageList[i][4]:
  70. return packageList[i-1][0]
  71. else:
  72. return packageList[i][0]
  73. return packageList[end_index-1][0]
  74. '''
  75. if len(packageList)==0:
  76. return None,False
  77. list_legalPack = []
  78. for pack_index in range(len(packageList)):
  79. if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
  80. continue
  81. if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
  82. continue
  83. if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index)) and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
  84. if MAX_DIS is not None:
  85. if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
  86. list_legalPack.append(pack_index)
  87. else:
  88. list_legalPack.append(pack_index)
  89. # if (packageList[pack_index]["scope"][0][0] < sentence_index
  90. # or (packageList[pack_index]["scope"][0][0] == sentence_index
  91. # and packageList[pack_index]["scope"][0][1] <= begin_index))
  92. # and (packageList[pack_index]["scope"][1][0] > sentence_index
  93. # or (packageList[pack_index]["scope"][1][0] == sentence_index
  94. # and packageList[pack_index]["scope"][1][1] >= begin_index)):
  95. # pass
  96. _flag = True
  97. for _index in list_legalPack:
  98. if roleid in packageList[_index]["hit"]:
  99. continue
  100. else:
  101. _flag = False
  102. packageList[_index]["hit"].add(roleid)
  103. return packageList[_index]["pointer"],_flag
  104. if len(list_legalPack)>0:
  105. return packageList[0]["pointer"],_flag
  106. return None,False
# Generate the legal (non-conflicting) role combinations.
def get_legal_comba(list_entity,dict_role_combination):
    '''
    Enumerate candidate role assignments per package and combine them across packages.

    @param:
        list_entity: entities of the article; only forwarded to get_dict_entity_prob
        dict_role_combination: packageName -> {role: collection of entity texts};
            the empty string "" is treated as "no entity for this role"
    @return: list of dicts keyed by "<packageName>$$<role>" whose values are entity texts

    NOTE(review): this block was re-indented from a view of the file that had
    lost its indentation. The nesting of the "2021/5/25" else-branch inside
    circle_package is the most plausible reading - confirm against the
    repository copy before relying on it.
    '''
    # Collect every legal combination inside one package.
    def circle_package(_dict_legal_combination):
        list_dict_role_first = []
        for _role in _dict_legal_combination:
            if len(list_dict_role_first)==0:
                # Nothing collected yet: seed with one single-role dict per non-empty entity.
                for _entity in _dict_legal_combination[_role]:
                    if _entity !="":
                        list_dict_role_first.append({_role:_entity})
            else:
                list_dict_role_after = []
                _find_count = 0
                for _entity in _dict_legal_combination[_role]:
                    if _entity !="":
                        for _dict in list_dict_role_first:
                            _flag = True
                            # The last key whose value equals the entity decides _flag.
                            for _key1 in _dict:
                                if _entity==_dict[_key1]:
                                    # Tenderee ("0") and agency ("1") may be the same entity.
                                    if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
                                        _flag = True
                                    else:
                                        _flag = False
                            if _flag:
                                _find_count += 1
                                _new_dict = copy.copy(_dict)
                                _new_dict[_role] = _entity
                                # Hard cap on the number of generated combinations.
                                if len(list_dict_role_after)>100000:
                                    break
                                list_dict_role_after.append(_new_dict)
                            else:
                                # 2021/5/25 update: same entity (entity_text) with a different role.
                                if len(list_dict_role_after) > 100000:
                                    break
                                # NOTE(review): _new_dict is built but the single-role dict
                                # {_role:_entity} is what gets appended - confirm intent.
                                for _dict in list_dict_role_first:
                                    for _key1 in _dict:
                                        if _entity == _dict[_key1]:
                                            _new_dict = copy.copy(_dict)
                                            _new_dict.pop(_key1)
                                            _new_dict[_role] = _entity
                                            list_dict_role_after.append({_role:_entity})
                if len(list_dict_role_after)==0:
                    pass
                else:
                    list_dict_role_first.extend(list_dict_role_after)
        return list_dict_role_first
    # Recursive enumeration for one package; its only call site below is commented out.
    def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
        last_layer = False
        # An empty combination yields an empty result.
        if len(_dict_legal_combination.keys())==0:
            return []
        # Only one role left: this is the last recursion layer.
        if len(_dict_legal_combination.keys())==1:
            last_layer = True
        # Take one role and iterate over its candidates.
        _key_role = list(_dict_legal_combination.keys())[0]
        for item in _dict_legal_combination[_key_role]:
            copy_dict_one_selution = copy.copy(dict_one_selution)
            copy_dict_legal_combination = {}
            copy_set_legal_entity = copy.copy(set_legal_entity)
            # Copy all remaining roles for the next recursion round.
            for _key in _dict_legal_combination.keys():
                if _key!=_key_role:
                    copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
            # Tenderee and agency may be the same entity.
            if item !="":
                _flag = True
                if str(_key_role) in ["0","1"]:
                    for _key_flag in copy_dict_one_selution:
                        if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
                            _flag = False
                else:
                    for _key_flag in copy_dict_one_selution:
                        if copy_dict_one_selution[_key_flag]==item:
                            _flag = False
                if _flag:
                    copy_dict_one_selution[_key_role] = item
            '''
            if item not in copy_set_legal_entity:
                if item !="":
                    copy_dict_one_selution[_key_role] = item
            '''
            copy_set_legal_entity.add(item)
            if last_layer:
                list_all_selution.append(copy_dict_one_selution)
            else:
                recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
    # Recursively combine the results of all packages; its only call site below is commented out.
    def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
        last_layer = False
        if len(_dict_legal_combination.keys())==0:
            return []
        if len(_dict_legal_combination.keys())==1:
            last_layer = True
        _key_pack = list(_dict_legal_combination.keys())[0]
        for item in _dict_legal_combination[_key_pack]:
            copy_dict_one_selution = copy.copy(dict_one_selution)
            copy_dict_legal_combination = {}
            for _key in _dict_legal_combination.keys():
                if _key!=_key_pack:
                    copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
            for _key_role in item.keys():
                copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
            if last_layer:
                list_all_selution.append(copy_dict_one_selution)
            else:
                recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
        return list_all_selution
    # Iteratively build the cross product of the per-package combinations.
    def circle_pageages(_dict_legal_combination):
        list_all_selution = []
        for _key_pack in _dict_legal_combination.keys():
            # Re-key this package's combinations as "<package>$$<role>".
            list_key_selution = []
            for item in _dict_legal_combination[_key_pack]:
                _dict = dict()
                for _key_role in item.keys():
                    _dict[_key_pack+"$$"+_key_role] = item[_key_role]
                list_key_selution.append(_dict)
            if len(list_all_selution)==0:
                list_all_selution = list_key_selution
            else:
                # Merge every accumulated combination with every combination of this package.
                _list_all_selution = []
                for item_1 in list_all_selution:
                    for item_2 in list_key_selution:
                        _list_all_selution.append(dict(item_1,**item_2))
                list_all_selution = _list_all_selution
        return list_all_selution
    # Compute the combinations of every package.
    _dict_legal_combination = {}
    for packageName in dict_role_combination.keys():
        _list_all_selution = []
        # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
        _list_all_selution = circle_package(dict_role_combination[packageName])
        '''
        # print("===1")
        # print(packageName)
        for item in _list_all_selution:
            # print(item)
        # print("===2")
        '''
        # Remove combinations that are strict subsets of another combination.
        list_all_selution_simple = []
        _list_set_all_selution = []
        for item_selution in _list_all_selution:
            item_set_selution = set()
            for _key in item_selution.keys():
                item_set_selution.add((_key,item_selution[_key]))
            _list_set_all_selution.append(item_set_selution)
        # With more than 1000 combinations the pairwise subset pruning is skipped.
        if len(_list_set_all_selution)>1000:
            _dict_legal_combination[packageName] = _list_all_selution
            continue
        for i in range(len(_list_set_all_selution)):
            be_included = False
            for j in range(len(_list_set_all_selution)):
                if i!=j:
                    # i is fully contained in j and the two differ in size.
                    if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
                        be_included = True
            if not be_included:
                list_all_selution_simple.append(_list_all_selution[i])
        _dict_legal_combination[packageName] = list_all_selution_simple
    _list_final_comba = []
    # Combine the per-package results with each other.
    _comba_count = 1
    for _key in _dict_legal_combination.keys():
        _comba_count *= len(_dict_legal_combination[_key])
    # If the product is too large, keep only the highest-expectation combination of each package.
    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    if _comba_count>250:
        new_dict_legal_combination = dict()
        for _key_pack in _dict_legal_combination.keys():
            MAX_PROB = -1000
            _MAX_PROB_COMBA = None
            for item in _dict_legal_combination[_key_pack]:
                # print(_key_pack,item)
                _dict = dict()
                for _key in item.keys():
                    _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
                if _prob>MAX_PROB:
                    MAX_PROB = _prob
                    _MAX_PROB_COMBA = [item]
            if _MAX_PROB_COMBA is not None:
                new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
        _dict_legal_combination = new_dict_legal_combination
    #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
    _list_final_comba = circle_pageages(_dict_legal_combination)
    # Apart from the "Project" package (tenderee and agency), packages do not conflict with each other.
    # If an entity appears in both the "Project" package and another package, the combination is split.
    _list_real_comba = []
    for dict_item in _list_final_comba:
        set_project = set()
        set_other = set()
        for _key in list(dict_item.keys()):
            if _key.split("$$")[0]=="Project":
                set_project.add(dict_item[_key])
            else:
                set_other.add(dict_item[_key])
        set_common = set_project&set_other
        if len(set_common)>0:
            # One variant keeps the shared entity only under "Project" keys,
            # the other keeps it only under non-"Project" keys.
            dict_project = {}
            dict_not_project = {}
            for _key in list(dict_item.keys()):
                if dict_item[_key] in set_common:
                    if str(_key.split("$$")[0])=="Project":
                        dict_project[_key] = dict_item[_key]
                    else:
                        dict_not_project[_key] = dict_item[_key]
                else:
                    dict_project[_key] = dict_item[_key]
                    dict_not_project[_key] = dict_item[_key]
            _list_real_comba.append(dict_project)
            _list_real_comba.append(dict_not_project)
        else:
            _list_real_comba.append(dict_item)
    return _list_real_comba
  323. def get_dict_entity_prob(list_entity,on_value=0.5):
  324. dict_pack_entity_prob = {}
  325. for entity in list_entity:
  326. if entity.entity_type in ['org','company']:
  327. values = entity.values
  328. role_prob = float(values[int(entity.label)])
  329. _key = entity.packageName+"$$"+str(entity.label)
  330. if role_prob>=on_value and str(entity.label)!="5":
  331. _key_prob = _key+"$text$"+entity.entity_text
  332. if _key_prob in dict_pack_entity_prob:
  333. if role_prob>dict_pack_entity_prob[_key_prob][1]:
  334. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  335. else:
  336. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  337. return dict_pack_entity_prob
  338. #计算合计期望
  339. def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
  340. '''
  341. expect = 0
  342. for entity in list_entity:
  343. if entity.entity_type in ['org','company']:
  344. values = entity.values
  345. role_prob = float(values[int(entity.label)])
  346. _key = entity.packageName+"$$"+str(entity.label)
  347. if role_prob>on_value and str(entity.label)!="5":
  348. if _key in combination.keys() and combination[_key]==entity.entity_text:
  349. expect += math.pow(role_prob,4)
  350. else:
  351. expect -= math.pow(role_prob,4)
  352. '''
  353. #修改为同一个实体只取对应包-角色的最大的概率值
  354. expect = 0
  355. dict_entity_prob = {}
  356. for _key_pack_entity in dict_pack_entity_prob:
  357. _key_pack = _key_pack_entity.split("$text$")[0]
  358. role_prob = dict_pack_entity_prob[_key_pack_entity][1]
  359. if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
  360. if _key_pack_entity in dict_entity_prob.keys():
  361. if dict_entity_prob[_key_pack_entity]<role_prob:
  362. dict_entity_prob[_key_pack_entity] = role_prob
  363. else:
  364. dict_entity_prob[_key_pack_entity] = role_prob
  365. else:
  366. if _key_pack_entity in dict_entity_prob.keys():
  367. if dict_entity_prob[_key_pack_entity]>-role_prob:
  368. dict_entity_prob[_key_pack_entity] = -role_prob
  369. else:
  370. dict_entity_prob[_key_pack_entity] = -role_prob
  371. # for entity in list_entity:
  372. # if entity.entity_type in ['org','company']:
  373. # values = entity.values
  374. # role_prob = float(values[int(entity.label)])
  375. # _key = entity.packageName+"$$"+str(entity.label)
  376. # if role_prob>=on_value and str(entity.label)!="5":
  377. # if _key in combination.keys() and combination[_key]==entity.entity_text:
  378. # _key_prob = _key+entity.entity_text
  379. # if _key_prob in dict_entity_prob.keys():
  380. # if dict_entity_prob[_key_prob]<role_prob:
  381. # dict_entity_prob[_key_prob] = role_prob
  382. # else:
  383. # dict_entity_prob[_key_prob] = role_prob
  384. # else:
  385. # _key_prob = _key+entity.entity_text
  386. # if _key_prob in dict_entity_prob.keys():
  387. # if dict_entity_prob[_key_prob]>-role_prob:
  388. # dict_entity_prob[_key_prob] = -role_prob
  389. # else:
  390. # dict_entity_prob[_key_prob] = -role_prob
  391. for _key in dict_entity_prob.keys():
  392. symbol = 1 if dict_entity_prob[_key]>0 else -1
  393. expect += symbol*math.pow(dict_entity_prob[_key],2)
  394. return expect
  395. def getRoleList(list_sentence,list_entity,on_value = 0.5):
  396. '''
  397. @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回
  398. @param:
  399. list_sentence:文章所有的sentence
  400. list_entity:文章所有的实体
  401. on_value:概率阈值
  402. @return:文章的角色list
  403. '''
  404. pack = getPackagesFromArticle(list_sentence,list_entity)
  405. if pack is None:
  406. return None
  407. PackageList,PackageSet,dict_PackageCode = pack
  408. #拿到所有可能的情况
  409. dict_role_combination = {}
  410. # print(PackageList)
  411. #拿到各个实体的packageName,packageCode
  412. for entity in list_entity:
  413. if entity.entity_type in ['org','company']:
  414. #限制附件里角色values[label]最大概率prob
  415. max_prob = 0.85
  416. if str(entity.label)!="5" and entity.in_attachment:
  417. if entity.values[entity.label]>max_prob:
  418. entity.values[entity.label] = max_prob
  419. #过滤掉字数小于3个的实体
  420. if len(entity.entity_text)<=3:
  421. continue
  422. values = entity.values
  423. role_prob = float(values[int(entity.label)])
  424. if role_prob>=on_value and str(entity.label)!="5":
  425. if str(entity.label) in ["0","1"]:
  426. packageName = "Project"
  427. else:
  428. if len(PackageSet)>0:
  429. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"role-"+str(entity.label))
  430. if packagePointer is None:
  431. #continue
  432. packageName = "Project"
  433. # print(entity.entity_text, packageName,entity.sentence_index,entity.begin_index)
  434. else:
  435. #add pointer_pack
  436. entity.pointer_pack = packagePointer
  437. packageName = packagePointer.entity_text
  438. # print(entity.entity_text, packageName)
  439. else:
  440. packageName = "Project"
  441. find_flag = False
  442. if packageName in dict_PackageCode.keys():
  443. packageCode = dict_PackageCode[packageName]
  444. else:
  445. packageCode = ""
  446. entity.packageCode = packageCode
  447. role_name = dict_role_id.get(str(entity.label))
  448. entity.roleName = role_name
  449. entity.packageName = packageName
  450. if entity.packageName in dict_role_combination.keys():
  451. if str(entity.label) in dict_role_combination[entity.packageName].keys():
  452. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  453. else:
  454. dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
  455. else:
  456. dict_role_combination[entity.packageName] = {}
  457. #初始化空值
  458. roleIds = [0,1,2,3,4]
  459. for _roleId in roleIds:
  460. dict_role_combination[entity.packageName][str(_roleId)] = set([""])
  461. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  462. list_real_comba = get_legal_comba(list_entity,dict_role_combination)
  463. # print("===role_combination",dict_role_combination)
  464. # print("== real_comba",list_real_comba)
  465. #拿到最大期望值的组合
  466. max_index = 0
  467. max_expect = -100
  468. _index = 0
  469. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  470. for item_combination in list_real_comba:
  471. expect = getSumExpectation(dict_pack_entity_prob, item_combination)
  472. if expect>max_expect:
  473. max_index = _index
  474. max_expect = expect
  475. _index += 1
  476. RoleList = []
  477. RoleSet = set()
  478. if len(list_real_comba)>0:
  479. for _key in list_real_comba[max_index].keys():
  480. packageName = _key.split("$$")[0]
  481. label = _key.split("$$")[1]
  482. role_name = dict_role_id.get(str(label))
  483. entity_text = list_real_comba[max_index][_key]
  484. if packageName in dict_PackageCode.keys():
  485. packagecode = dict_PackageCode.get(packageName)
  486. else:
  487. packagecode = ""
  488. RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
  489. RoleSet.add(entity_text)
  490. #根据最优树来修正list_entity中角色对包的连接
  491. for _entity in list_entity:
  492. if _entity.pointer_pack is not None:
  493. _pack_name = _entity.pointer_pack.entity_text
  494. _find_flag = False
  495. for _prem in RoleList:
  496. if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
  497. _find_flag = True
  498. if not _find_flag:
  499. _entity.pointer_pack = None
  500. return RoleList,RoleSet,PackageList,PackageSet
  501. def getPackageScopePattern():
  502. '''
  503. @summary: 获取包的作用域关键词
  504. '''
  505. df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
  506. pattern = "("
  507. for item in df["list_word"]:
  508. item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
  509. pattern += item+"|"
  510. pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
  511. return pattern
# Package-scope pattern string, built once at module import by getPackageScopePattern().
pattern_packageScope = getPackageScopePattern()
  513. def getPackagesFromArticle(list_sentence,list_entity):
  514. '''
  515. @param:
  516. list_sentence:文章的句子list
  517. @summary: 将包的信息插入list_entity中
  518. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  519. '''
  520. if len(list_sentence)==0:
  521. return None
  522. list_sentence.sort(key=lambda x:x.sentence_index)
  523. PackageList = []
  524. PackageList_scope = []
  525. PackageSet = set()
  526. dict_packageCode = dict()
  527. package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
  528. package_N_name_pattern = re.compile("(([^承]|^)分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
  529. package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
  530. # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
  531. other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
  532. win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整
  533. model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整
  534. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
  535. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
  536. # 纯数字类型的包号统一,例如:'01','1'
  537. re_digital = re.compile("^\d+$")
  538. def changeIndexFromWordToWords(tokens,word_index):
  539. '''
  540. @summary:转换某个字的字偏移为词偏移
  541. '''
  542. before_index = 0
  543. after_index = 0
  544. for i in range(len(tokens)):
  545. after_index = after_index+len(tokens[i])
  546. if before_index<=word_index and after_index>=word_index:
  547. return i
  548. before_index = after_index
  549. package_names = []
  550. def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
  551. '''
  552. @summary:抽取包附近的标段号
  553. @param:
  554. tokens:包所在句子的分词
  555. word_index:包所在字偏移
  556. size:左右各取多少个词
  557. pattern:提取标段号的正则
  558. @return: type:string,meaning:标段号
  559. '''
  560. index = changeIndexFromWordToWords(tokens,word_index)
  561. if index<size:
  562. begin = index
  563. else:
  564. begin = index-size
  565. if index+size>len(tokens):
  566. end = len(tokens)
  567. else:
  568. end = index+size
  569. #拿到左右两边的词语组成短语
  570. text = "".join(tokens[begin:end])
  571. #在短语中的字偏移
  572. new_word_index = word_index-len("".join(tokens[:begin]))
  573. min_distance = len(text)
  574. packageCode = None
  575. for the_iter in re.finditer(pattern,text):
  576. #算出最小距离
  577. distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
  578. if distance<min_distance:
  579. min_distance = distance
  580. packageCode = the_iter.group(1)
  581. return packageCode
  582. #从标段介绍表格中提取包名和包号
  583. for i in range(len(list_sentence)):
  584. content = list_sentence[i].sentence_text
  585. names = re.findall(package_name_pattern,content)
  586. if names == []:
  587. names = re.findall(other_package_pattern, content)
  588. N_names = re.findall(package_N_name_pattern,content)
  589. if len(names)==1 and len(N_names)==1:
  590. package_names.append([names[0][-1],N_names[0][-1]])
  591. for i in range(len(list_sentence)):
  592. PackageList_item = []
  593. PackageList_item_scope = []
  594. content = list_sentence[i].sentence_text
  595. tokens = list_sentence[i].tokens
  596. _names = []
  597. # 2021/6/23 包名称去重
  598. for name in package_names:
  599. if name not in _names:
  600. _names.append(name)
  601. # for name in package_names[:20]:
  602. for name in _names[:20]:
  603. for index in findAllIndex(name[0],content):
  604. temp_package_number = re.findall(number_pattern,name[1])[0]
  605. if re.search(re_digital,temp_package_number):
  606. temp_package_number = str(int(temp_package_number))
  607. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
  608. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
  609. code = extractPackageCode(tokens, index)
  610. if code is not None:
  611. dict_packageCode[temp_package_number] = code
  612. PackageSet.add(temp_package_number)
  613. for iter in re.finditer(package_number_pattern,content):
  614. temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
  615. if re.search(re_digital, temp_package_number):
  616. temp_package_number = str(int(temp_package_number))
  617. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  618. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  619. code = extractPackageCode(tokens, iter.span()[0])
  620. if code is not None:
  621. dict_packageCode[temp_package_number] = code
  622. PackageSet.add(temp_package_number)
  623. #识别packageScope
  624. for iter in re.finditer(pattern_packageScope,content):
  625. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  626. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  627. PackageList_item_scope = PackageList_item +PackageList_item_scope
  628. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  629. PackageList_scope = PackageList_scope+PackageList_item_scope
  630. PackageList_item.sort(key=lambda x:x["sentence_index"])
  631. #PackageList = PackageList+PackageList_item
  632. #不作为包
  633. # if len(PackageSet)==0:
  634. # for i in range(len(list_sentence)):
  635. # PackageList_item = []
  636. # PackageList_item_scope = []
  637. # content = list_sentence[i].sentence_text
  638. # tokens = list_sentence[i].tokens
  639. # for iter in re.finditer(other_package_pattern,content):
  640. # temp_package_number = iter.group(2)
  641. # PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  642. # # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  643. # code = extractPackageCode(tokens, iter.span()[0])
  644. # if code is not None:
  645. # dict_packageCode[temp_package_number] = code
  646. # PackageSet.add(temp_package_number)
  647. # #识别packageScope
  648. # for iter in re.finditer(pattern_packageScope,content):
  649. # PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  650. # # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  651. # PackageList_item_scope = PackageList_item +PackageList_item_scope
  652. # PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  653. # PackageList_scope = PackageList_scope+PackageList_item_scope
  654. # PackageList_item.sort(key=lambda x:x["sentence_index"])
  655. # 2020/11/23 大网站规则 调整
  656. if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
  657. for i in range(len(list_sentence)):
  658. PackageList_item = []
  659. PackageList_item_scope = []
  660. content = list_sentence[i].sentence_text
  661. tokens = list_sentence[i].tokens
  662. names = re.findall(other_package_pattern, content)
  663. N_names = re.findall(win_tenderer_pattern, content)
  664. if len(names) != 1 or len(N_names) != 1:
  665. continue
  666. for iter in re.finditer(other_package_pattern,content):
  667. temp_package_number = iter.group(4)
  668. xinghao = re.search(model_pattern, content)
  669. if xinghao:
  670. temp_package_number = temp_package_number + '+' + xinghao.group(2)
  671. # print('新正则采购包名补充',temp_package_number)
  672. if re.search(re_digital,temp_package_number):
  673. temp_package_number = str(int(temp_package_number))
  674. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  675. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  676. code = extractPackageCode(tokens, iter.span()[0])
  677. if code is not None:
  678. dict_packageCode[temp_package_number] = code
  679. PackageSet.add(temp_package_number)
  680. #识别packageScope
  681. for iter in re.finditer(pattern_packageScope,content):
  682. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  683. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  684. PackageList_item_scope = PackageList_item +PackageList_item_scope
  685. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  686. PackageList_scope = PackageList_scope+PackageList_item_scope
  687. PackageList_item.sort(key=lambda x:x["sentence_index"])
  688. pattern_punctuation = "[::()\(\),,。;;]"
  689. # print("===packageList_scope",PackageList_scope)
  690. for i in range(len(list_sentence)):
  691. for j in range(len(PackageList_scope)):
  692. if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
  693. _flag = False
  694. left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
  695. right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
  696. _left_find = re.findall(pattern_punctuation,left_str)
  697. _right_find = re.findall(pattern_punctuation,right_str)
  698. #print(left_str)
  699. if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
  700. continue
  701. if re.search("划分",right_str[:10]) is not None:
  702. continue
  703. if len(_left_find)>0 and _left_find[-1] in [":",":"]:
  704. _flag = True
  705. if len(_right_find)>0 and _right_find[0] in [":",":"]:
  706. _flag = True
  707. if _flag:
  708. scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
  709. else:
  710. if j==0:
  711. scope_begin = [0,0]
  712. else:
  713. scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
  714. if j==len(PackageList_scope)-1:
  715. scope_end = [list_sentence[-1].sentence_index,changeIndexFromWordToWords(list_sentence[-1].tokens, len(list_sentence[-1].sentence_text))]
  716. else:
  717. scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
  718. if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
  719. continue
  720. #add package to entity
  721. _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"],in_attachment=list_sentence[i].in_attachment)
  722. list_entity.append(_pack_entity)
  723. copy_pack = copy.copy(PackageList_scope[j])
  724. copy_pack["scope"] = [scope_begin,scope_end]
  725. copy_pack["hit"] = set()
  726. copy_pack["pointer"] = _pack_entity
  727. PackageList.append(copy_pack)
  728. return PackageList,PackageSet,dict_packageCode
  729. # km配对方法
  730. def dispatch(match_list):
  731. main_roles = list(set([match.main_role for match in match_list]))
  732. attributes = list(set([match.attribute for match in match_list]))
  733. label = np.zeros(shape=(len(main_roles), len(attributes)))
  734. for match in match_list:
  735. main_role = match.main_role
  736. attribute = match.attribute
  737. value = match.value
  738. label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000
  739. # print(label)
  740. gragh = -label
  741. # km算法
  742. row, col = linear_sum_assignment(gragh)
  743. max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]
  744. # return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
  745. return [(main_roles[row], attributes[col]) for row, col in max_dispatch]
  746. from BiddingKG.dl.common.Utils import getUnifyMoney
  747. from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
# Module-level Model_relation_extraction instance, created once when this module is imported.
relationExtraction_model = Model_relation_extraction()
  749. def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,on_value = 0.5,on_value_person=0.5,sentence_len=4):
  750. '''
  751. @param:
  752. PackDict:文章包dict
  753. roleSet:文章所有角色的公司名称
  754. PackageList:文章的包信息
  755. PackageSet:文章所有包的名称
  756. list_entity:文章所有经过模型处理的实体
  757. on_value:金额模型的阈值
  758. on_value_person:联系人模型的阈值
  759. sentence_len:公司和属性间隔句子的最大长度
  760. @return:添加了属性信息的角色list
  761. '''
  762. #根据roleid添加金额到rolelist中
  763. def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
  764. for i in range(len(packDict[packageName]["roleList"])):
  765. if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
  766. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  767. packDict[packageName]["roleList"][i].money = money
  768. packDict[packageName]["roleList"][i].money_prob = money_prob
  769. return packDict
  770. #根据实体名称添加金额到rolelist中
  771. def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
  772. for i in range(len(packDict[packageName]["roleList"])):
  773. if packDict[packageName]["roleList"][i].entity_text==entity:
  774. # if money_prob>packDict[packageName]["roleList"][i].money_prob:
  775. # packDict[packageName]["roleList"][i].money = money
  776. # packDict[packageName]["roleList"][i].money_prob = money_prob
  777. if packDict[packageName]["roleList"][i].money_prob==0 : # 2021/7/20第一次更新金额
  778. packDict[packageName]["roleList"][i].money = money.entity_text
  779. packDict[packageName]["roleList"][i].money_prob = money_prob
  780. packDict[packageName]["roleList"][i].money_unit = money.money_unit
  781. elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20改为优先选择大写金额,
  782. # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
  783. # print('链接金额备注 ',money.notes, money.entity_text, money.values)
  784. packDict[packageName]["roleList"][i].money = money.entity_text
  785. packDict[packageName]["roleList"][i].money_prob = money_prob
  786. packDict[packageName]["roleList"][i].money_unit = money.money_unit
  787. # print('链接中的金额:{0}, 单位:{1}'.format(money.entity_text, money.money_unit))
  788. return packDict
  789. def addRatioByEntity(packDict,packageName,entity,ratio):
  790. for i in range(len(packDict[packageName]["roleList"])):
  791. if packDict[packageName]["roleList"][i].entity_text==entity:
  792. packDict[packageName]["roleList"][i].ratio = ratio.entity_text
  793. def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
  794. for i in range(len(packDict[packageName]["roleList"])):
  795. if packDict[packageName]["roleList"][i].entity_text==entity:
  796. packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text
  797. #根据实体名称得到角色
  798. def getRoleWithText(packDict,entity_text):
  799. for pack in packDict.keys():
  800. for i in range(len(packDict[pack]["roleList"])):
  801. if packDict[pack]["roleList"][i].entity_text==entity_text:
  802. return packDict[pack]["roleList"][i].role_name
  803. def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
  804. _list_entitys = [entity]+entity.linked_entitys
  805. for _entity in _list_entitys:
  806. if _entity.entity_text in RoleSet:
  807. return True
  808. p_entity = 0
  809. # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
  810. money_list = [it for it in list_entity if it.entity_type=="money"]
  811. for i in range(len(money_list)-1):
  812. for j in range(1, len(money_list)):
  813. if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
  814. Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
  815. money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
  816. # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
  817. #遍历所有实体
  818. # while(p_entity<len(list_entity)):
  819. # entity = list_entity[p_entity]
  820. '''
  821. #招标金额从后往前找
  822. if entity.entity_type=="money":
  823. if entity.values[entity.label]>=on_value:
  824. if str(entity.label)=="0":
  825. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  826. if packagePointer is None:
  827. packageName = "Project"
  828. else:
  829. packageName = packagePointer.entity_text
  830. addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
  831. '''
  832. ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
  833. if entity.entity_type=="person":
  834. if entity.values[entity.label]>=on_value_person:
  835. if str(entity.label)=="1":
  836. for i in range(len(PackDict["Project"]["roleList"])):
  837. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  838. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  839. # add pointer_person
  840. for _entity in list_entity:
  841. if dict_role_id.get(str(_entity.label))=="tenderee":
  842. for i in range(len(PackDict["Project"]["roleList"])):
  843. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  844. _entity.pointer_person = entity
  845. elif str(entity.label)=="2":
  846. for i in range(len(PackDict["Project"]["roleList"])):
  847. if PackDict["Project"]["roleList"][i].role_name=="agency":
  848. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  849. # add pointer_person
  850. for _entity in list_entity:
  851. if dict_role_id.get(str(_entity.label))=="agency":
  852. for i in range(len(PackDict["Project"]["roleList"])):
  853. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  854. _entity.pointer_person = entity
  855. '''
  856. # #金额往前找实体
  857. # if entity.entity_type=="money":
  858. # if entity.values[entity.label]>=on_value:
  859. # p_entity_money= p_entity
  860. # entity_money = list_entity[p_entity_money]
  861. # if len(PackageSet)>0:
  862. # packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
  863. # if packagePointer is None:
  864. # packageName_entity = "Project"
  865. # else:
  866. # packageName_entity = packagePointer.entity_text
  867. # else:
  868. # packageName_entity = "Project"
  869. # while(p_entity_money>0):
  870. # entity_before = list_entity[p_entity_money]
  871. # if entity_before.entity_type in ['org','company']:
  872. # if str(entity_before.label)=="1":
  873. # addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
  874. # #add pointer_money
  875. # entity_before.pointer_money = entity_money
  876. # break
  877. # p_entity_money -= 1
  878. #如果实体属于角色集合,则往后找属性
  879. # if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
  880. #
  881. # p_entity += 1
  882. # #循环查找符合的属性
  883. # while(p_entity<len(list_entity)):
  884. #
  885. # entity_after = list_entity[p_entity]
  886. # if entity_after.sentence_index-entity.sentence_index>=sentence_len:
  887. # p_entity -= 1
  888. # break
  889. # #若是遇到公司实体,则跳出循环
  890. # if entity_after.entity_type in ['org','company']:
  891. # p_entity -= 1
  892. # break
  893. # if entity_after.values is not None:
  894. # if entity_after.entity_type=="money":
  895. # if entity_after.values[entity_after.label]>=on_value:
  896. # '''
  897. # #招标金额从后往前找
  898. # if str(entity_after.label)=="0":
  899. # packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  900. # if packagePointer is None:
  901. # packageName = "Project"
  902. # else:
  903. # packageName = packagePointer.entity_text
  904. # addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
  905. # '''
  906. # if str(entity_after.label)=="1":
  907. # #print(entity_after.entity_text,entity.entity_text)
  908. # _list_entitys = [entity]+entity.linked_entitys
  909. # if len(PackageSet)>0:
  910. # packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
  911. # if packagePointer is None:
  912. # packageName_entity = "Project"
  913. # else:
  914. # packageName_entity = packagePointer.entity_text
  915. # else:
  916. # packageName_entity = "Project"
  917. # if str(entity.label) in ["2","3","4"]:
  918. # # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
  919. # if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
  920. # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
  921. # 0.5)
  922. # entity.pointer_money = entity_after
  923. # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
  924. # else:
  925. # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
  926. # entity_after.values[entity_after.label])
  927. # entity.pointer_money = entity_after
  928. # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
  929. # if entity_after.values[entity_after.label]>0.6:
  930. # break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
  931. # #add pointer_money
  932. # # entity.pointer_money = entity_after
  933. # # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
  934. # # if entity_after.notes!='单价':
  935. # # break # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
  936. # '''
  937. # if entity_after.entity_type=="person":
  938. # if entity_after.values[entity_after.label]>=on_value_person:
  939. # if str(entity_after.label)=="1":
  940. # for i in range(len(roleList)):
  941. # if roleList[i].role_name=="tenderee":
  942. # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  943. # elif str(entity_after.label)=="2":
  944. # for i in range(len(roleList)):
  945. # if roleList[i].role_name=="agency":
  946. # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  947. # elif str(entity_after.label)=="3":
  948. # _list_entitys = [entity]+entity.linked_entitys
  949. # for _entity in _list_entitys:
  950. # for i in range(len(roleList)):
  951. # if roleList[i].entity_text==_entity.entity_text:
  952. # if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
  953. # break
  954. # roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  955. # '''
  956. #
  957. # p_entity += 1
  958. #
  959. # p_entity += 1
  960. # 记录每句的分词数量
  961. tokens_num_dict = dict()
  962. last_tokens_num = 0
  963. for sentence in list_sentence:
  964. _index = sentence.sentence_index
  965. if _index == 0:
  966. tokens_num_dict[_index] = 0
  967. else:
  968. tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
  969. last_tokens_num = len(sentence.tokens)
  970. attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额”
  971. for link_attribute in attribute_type:
  972. temp_entity_list = []
  973. if link_attribute=="money":
  974. temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
  975. (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)]
  976. # 删除重复的‘中投标金额’,一般为大小写两种样式
  977. drop_tendererMoney = []
  978. for ent_idx in range(len(temp_entity_list)-1):
  979. entity = temp_entity_list[ent_idx]
  980. if entity.entity_type=='money':
  981. next_entity = temp_entity_list[ent_idx+1]
  982. if next_entity.entity_type=='money':
  983. if getUnifyMoney(entity.entity_text)==getUnifyMoney(next_entity.entity_text):
  984. if (tokens_num_dict[next_entity.sentence_index] + next_entity.begin_index) - (
  985. tokens_num_dict[entity.sentence_index] + entity.end_index) < 10:
  986. drop_tendererMoney.append(next_entity)
  987. for _drop in drop_tendererMoney:
  988. temp_entity_list.remove(_drop)
  989. elif link_attribute=="serviceTime":
  990. temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
  991. ent.entity_type=='serviceTime']
  992. elif link_attribute=="ratio":
  993. temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
  994. ent.entity_type=='ratio']
  995. temp_entity_list = sorted(temp_entity_list,key=lambda x: (x.sentence_index, x.begin_index))
  996. temp_match_list = []
  997. for ent_idx in range(len(temp_entity_list)):
  998. entity = temp_entity_list[ent_idx]
  999. if entity.entity_type in ['org','company']:
  1000. match_nums = 0
  1001. tenderer_nums = 0 #经过其他中投标人的数量
  1002. byNotTenderer_match_nums = 0 #跟在中投标人后面的属性
  1003. for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)):
  1004. after_entity = temp_entity_list[after_index]
  1005. if after_entity.entity_type == link_attribute:
  1006. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  1007. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1008. sentence_distance = after_entity.sentence_index - entity.sentence_index
  1009. value = (-1 / 2 * (distance ** 2)) / 10000
  1010. if link_attribute == "money":
  1011. if after_entity.notes == '单价':
  1012. value = value * 100
  1013. if sentence_distance == 0:
  1014. if distance < 100:
  1015. # value = (-1 / 2 * (distance ** 2)) / 10000
  1016. temp_match_list.append(Match(entity, after_entity, value))
  1017. match_nums += 1
  1018. if not tenderer_nums:
  1019. byNotTenderer_match_nums += 1
  1020. else:
  1021. break
  1022. else:
  1023. if distance < 60:
  1024. # value = (-1 / 2 * (distance ** 2)) / 10000
  1025. temp_match_list.append(Match(entity, after_entity, value))
  1026. match_nums += 1
  1027. if not tenderer_nums:
  1028. byNotTenderer_match_nums += 1
  1029. else:
  1030. break
  1031. else:
  1032. tenderer_nums += 1
  1033. #前向查找属性
  1034. if ent_idx!=0 and (not match_nums or not byNotTenderer_match_nums):
  1035. previous_entity = temp_entity_list[ent_idx - 1]
  1036. if previous_entity.entity_type == link_attribute:
  1037. # if previous_entity.sentence_index == entity.sentence_index:
  1038. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  1039. tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
  1040. if distance < 40:
  1041. # 前向 没有 /10000
  1042. value = (-1 / 2 * (distance ** 2))
  1043. temp_match_list.append(Match(entity, previous_entity, value))
  1044. # km算法分配求解
  1045. dispatch_result = dispatch(temp_match_list)
  1046. dispatch_result = sorted(dispatch_result, key=lambda x: (x[0].sentence_index,x[0].begin_index))
  1047. for match in dispatch_result:
  1048. _entity = match[0]
  1049. _attribute = match[1]
  1050. if link_attribute=='money':
  1051. _entity.pointer_money = _attribute
  1052. packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
  1053. "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
  1054. # print(_entity.entity_text,_attribute.entity_text)
  1055. if packagePointer is None:
  1056. packageName_entity = "Project"
  1057. else:
  1058. packageName_entity = packagePointer.entity_text
  1059. if _attribute.notes == '单价' or float(_attribute.entity_text) < 5000: # 2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
  1060. # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label])
  1061. addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,0.5)
  1062. else:
  1063. # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label])
  1064. addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,
  1065. _attribute.values[_attribute.label])
  1066. elif link_attribute=='serviceTime':
  1067. _entity.pointer_serviceTime = _attribute
  1068. packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
  1069. "serviceTime-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
  1070. if packagePointer is None:
  1071. packageName_entity = "Project"
  1072. else:
  1073. packageName_entity = packagePointer.entity_text
  1074. addServiceTimeByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
  1075. elif link_attribute=='ratio':
  1076. _entity.pointer_ratio = _attribute
  1077. packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
  1078. "ratio-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
  1079. if packagePointer is None:
  1080. packageName_entity = "Project"
  1081. else:
  1082. packageName_entity = packagePointer.entity_text
  1083. addRatioByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
  1084. ''''''
  1085. # 通过模型分类的招标/代理联系人
  1086. list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
  1087. person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
  1088. tenderee_contact = set()
  1089. tenderee_phone = set()
  1090. agency_contact = set()
  1091. agency_phone = set()
  1092. winter_contact = set()
  1093. for _person in person_list:
  1094. if _person.label == 1:
  1095. tenderee_contact.add(_person.entity_text)
  1096. if _person.label == 2:
  1097. agency_contact.add(_person.entity_text)
  1098. # 正则匹配无 '主体/联系人' 的电话
  1099. # 例:"采购人联系方式:0833-5226788,"
  1100. phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
  1101. '\+86.?1[3-9]\d{9}|' \
  1102. '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' \
  1103. '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' \
  1104. '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' \
  1105. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|' \
  1106. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' \
  1107. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \
  1108. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' \
  1109. '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' \
  1110. '[2-9]\d{6,7})'
  1111. re_tenderee_phone = re.compile(
  1112. "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
  1113. # 电话号码
  1114. + phone_pattern)
  1115. # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
  1116. re_tenderee_phone2 = re.compile(
  1117. "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
  1118. # 电话号码
  1119. + phone_pattern)
  1120. re_agent_phone = re.compile(
  1121. "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
  1122. # 电话号码
  1123. + phone_pattern)
  1124. re_agent_phone2 = re.compile(
  1125. "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
  1126. # 电话号码
  1127. + phone_pattern)
  1128. content = ""
  1129. for _sentence in list_sentence:
  1130. content += "".join(_sentence.tokens)
  1131. _content = copy.deepcopy(content)
  1132. while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content):
  1133. content_words = list(content)
  1134. for i in re.finditer("(.)(,)([^0-9])", content):
  1135. content_words[i.span(2)[0]] = ""
  1136. for i in re.finditer("([^0-9])(,)(.)", content):
  1137. content_words[i.span(2)[0]] = ""
  1138. content = "".join(content_words)
  1139. content = re.sub("[::]|[\((]|[\))]", "", content)
  1140. _tenderee_phone = re.findall(re_tenderee_phone, content)
  1141. # 更新正则确定的角色属性
  1142. for i in range(len(PackDict["Project"]["roleList"])):
  1143. if PackDict["Project"]["roleList"][i].role_name == "tenderee":
  1144. _tenderee_phone = re.findall(re_tenderee_phone, content)
  1145. if _tenderee_phone:
  1146. for _phone in _tenderee_phone:
  1147. _phone = _phone.split("/") # 分割多个号码
  1148. for one_phone in _phone:
  1149. PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
  1150. tenderee_phone.add(one_phone)
  1151. _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
  1152. if _tenderee_phone2:
  1153. for _phone in _tenderee_phone2:
  1154. _phone = _phone.split("/")
  1155. for one_phone in _phone:
  1156. PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
  1157. tenderee_phone.add(one_phone)
  1158. if PackDict["Project"]["roleList"][i].role_name == "agency":
  1159. _agent_phone = re.findall(re_agent_phone, content)
  1160. if _agent_phone:
  1161. for _phone in _agent_phone:
  1162. _phone = _phone.split("/")
  1163. for one_phone in _phone:
  1164. PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
  1165. agency_phone.add(one_phone)
  1166. _agent_phone2 = re.findall(re_agent_phone2, content)
  1167. if _agent_phone2:
  1168. for _phone in _agent_phone2:
  1169. _phone = _phone.split("/")
  1170. for one_phone in _phone:
  1171. PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
  1172. agency_phone.add(one_phone)
  1173. # 正则提取电话号码实体
  1174. # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
  1175. phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
  1176. '\+86.?1[3-9]\d{9}|'
  1177. # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
  1178. '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
  1179. '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
  1180. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
  1181. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
  1182. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
  1183. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
  1184. '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
  1185. '[2-9]\d{6,7}')
  1186. url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
  1187. email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
  1188. "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
  1189. phone_entitys = []
  1190. code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
  1191. for _sentence in list_sentence:
  1192. sentence_text = _sentence.sentence_text
  1193. in_attachment = _sentence.in_attachment
  1194. list_tokenbegin = []
  1195. begin = 0
  1196. for i in range(0, len(_sentence.tokens)):
  1197. list_tokenbegin.append(begin)
  1198. begin += len(str(_sentence.tokens[i]))
  1199. list_tokenbegin.append(begin + 1)
  1200. # 排除网址、邮箱、项目编号实体
  1201. error_list = []
  1202. for i in re.finditer(url_pattern, sentence_text):
  1203. error_list.append((i.start(), i.end()))
  1204. for i in re.finditer(email_pattern, sentence_text):
  1205. error_list.append((i.start(), i.end()))
  1206. for code_ent in [ent for ent in code_entitys if ent.sentence_index==_sentence.sentence_index]:
  1207. error_list.append((code_ent.wordOffset_begin,code_ent.wordOffset_end))
  1208. res_set = set()
  1209. for i in re.finditer(phone, sentence_text):
  1210. is_continue = False
  1211. for error_ent in error_list:
  1212. if i.start()>=error_ent[0] and i.end()<=error_ent[1]:
  1213. is_continue = True
  1214. break
  1215. if is_continue:
  1216. continue
  1217. res_set.add((i.group(), i.start(), i.end()))
  1218. res_set = sorted(list(res_set),key=lambda x:x[1])
  1219. last_phone_mask = True
  1220. for item_idx in range(len(res_set)):
  1221. item = res_set[item_idx]
  1222. phone_left = sentence_text[max(0, item[1] - 10):item[1]]
  1223. phone_right = sentence_text[item[2]:item[2] + 8]
  1224. if re.search("电话|手机|联系人|联系方式”",re.sub(",","",phone_left)):
  1225. pass
  1226. else:
  1227. # 排除“传真号”和其它错误项
  1228. if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
  1229. if not re.search("电,?话", phone_left):
  1230. last_phone_mask = False
  1231. continue
  1232. if re.search("注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
  1233. last_phone_mask = False
  1234. continue
  1235. if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
  1236. last_phone_mask = False
  1237. continue
  1238. # 前后跟着字母
  1239. if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
  1240. last_phone_mask = False
  1241. continue
  1242. # 前后跟着长度小于一定值数字的正则排除
  1243. if re.search("\d+[-—-―]?\d*$",phone_left) or re.search("^\d+[-—-―]?\d*",phone_right):
  1244. phone_left_number = re.search("\d+[-—-―]?\d*$",phone_left)
  1245. phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
  1246. if phone_left_number:
  1247. if len(phone_left_number.group())<7:
  1248. last_phone_mask = False
  1249. continue
  1250. if phone_right_number:
  1251. if len(phone_right_number.group())<7:
  1252. last_phone_mask = False
  1253. continue
  1254. # if:上一个phone实体不符合条件
  1255. if not last_phone_mask:
  1256. item_start = item[1]
  1257. last_item_end = res_set[item_idx-1][2]
  1258. if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]):
  1259. last_phone_mask = False
  1260. continue
  1261. for j in range(len(list_tokenbegin)):
  1262. if list_tokenbegin[j] == item[1]:
  1263. begin_index = j
  1264. break
  1265. elif list_tokenbegin[j] > item[1]:
  1266. begin_index = j - 1
  1267. break
  1268. for j in range(begin_index, len(list_tokenbegin)):
  1269. if list_tokenbegin[j] >= item[2]:
  1270. end_index = j - 1
  1271. break
  1272. _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
  1273. item[2],in_attachment=in_attachment)
  1274. phone_entitys.append(_entity)
  1275. last_phone_mask = True
  def is_company(entity, text):
      """Tell whether a company-like entity should be kept as an organisation.

      The entity is rejected (``False``) when it is flagged as a tail entity
      or when the text immediately to its left looks like an address,
      location or bank field label; otherwise it is accepted (``True``).

      Args:
          entity: object exposing ``label``, ``values`` (indexable by
              ``label``), ``is_tail`` and ``wordOffset_begin``.
          text: text of the sentence that contains ``entity``;
              ``wordOffset_begin`` is used as a character offset into it.

      Returns:
          bool: ``True`` to keep the entity, ``False`` to discard it.
      """
      # A label other than 5 with a score above 0.5 is accepted outright.
      # NOTE(review): label 5 presumably means "no role" - confirm.
      if entity.label != 5 and entity.values[entity.label] > 0.5:
          return True
      # Use the argument itself; the previous code read the enclosing loop
      # variable `ent`, which only worked because the caller passed that
      # same object.
      if entity.is_tail == True:
          return False
      # Inspect at most 10 characters to the left of the entity.
      entity_left = text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
      # NOTE(review): this pattern matches the literal sequence ",()" (comma,
      # empty group, escaped parentheses), not "any of these characters".
      # A character class was probably intended - confirm before changing,
      # since it would alter extraction results.
      entity_left = re.sub(r",()\(\)", "", entity_left)
      entity_left = entity_left[-5:]
      # An address/location keyword, or "bank" followed by a colon, right
      # before the entity means it is part of such a field.
      return not re.search(r"地址|地点|银行[::]", entity_left)
  1289. pre_entity = []
  1290. for ent in list_entity:
  1291. if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \
  1292. or (ent.entity_type=='location' and len(ent.entity_text)>5):
  1293. pre_entity.append(ent)
  1294. text_data,pre_data = relationExtraction_model.encode(pre_entity + phone_entitys, list_sentence)
  1295. # print(pre_data)
  1296. maxlen = 512
  1297. relation_list = []
  1298. if 0<len(text_data)<=maxlen:
  1299. relation_list = relationExtraction_model.predict(text_data, pre_data)
  1300. else:
  1301. # 公告大于maxlen时,分段预测
  1302. start = 0
  1303. # print("len(pre_data)",len(pre_data))
  1304. temp_data = []
  1305. deal_data = 0
  1306. while start<len(pre_data):
  1307. _pre_data = pre_data[start:start+maxlen]
  1308. _text_data = text_data[start:start+maxlen]
  1309. if relationExtraction_model.check_data(_pre_data):
  1310. temp_data.append((_text_data,_pre_data))
  1311. else:
  1312. if temp_data:
  1313. deal_data += len(temp_data)
  1314. if deal_data>3:
  1315. break
  1316. for _text_data, _pre_data in temp_data:
  1317. relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
  1318. temp_data = []
  1319. start = start + maxlen - 120
  1320. # print("预测数据:",len(temp_data))
  1321. # if len(temp_data)<=6:
  1322. # for _text_data,_pre_data in temp_data:
  1323. # relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
  1324. # else:
  1325. # relation_list = []
  1326. # 去重结果
  1327. relation_list = list(set(relation_list))
  1328. # print(relation_list)
  1329. # tokens_num_dict = dict()
  1330. # last_tokens_num = 0
  1331. # for sentence in list_sentence:
  1332. # _index = sentence.sentence_index
  1333. # if _index == 0:
  1334. # tokens_num_dict[_index] = 0
  1335. # else:
  1336. # tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
  1337. # last_tokens_num = len(sentence.tokens)
  1338. right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
  1339. linked_company = set()
  1340. linked_person = set()
  1341. linked_connetPerson = set()
  1342. linked_phone = set()
  1343. for predicate in ["rel_address","rel_phone","rel_person"]:
  1344. _match_list = []
  1345. _match_combo = []
  1346. for relation in relation_list:
  1347. _subject = relation[0]
  1348. _object = relation[2]
  1349. if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
  1350. if relation[1]==predicate:
  1351. if predicate=="rel_person":
  1352. if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
  1353. continue
  1354. distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
  1355. tokens_num_dict[_subject.sentence_index] + _subject.end_index)
  1356. if distance>0:
  1357. value = (-1 / 2 * (distance ** 2))/10000
  1358. else:
  1359. distance = abs(distance)
  1360. value = (-1 / 2 * (distance ** 2))
  1361. _match_list.append(Match(_subject,_object,value))
  1362. _match_combo.append((_subject,_object))
  1363. match_result = dispatch(_match_list)
  1364. error_list = []
  1365. for mat in list(set(_match_combo)-set(match_result)):
  1366. for temp in match_result:
  1367. if mat[1]==temp[1] and mat[0]!=temp[0]:
  1368. error_list.append(mat)
  1369. break
  1370. result = list(set(_match_combo)-set(error_list))
  1371. if predicate=='rel_person':
  1372. # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接)
  1373. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1374. for combo in result:
  1375. is_continue = False
  1376. if not combo[0].pointer_person:
  1377. combo[0].pointer_person = []
  1378. if combo[1].begin_index<combo[0].begin_index:
  1379. if combo[0].pointer_person:
  1380. for temp in combo[0].pointer_person:
  1381. if temp.begin_index>combo[0].begin_index:
  1382. is_continue = True
  1383. break
  1384. if is_continue: continue
  1385. combo[0].pointer_person.append(combo[1])
  1386. linked_company.add(combo[0])
  1387. linked_person.add(combo[1])
  1388. # print(1,combo[0].entity_text,combo[1].entity_text)
  1389. if predicate=='rel_address':
  1390. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1391. for combo in result:
  1392. if combo[0].pointer_address:
  1393. continue
  1394. combo[0].pointer_address = combo[1]
  1395. # print(2,combo[0].entity_text,combo[1].entity_text)
  1396. if predicate=='rel_phone':
  1397. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1398. for combo in result:
  1399. is_continue = False
  1400. if not combo[0].person_phone:
  1401. combo[0].person_phone = []
  1402. if combo[1].begin_index<combo[0].begin_index:
  1403. if combo[0].person_phone:
  1404. for temp in combo[0].person_phone:
  1405. if temp.begin_index>combo[0].begin_index:
  1406. is_continue = True
  1407. break
  1408. if is_continue: continue
  1409. combo[0].person_phone.append(combo[1])
  1410. linked_connetPerson.add(combo[0])
  1411. linked_phone.add(combo[1])
  1412. if combo[0].label in [1,2]:
  1413. if PackDict.get("Project"):
  1414. for i in range(len(PackDict["Project"]["roleList"])):
  1415. if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \
  1416. or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'):
  1417. PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
  1418. break
  1419. # print(3,combo[0].entity_text,combo[1].entity_text)
  1420. # "联系人——联系电话" 链接规则补充
  1421. person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
  1422. person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
  1423. t_match_list = []
  1424. for ent_idx in range(len(person_phone_EntityList)):
  1425. entity = person_phone_EntityList[ent_idx]
  1426. if entity.entity_type=="person":
  1427. match_nums = 0
  1428. person_nums = 0 # 经过其他中联系人的数量
  1429. byNotPerson_match_nums = 0 # 跟在联系人后面的属性
  1430. phone_nums = 0 # 经过电话的数量
  1431. for after_index in range(ent_idx + 1, min(len(person_phone_EntityList), ent_idx + 8)):
  1432. after_entity = person_phone_EntityList[after_index]
  1433. if after_entity.entity_type == "phone":
  1434. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  1435. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1436. phone_nums += 1
  1437. if distance>100 or phone_nums>=4:
  1438. break
  1439. sentence_distance = after_entity.sentence_index - entity.sentence_index
  1440. value = (-1 / 2 * (distance ** 2)) / 10000
  1441. if sentence_distance == 0:
  1442. if distance < 80:
  1443. # value = (-1 / 2 * (distance ** 2)) / 10000
  1444. t_match_list.append(Match(entity, after_entity, value))
  1445. match_nums += 1
  1446. if not person_nums:
  1447. byNotPerson_match_nums += 1
  1448. else:
  1449. break
  1450. else:
  1451. if distance < 50:
  1452. # value = (-1 / 2 * (distance ** 2)) / 10000
  1453. t_match_list.append(Match(entity, after_entity, value))
  1454. match_nums += 1
  1455. if not person_nums:
  1456. byNotPerson_match_nums += 1
  1457. else:
  1458. break
  1459. else:
  1460. person_nums += 1
  1461. # 前向查找属性
  1462. if ent_idx != 0 and (not match_nums or not byNotPerson_match_nums):
  1463. previous_entity = person_phone_EntityList[ent_idx - 1]
  1464. if previous_entity.entity_type == 'phone':
  1465. # if previous_entity.sentence_index == entity.sentence_index:
  1466. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  1467. tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
  1468. if distance < 40:
  1469. # 前向 没有 /10000
  1470. value = (-1 / 2 * (distance ** 2))
  1471. t_match_list.append(Match(entity, previous_entity, value))
  1472. # km算法分配求解(person-phone)
  1473. t_match_list = [mat for mat in t_match_list if mat.main_role not in linked_connetPerson and mat.attribute not in linked_phone]
  1474. personphone_result = dispatch(t_match_list)
  1475. personphone_result = sorted(personphone_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
  1476. for match in personphone_result:
  1477. _person = match[0]
  1478. _phone = match[1]
  1479. if not _person.person_phone:
  1480. _person.person_phone = []
  1481. _person.person_phone.append(_phone)
  1482. # 多个招标人/代理人或者别称
  1483. for idx in range(1,len(pre_entity)):
  1484. _pre_entity = pre_entity[idx]
  1485. if _pre_entity in linked_company and _pre_entity.label==5:
  1486. last_ent = pre_entity[idx-1]
  1487. if last_ent.entity_type in ['company','org'] and last_ent.label in [0,1]:
  1488. if last_ent.sentence_index==_pre_entity.sentence_index:
  1489. mid_text = list_sentence[_pre_entity.sentence_index].sentence_text[last_ent.wordOffset_end:_pre_entity.wordOffset_begin]
  1490. if len(mid_text)<=20 and "," not in mid_text and re.search("[、\((]",mid_text):
  1491. _pre_entity.label = last_ent.label
  1492. _pre_entity.values[last_ent.label] = 0.6
  1493. # 2022/01/25 固定电话可连多个联系人
  1494. temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person']
  1495. temp_person_entitys2 = [] #和固定电话相连的联系人
  1496. for entity in temp_person_entitys:
  1497. if entity.person_phone:
  1498. for _phone in entity.person_phone:
  1499. if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
  1500. temp_person_entitys2.append(entity)
  1501. break
  1502. for index in range(len(temp_person_entitys)):
  1503. entity = temp_person_entitys[index]
  1504. if entity in temp_person_entitys2:
  1505. last_person = entity
  1506. for after_index in range(index + 1, min(len(temp_person_entitys), index + 5)):
  1507. after_entity = temp_person_entitys[after_index]
  1508. if after_entity.sentence_index == last_person.sentence_index and after_entity.begin_index - last_person.end_index < 3:
  1509. for _phone in entity.person_phone:
  1510. if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
  1511. if _phone not in after_entity.person_phone:
  1512. after_entity.person_phone.append(_phone)
  1513. last_person = after_entity
  1514. else:
  1515. break
  1516. if index==0:
  1517. continue
  1518. last_person = entity
  1519. for before_index in range(index-1, max(-1,index-5), -1):
  1520. before_entity = temp_person_entitys[before_index]
  1521. if before_entity.sentence_index == last_person.sentence_index and last_person.begin_index - before_entity.end_index < 3:
  1522. for _phone in entity.person_phone:
  1523. if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
  1524. if _phone not in before_entity.person_phone:
  1525. before_entity.person_phone.append(_phone)
  1526. last_person = before_entity
  1527. else:
  1528. break
  1529. # 更新person为招标/代理联系人的联系方式
  1530. for k in PackDict.keys():
  1531. for i in range(len(PackDict[k]["roleList"])):
  1532. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1533. for _person in person_list:
  1534. if _person.label==1:#招标联系人
  1535. person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
  1536. for _p in person_phone:
  1537. PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
  1538. if not person_phone:
  1539. PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
  1540. if PackDict[k]["roleList"][i].role_name == "agency":
  1541. for _person in person_list:
  1542. if _person.label==2:#代理联系人
  1543. person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
  1544. for _p in person_phone:
  1545. PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
  1546. if not person_phone:
  1547. PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
  1548. # 更新 PackDict
  1549. not_sure_linked = []
  1550. for link_p in list(linked_company):
  1551. for k in PackDict.keys():
  1552. for i in range(len(PackDict[k]["roleList"])):
  1553. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1554. if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 0:
  1555. not_sure_linked.append(link_p)
  1556. continue
  1557. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
  1558. for per in link_p.pointer_person:
  1559. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1560. if not person_phone:
  1561. if per.entity_text not in agency_contact:
  1562. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1563. continue
  1564. for _p in person_phone:
  1565. if per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
  1566. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1567. elif PackDict[k]["roleList"][i].role_name == "agency":
  1568. if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 1:
  1569. not_sure_linked.append(link_p)
  1570. continue
  1571. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
  1572. for per in link_p.pointer_person:
  1573. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1574. if not person_phone:
  1575. if per.entity_text not in tenderee_contact:
  1576. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1577. continue
  1578. for _p in person_phone:
  1579. if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone:
  1580. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1581. else:
  1582. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
  1583. for per in link_p.pointer_person:
  1584. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1585. if not person_phone:
  1586. if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
  1587. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1588. winter_contact.add(per.entity_text)
  1589. continue
  1590. for _p in person_phone:
  1591. if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
  1592. per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
  1593. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1594. winter_contact.add(per.entity_text)
  1595. # 更新org/company实体label为0,1的链接
  1596. for link_p in not_sure_linked:
  1597. for k in PackDict.keys():
  1598. for i in range(len(PackDict[k]["roleList"])):
  1599. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1600. if link_p.label == 0:
  1601. for per in link_p.pointer_person:
  1602. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1603. if not person_phone:
  1604. if per.entity_text not in agency_contact and per.entity_text not in winter_contact:
  1605. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1606. continue
  1607. for _p in person_phone:
  1608. if per.entity_text not in agency_contact and _p.entity_text not in agency_phone and per.entity_text not in winter_contact:
  1609. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1610. elif PackDict[k]["roleList"][i].role_name == "agency":
  1611. if link_p.label == 1:
  1612. for per in link_p.pointer_person:
  1613. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1614. if not person_phone:
  1615. if per.entity_text not in tenderee_contact and per.entity_text not in winter_contact:
  1616. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1617. continue
  1618. for _p in person_phone:
  1619. if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
  1620. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1621. re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
  1622. split_list = [0] * 16
  1623. split_dict = {
  1624. "一、": 1,
  1625. "二、": 2,
  1626. "三、": 3,
  1627. "四、": 4,
  1628. "五、": 5,
  1629. "六、": 6,
  1630. "七、": 7,
  1631. "八、": 8,
  1632. "九、": 9,
  1633. "十、": 10,
  1634. "十一、": 11,
  1635. "十二、": 12,
  1636. "十三、": 13,
  1637. "十四、": 14,
  1638. "十五、": 15
  1639. }
  1640. for item in re.finditer(re_split, _content):
  1641. _index = split_dict.get(item.group()[1:])
  1642. if not split_list[_index]:
  1643. split_list[_index] = item.span()[0] + 1
  1644. split_list = [i for i in split_list if i != 0]
  1645. start = 0
  1646. new_split_list = []
  1647. for idx in split_list:
  1648. new_split_list.append((start, idx))
  1649. start = idx
  1650. new_split_list.append((start, len(_content)))
  1651. # 实体列表按照“公告分段”分组
  1652. words_num_dict = dict()
  1653. last_words_num = 0
  1654. for sentence in list_sentence:
  1655. _index = sentence.sentence_index
  1656. if _index == 0:
  1657. words_num_dict[_index] = 0
  1658. else:
  1659. words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
  1660. last_words_num = len(sentence.sentence_text)
  1661. # 公司-联系人连接(km算法)
  1662. re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
  1663. '\+86.?1[3-9]\d{9}|'
  1664. '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
  1665. '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
  1666. '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
  1667. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
  1668. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
  1669. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
  1670. '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
  1671. '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
  1672. '[2-9]\d{6,7}')
  1673. key_phone = re.compile("联系方式|电话|联系人|负责人")
  1674. temporary_list2 = []
  1675. for entity in list_entity:
  1676. # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False:
  1677. if entity.entity_type in ['org', 'company', 'person']:
  1678. temporary_list2.append(entity)
  1679. temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index))
  1680. new_temporary_list2 = []
  1681. for _split in new_split_list:
  1682. temp_list = []
  1683. for _entity in temporary_list2:
  1684. if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
  1685. _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
  1686. temp_list.append(_entity)
  1687. elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
  1688. break
  1689. new_temporary_list2.append(temp_list)
  1690. # print(new_temporary_list2)
  1691. match_list2 = []
  1692. for split_index in range(len(new_temporary_list2)):
  1693. split_entitys = new_temporary_list2[split_index]
  1694. is_skip = False
  1695. for index in range(len(split_entitys)):
  1696. entity = split_entitys[index]
  1697. if is_skip:
  1698. is_skip = False
  1699. continue
  1700. else:
  1701. if entity.entity_type in ['org', 'company']:
  1702. if entity.label != 5 or entity.entity_text in roleSet:
  1703. match_nums = 0
  1704. for after_index in range(index + 1, min(len(split_entitys), index + 4)):
  1705. after_entity = split_entitys[after_index]
  1706. if after_entity.entity_type in ['person']:
  1707. # 实体为中标人/候选人,联系人已确定类别【1,2】
  1708. if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
  1709. break
  1710. if after_entity.label in [1, 2, 3]:
  1711. distance = (tokens_num_dict[
  1712. after_entity.sentence_index] + after_entity.begin_index) - (
  1713. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1714. sentence_distance = after_entity.sentence_index - entity.sentence_index
  1715. if sentence_distance == 0:
  1716. if distance < 100:
  1717. if (entity.label == 0 and after_entity.label == 1) or (
  1718. entity.label == 1 and after_entity.label == 2):
  1719. distance = distance / 100
  1720. value = (-1 / 2 * (distance ** 2)) / 10000
  1721. match_list2.append(Match(entity, after_entity, value))
  1722. match_nums += 1
  1723. else:
  1724. if distance < 60:
  1725. if (entity.label == 0 and after_entity.label == 1) or (
  1726. entity.label == 1 and after_entity.label == 2):
  1727. distance = distance / 100
  1728. value = (-1 / 2 * (distance ** 2)) / 10000
  1729. match_list2.append(Match(entity, after_entity, value))
  1730. match_nums += 1
  1731. if after_entity.entity_type in ['org', 'company']:
  1732. # 解决在‘地址’中识别出org/company的问题
  1733. # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]:
  1734. if entity.label != 5 and after_index == index + 1 and (
  1735. after_entity.label == entity.label or after_entity.label == 5):
  1736. distance = (tokens_num_dict[
  1737. after_entity.sentence_index] + after_entity.begin_index) - (
  1738. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1739. if distance < 20:
  1740. after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0,
  1741. after_entity.begin_index - 10):after_entity.begin_index]
  1742. after_entity_right = list_sentence[after_entity.sentence_index].tokens[
  1743. after_entity.end_index + 1:after_entity.end_index + 6]
  1744. after_entity_left = "".join(after_entity_left)
  1745. if len(after_entity_left) > 20:
  1746. after_entity_left = after_entity_left[-20:]
  1747. after_entity_right = "".join(after_entity_right)[:10]
  1748. if re.search("地,?址", after_entity_left):
  1749. is_skip = True
  1750. continue
  1751. if re.search("\(|(", after_entity_left) and re.search("\)|)",
  1752. after_entity_right):
  1753. is_skip = True
  1754. continue
  1755. if entity.label in [0, 1] and after_entity.label in [0,
  1756. 1] and entity.label == after_entity.label:
  1757. break
  1758. if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
  1759. index + 1].entity_type == "person":
  1760. break
  1761. if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
  1762. break
  1763. if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
  1764. break
  1765. # 搜索没有联系人的电话
  1766. mid_tokens = []
  1767. is_same_sentence = False
  1768. if index == len(split_entitys) - 1:
  1769. for i in range(entity.sentence_index, len(list_sentence)):
  1770. mid_tokens += list_sentence[i].tokens
  1771. mid_tokens = mid_tokens[entity.end_index + 1:]
  1772. mid_sentence = "".join(mid_tokens)
  1773. have_phone = re.findall(re_phone, mid_sentence)
  1774. if have_phone:
  1775. if re.findall(re_phone, mid_sentence.split("。")[0]):
  1776. is_same_sentence = True
  1777. _phone = have_phone[0]
  1778. phone_begin = mid_sentence.find(_phone)
  1779. if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \
  1780. new_split_list[split_index][1]:
  1781. mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
  1782. if re.search(key_phone, mid_sentence):
  1783. distance = 1
  1784. if is_same_sentence:
  1785. if phone_begin <= 200:
  1786. value = (-1 / 2 * (distance ** 2)) / 10000
  1787. match_list2.append(Match(entity, (entity, _phone), value))
  1788. match_nums += 1
  1789. else:
  1790. if phone_begin <= 60:
  1791. value = (-1 / 2 * (distance ** 2)) / 10000
  1792. match_list2.append(Match(entity, (entity, _phone), value))
  1793. match_nums += 1
  1794. else:
  1795. next_entity = split_entitys[index + 1]
  1796. if next_entity.entity_type in ["org","company"]:
  1797. _entity_left = list_sentence[next_entity.sentence_index].sentence_text[max(0, next_entity.wordOffset_begin - 20):next_entity.wordOffset_begin]
  1798. _entity_left2 = re.sub(",()\(\)::", "", _entity_left)
  1799. _entity_left2 = _entity_left2[-5:]
  1800. if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2):
  1801. if index + 2<= len(split_entitys) - 1:
  1802. next_entity = split_entitys[index + 2]
  1803. if entity.sentence_index == next_entity.sentence_index:
  1804. mid_tokens += list_sentence[entity.sentence_index].tokens[
  1805. entity.end_index + 1:next_entity.begin_index]
  1806. else:
  1807. sentence_index = entity.sentence_index
  1808. while sentence_index <= next_entity.sentence_index:
  1809. mid_tokens += list_sentence[sentence_index].tokens
  1810. sentence_index += 1
  1811. mid_tokens = mid_tokens[entity.end_index + 1:-(len(
  1812. list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1]
  1813. mid_sentence = "".join(mid_tokens)
  1814. have_phone = re.findall(re_phone, mid_sentence)
  1815. if have_phone:
  1816. if re.findall(re_phone, mid_sentence.split("。")[0]):
  1817. is_same_sentence = True
  1818. _phone = have_phone[0]
  1819. phone_begin = mid_sentence.find(_phone)
  1820. mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
  1821. if re.search(key_phone, mid_sentence):
  1822. p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
  1823. if next_entity.entity_type == 'person' and _phone in p_phone:
  1824. pass
  1825. else:
  1826. distance = (tokens_num_dict[
  1827. next_entity.sentence_index] + next_entity.begin_index) - (
  1828. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1829. distance = distance / 2
  1830. if is_same_sentence:
  1831. if phone_begin <= 200:
  1832. value = (-1 / 2 * (distance ** 2)) / 10000
  1833. match_list2.append(Match(entity, (entity, _phone), value))
  1834. match_nums += 1
  1835. else:
  1836. if phone_begin <= 60:
  1837. value = (-1 / 2 * (distance ** 2)) / 10000
  1838. match_list2.append(Match(entity, (entity, _phone), value))
  1839. match_nums += 1
  1840. # 实体无匹配时,尝试前向查找匹配
  1841. if not match_nums:
  1842. if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0:
  1843. previous_entity = split_entitys[index - 1]
  1844. if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
  1845. if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
  1846. continue
  1847. if previous_entity.sentence_index == entity.sentence_index:
  1848. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  1849. tokens_num_dict[
  1850. previous_entity.sentence_index] + previous_entity.end_index)
  1851. if distance < 20:
  1852. # 距离相等时,前向添加处罚值
  1853. # distance += 1
  1854. # 前向 没有 /10000
  1855. value = (-1 / 2 * (distance ** 2))
  1856. match_list2.append(Match(entity, previous_entity, value))
  1857. # print(match_list2)
  1858. match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person]
  1859. # print(match_list2)
  1860. # km算法分配求解
  1861. result2 = dispatch(match_list2)
  1862. # print(result2)
  1863. for match in result2:
  1864. entity = match[0]
  1865. # print(entity.entity_text)
  1866. # print(match.attribute)
  1867. entity_index = list_entity.index(entity)
  1868. is_update = False
  1869. if isinstance(match[1], tuple):
  1870. person_ = ''
  1871. phone_ = match[1][1].split("/") # 分割多个号码
  1872. # print(person_,phone_)
  1873. else:
  1874. person_ = match[1].entity_text
  1875. phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
  1876. for k in PackDict.keys():
  1877. for i in range(len(PackDict[k]["roleList"])):
  1878. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1879. # if not PackDict[k]["roleList"][i].linklist:
  1880. if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
  1881. if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact:
  1882. if not phone_:
  1883. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  1884. for p in phone_:
  1885. # if not person_ and len()
  1886. PackDict[k]["roleList"][i].linklist.append((person_, p))
  1887. is_update = True
  1888. elif PackDict[k]["roleList"][i].role_name == "agency":
  1889. # if not PackDict[k]["roleList"][i].linklist:
  1890. if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact:
  1891. if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
  1892. if not phone_:
  1893. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  1894. for p in phone_:
  1895. PackDict[k]["roleList"][i].linklist.append((person_, p))
  1896. is_update = True
  1897. else:
  1898. if PackDict[k]["roleList"][i].entity_text == entity.entity_text:
  1899. if not PackDict[k]["roleList"][i].linklist:
  1900. if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \
  1901. person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0:
  1902. if not phone_:
  1903. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  1904. for p in phone_:
  1905. PackDict[k]["roleList"][i].linklist.append((person_, p))
  1906. is_update = True
  1907. if not person_:
  1908. is_update = False
  1909. if is_update:
  1910. # 更新 list_entity
  1911. if not list_entity[entity_index].pointer_person:
  1912. list_entity[entity_index].pointer_person = []
  1913. list_entity[entity_index].pointer_person.append(match[1])
  1914. linked_person = []
  1915. linked_persons_with = []
  1916. for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]:
  1917. if company_entity.pointer_person:
  1918. for _person in company_entity.pointer_person:
  1919. linked_person.append(_person)
  1920. linked_persons_with.append(company_entity)
  1921. # 一个公司对应多个联系人的补充
  1922. person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
  1923. person_entitys = person_entitys[::-1]
  1924. for index in range(len(person_entitys)):
  1925. entity = person_entitys[index]
  1926. prepare_link = []
  1927. if entity not in linked_person:
  1928. prepare_link.append(entity)
  1929. last_person = entity
  1930. for after_index in range(index + 1, min(len(person_entitys), index + 5)):
  1931. after_entity = person_entitys[after_index]
  1932. if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5:
  1933. if after_entity in linked_person:
  1934. _index = linked_person.index(after_entity)
  1935. with_company = linked_persons_with[_index]
  1936. for i in range(len(PackDict["Project"]["roleList"])):
  1937. if PackDict["Project"]["roleList"][i].role_name == "tenderee":
  1938. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0:
  1939. for item in prepare_link:
  1940. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  1941. for _p in person_phone:
  1942. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  1943. with_company.pointer_person.append(item)
  1944. linked_person.append(item)
  1945. elif PackDict["Project"]["roleList"][i].role_name == "agency":
  1946. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1:
  1947. for item in prepare_link:
  1948. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  1949. for _p in person_phone:
  1950. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  1951. with_company.pointer_person.append(item)
  1952. linked_person.append(item)
  1953. else:
  1954. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text:
  1955. for item in prepare_link:
  1956. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  1957. for _p in person_phone:
  1958. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  1959. with_company.pointer_person.append(item)
  1960. linked_person.append(item)
  1961. break
  1962. else:
  1963. prepare_link.append(after_entity)
  1964. last_person = after_entity
  1965. continue
  1966. # 统一同类角色的属性
  1967. if PackDict.get("Project"):
  1968. for i in range(len(PackDict["Project"]["roleList"])):
  1969. # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
  1970. for _entity in list_entity:
  1971. if _entity.entity_type in ['org','company']:
  1972. is_similar = False
  1973. # entity_text相同
  1974. if _entity.entity_text==PackDict["Project"]["roleList"][i].entity_text:
  1975. is_similar = True
  1976. # entity.label为【0,1】
  1977. if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict["Project"]["roleList"][i].role_name:
  1978. is_similar = True
  1979. if is_similar:
  1980. linked_entitys = _entity.linked_entitys
  1981. if linked_entitys:
  1982. for linked_entity in linked_entitys:
  1983. pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else []
  1984. for _pointer_person in pointer_person:
  1985. _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
  1986. for _p in _phone:
  1987. if (_pointer_person.entity_text,_p) not in PackDict["Project"]["roleList"][i].linklist:
  1988. PackDict["Project"]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))
  1989. # "roleList"中联系人电话去重
  1990. for i in range(len(PackDict["Project"]["roleList"])):
  1991. # print(123, PackDict["Project"]["roleList"][i].linklist)
  1992. # 带有联系人的电话
  1993. with_person = [person_phone[1] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[0]]
  1994. # 带有电话的联系人
  1995. with_phone = [person_phone[0] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[1]]
  1996. remove_list = []
  1997. for item in PackDict["Project"]["roleList"][i].linklist:
  1998. if not item[0]:
  1999. if item[1] in with_person:
  2000. # 删除重复的无联系人电话
  2001. remove_list.append(item)
  2002. elif not item[1]:
  2003. if item[0] in with_phone:
  2004. remove_list.append(item)
  2005. for _item in remove_list:
  2006. PackDict["Project"]["roleList"][i].linklist.remove(_item)
  2007. # 联系人——电子邮箱链接
  2008. temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
  2009. temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
  2010. new_temporary_list3 = []
  2011. for _split in new_split_list:
  2012. temp_list = []
  2013. for _entity in temporary_list3:
  2014. if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
  2015. _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
  2016. temp_list.append(_entity)
  2017. elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
  2018. break
  2019. new_temporary_list3.append(temp_list)
  2020. # print(new_temporary_list3)
  2021. match_list3 = []
  2022. for split_index in range(len(new_temporary_list3)):
  2023. split_entitys = new_temporary_list3[split_index]
  2024. for index in range(len(split_entitys)):
  2025. entity = split_entitys[index]
  2026. if entity.entity_type == 'person':
  2027. match_nums = 0
  2028. for after_index in range(index + 1, min(len(split_entitys), index + 4)):
  2029. after_entity = split_entitys[after_index]
  2030. if match_nums > 2:
  2031. break
  2032. if after_entity.entity_type == 'email':
  2033. distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
  2034. tokens_num_dict[entity.sentence_index] + entity.end_index)
  2035. sentence_distance = after_entity.sentence_index - entity.sentence_index
  2036. if sentence_distance == 0:
  2037. if distance < 100:
  2038. if (entity.label == 0 and after_entity.label == 1) or (
  2039. entity.label == 1 and after_entity.label == 2):
  2040. distance = distance / 100
  2041. value = (-1 / 2 * (distance ** 2)) / 10000
  2042. match_list3.append(Match(entity, after_entity, value))
  2043. match_nums += 1
  2044. else:
  2045. if distance < 60:
  2046. if (entity.label == 0 and after_entity.label == 1) or (
  2047. entity.label == 1 and after_entity.label == 2):
  2048. distance = distance / 100
  2049. value = (-1 / 2 * (distance ** 2)) / 10000
  2050. match_list3.append(Match(entity, after_entity, value))
  2051. match_nums += 1
  2052. # 前向查找匹配
  2053. # if not match_nums:
  2054. if index != 0:
  2055. previous_entity = split_entitys[index - 1]
  2056. if previous_entity.entity_type == 'email':
  2057. if previous_entity.sentence_index == entity.sentence_index:
  2058. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  2059. tokens_num_dict[
  2060. previous_entity.sentence_index] + previous_entity.end_index)
  2061. if distance < 30:
  2062. # 距离相等时,前向添加处罚值
  2063. # distance += 1
  2064. # 前向 没有 /10000
  2065. value = (-1 / 2 * (distance ** 2))
  2066. match_list3.append(Match(entity, previous_entity, value))
  2067. # print(match_list3)
  2068. # km算法分配求解
  2069. result3 = dispatch(match_list3)
  2070. for match in result3:
  2071. match_person = match[0]
  2072. match_email = match[1]
  2073. match_person.pointer_email = match_email
  2074. # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
  2075. # temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
  2076. # other_person = [] # 阈值以上的联系人列表
  2077. # link_person = [] # 有电话没联系上角色的person列表
  2078. # other_ent = []
  2079. # link_ent = []
  2080. # found_person = False
  2081. # ent_list = []
  2082. # for entity in list_entity:
  2083. # if entity.entity_type in ['org','company','person']:
  2084. # ent_list.append(entity)
  2085. # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
  2086. # #for list_index in range(len(ent_list)):
  2087. # #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
  2088. # #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
  2089. # #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
  2090. # # 2020/11/25增加确定角色联系人判断
  2091. # sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
  2092. # # 招标/代理在同一句中交叉情况的处理
  2093. # for index in range(len(ent_list)):
  2094. # entity = ent_list[index]
  2095. # if entity.entity_text in roleSet and entity.label in [0, 1] and index+3<len(ent_list):
  2096. # if entity.sentence_index==ent_list[index+1].sentence_index==ent_list[index+2].sentence_index==ent_list[index+3].sentence_index:
  2097. # if ent_list[index+1].begin_index - entity.end_index < 30:
  2098. # if ent_list[index+1].entity_text in roleSet and ent_list[index+1].label in [0, 1] and entity.label!=ent_list[index+1].label:
  2099. # if ent_list[index+2].entity_type=="person" and ent_list[index+3].entity_type=="person" and \
  2100. # ent_list[index+2].label==3 and ent_list[index+3].label==3:
  2101. # ent_list[index + 1], ent_list[index + 2] = ent_list[index + 2], ent_list[index + 1]
  2102. #
  2103. #
  2104. # for index in range(len(ent_list)):
  2105. # entity = ent_list[index]
  2106. # if entity.entity_type=="person":
  2107. # if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
  2108. # continue
  2109. # if entity.values[entity.label]>on_value_person:
  2110. # if str(entity.label)=="1":
  2111. # for i in range(len(PackDict["Project"]["roleList"])):
  2112. # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  2113. # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  2114. # link_person.append(entity.entity_text)
  2115. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  2116. # # add pointer_person
  2117. # for _entity in list_entity:
  2118. # if dict_role_id.get(str(_entity.label))=="tenderee":
  2119. # for i in range(len(PackDict["Project"]["roleList"])):
  2120. # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  2121. # _entity.pointer_person = entity
  2122. # elif str(entity.label)=="2":
  2123. # for i in range(len(PackDict["Project"]["roleList"])):
  2124. # if PackDict["Project"]["roleList"][i].role_name=="agency":
  2125. # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  2126. # link_person.append(entity.entity_text)
  2127. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  2128. # # add pointer_person
  2129. # for _entity in list_entity:
  2130. # if dict_role_id.get(str(_entity.label))=="agency":
  2131. # for i in range(len(PackDict["Project"]["roleList"])):
  2132. # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  2133. # _entity.pointer_person = entity
  2134. # elif str(entity.label)=="3":
  2135. # if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
  2136. # continue
  2137. # #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
  2138. # other_person.append(entity.entity_text)
  2139. # temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
  2140. #
  2141. # #if entity.entity_text in roleSet:
  2142. # if entity.entity_text in roleSet:
  2143. # if entity.label in [0,1]:
  2144. # other_ent.append(entity.entity_text)
  2145. # temp_ent_list.append((entity.entity_text, entity.label,entity))
  2146. # for behind_index in range(index+1, len(ent_list)):
  2147. # entity_after = ent_list[behind_index]
  2148. # if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
  2149. # break
  2150. # if entity_after.values is not None:
  2151. # if entity_after.entity_type=="person":
  2152. # if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
  2153. # break
  2154. # if entity_after.values[entity_after.label]>on_value_person:
  2155. # if str(entity_after.label)=="1":
  2156. # for i in range(len(PackDict["Project"]["roleList"])):
  2157. # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  2158. # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  2159. # link_person.append(entity_after.entity_text)
  2160. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  2161. # elif str(entity_after.label)=="2":
  2162. # for i in range(len(PackDict["Project"]["roleList"])):
  2163. # if PackDict["Project"]["roleList"][i].role_name=="agency":
  2164. # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  2165. # link_person.append(entity_after.entity_text)
  2166. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  2167. # elif str(entity_after.label)=="3":
  2168. # if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
  2169. # break
  2170. # elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
  2171. # break
  2172. # for pack in PackDict.keys():
  2173. # for i in range(len(PackDict[pack]["roleList"])):
  2174. # if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
  2175. # #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
  2176. # #break
  2177. # PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  2178. # link_person.append(entity_after.entity_text)
  2179. # #add pointer_person
  2180. # entity.pointer_person = entity_after
  2181. #
  2182. # not_link_person = [person for person in other_person if person not in link_person]
  2183. # not_link_ent = [ent for ent in other_ent if ent not in link_ent]
  2184. # if len(not_link_person) > 0 and len(not_link_ent) > 0 :
  2185. # item = temp_ent_list
  2186. # for i in range(len(item)):
  2187. # if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
  2188. # if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
  2189. # item[i+1], item[i+2] = item[i+2], item[i+1]
  2190. # for i in range(len(item)-1, -1, -1):
  2191. # if item[i][0] in not_link_ent:
  2192. # for pack in PackDict.keys():
  2193. # for role in PackDict[pack]["roleList"]:
  2194. # if role.entity_text == item[i][0] and len(role.linklist) < 1:
  2195. # for j in range(i+1, len(item)):
  2196. # if item[j][0] in not_link_person:
  2197. # role.linklist.append(item[j][:2])
  2198. # #add pointer_person
  2199. # item[i][2].pointer_person = item[j][2]
  2200. # break
  2201. # else:
  2202. # break
  2203. # # 电话没有联系人的处理
  2204. # role_with_no_phone = []
  2205. # for i in range(len(PackDict["Project"]["roleList"])):
  2206. # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
  2207. # if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人
  2208. # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
  2209. # else:
  2210. # phone_nums = 0
  2211. # for link in PackDict["Project"]["roleList"][i].linklist:
  2212. # if link[1]:
  2213. # phone_nums += 1
  2214. # break
  2215. # if not phone_nums:
  2216. # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
  2217. # if role_with_no_phone:
  2218. # phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"]
  2219. # # phone_with_person = [phone for phone in phone_with_person if phone]
  2220. #
  2221. # dict_index_sentence = {}
  2222. # for _sentence in list_sentence:
  2223. # dict_index_sentence[_sentence.sentence_index] = _sentence
  2224. # new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
  2225. # for index in range(len(new_entity_list)):
  2226. # entity = new_entity_list[index]
  2227. # if entity.entity_text in role_with_no_phone:
  2228. # e_sentence = dict_index_sentence[entity.sentence_index]
  2229. # entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40]
  2230. # entity_right = "".join(entity_right)
  2231. # if index+1<len(new_entity_list) and entity_right.find(new_entity_list[index+1].entity_text)>-1:
  2232. # entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)]
  2233. # have_phone = re.findall(phone,entity_right)
  2234. # if have_phone:
  2235. # _phone = have_phone[0]
  2236. # phone_begin = entity_right.find(_phone)
  2237. # if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]):
  2238. # # entity.person_phone = _phone
  2239. # for i in range(len(PackDict["Project"]["roleList"])):
  2240. # if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text:
  2241. # PackDict["Project"]["roleList"][i].linklist.append(('', _phone))
  2242. #寻找多标段招标金额
  2243. p_entity = len(list_entity)-1
  2244. set_tenderer_money = set()
  2245. list_tenderer_money = [] #2021/7/16 新增列表,倒序保存所有中标金额
  2246. unit_list = [] #2021/8/17 新增,保存金额单位
  2247. #遍历所有实体
  2248. while(p_entity>=0):
  2249. entity = list_entity[p_entity]
  2250. if entity.entity_type=="money":
  2251. # 2021/12/03 添加成本警戒线、保证金
  2252. if entity.notes in ['保证金', '成本警戒线']:
  2253. packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index,
  2254. "money-" + str(entity.label), MAX_DIS=2, DIRECT="L")
  2255. if packagePointer is None:
  2256. packageName = "Project"
  2257. else:
  2258. packageName = packagePointer.entity_text
  2259. if packageName == "Project":
  2260. # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  2261. # PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
  2262. if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
  2263. PackDict["Project"]["bond"] = float(entity.entity_text)
  2264. elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
  2265. PackDict["Project"]["cost_warning"] = float(entity.entity_text)
  2266. else:
  2267. if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
  2268. PackDict[packageName]["bond"] = float(entity.entity_text)
  2269. elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
  2270. PackDict[packageName]["cost_warning"] = float(entity.entity_text)
  2271. elif entity.values[entity.label]>=on_value:
  2272. if str(entity.label)=="1":
  2273. set_tenderer_money.add(float(entity.entity_text))
  2274. list_tenderer_money.append(float(entity.entity_text)) # 2021/7/16 新增列表,倒序保存所有中标金额
  2275. unit_list.append(entity.money_unit)
  2276. # if str(entity.label)=="0":
  2277. if str(entity.label)=="0" and entity.notes!='总投资':
  2278. '''
  2279. if p_entity>0:
  2280. p_before = list_entity[p_entity-1]
  2281. if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
  2282. p_entity -= 1
  2283. continue
  2284. '''
  2285. packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
  2286. if packagePointer is None:
  2287. packageName = "Project"
  2288. else:
  2289. packageName = packagePointer.entity_text
  2290. if packageName=="Project":
  2291. # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  2292. # PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
  2293. if entity.values[entity.label]>on_value:
  2294. PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
  2295. PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
  2296. else:
  2297. PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
  2298. PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
  2299. #add pointer_tendereeMoney
  2300. packagePointer.pointer_tendereeMoney = entity
  2301. p_entity -= 1
  2302. #删除一个机构有多个角色的数据
  2303. #删除重复人、概率不回传
  2304. final_roleList = []
  2305. list_pop = []
  2306. set_tenderer_role = set()
  2307. dict_pack_tenderer_money = dict()
  2308. for pack in PackDict.keys():
  2309. #删除无效包
  2310. if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
  2311. list_pop.append(pack)
  2312. for i in range(len(PackDict[pack]["roleList"])):
  2313. if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
  2314. if PackDict[pack]["roleList"][i].money==0:
  2315. set_tenderer_role.add(PackDict[pack]["roleList"][i])
  2316. dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
  2317. #找到包的中投标金额
  2318. for _index in range(len(PackageList)):
  2319. if "hit" in PackageList[_index]:
  2320. for _hit in list(PackageList[_index]["hit"]):
  2321. _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
  2322. if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
  2323. dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
  2324. #只找到一个中标人和中标金额
  2325. if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
  2326. list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
  2327. list(set_tenderer_role)[0].money_unit = unit_list[0]
  2328. # print('一个中标人一个金额:', list(set_tenderer_money)[0])
  2329. #找到一个中标人和多个招标金额
  2330. if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
  2331. _maxMoney = 0
  2332. _sumMoney = 0
  2333. for _m in list(set_tenderer_money):
  2334. _sumMoney += _m
  2335. if _m>_maxMoney:
  2336. _maxMoney = _m
  2337. if _sumMoney/_maxMoney==2:
  2338. list(set_tenderer_role)[0].money = _maxMoney
  2339. # print('一人多金额分项合计 取最大金额:', _maxMoney)
  2340. else:
  2341. # list(set_tenderer_role)[0].money = _maxMoney
  2342. if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
  2343. list(set_tenderer_role)[0].money = min(list_tenderer_money)
  2344. list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))]
  2345. # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money))
  2346. else:
  2347. list(set_tenderer_role)[0].money = list_tenderer_money[-1] # 2021/7/16 修改 不是单价合计方式取第一个中标金额
  2348. list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位
  2349. # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1])
  2350. #每个包都只找到一个金额
  2351. _flag_pack_money = True
  2352. for k,v in dict_pack_tenderer_money.items():
  2353. if len(v[1])!=1:
  2354. _flag_pack_money = False
  2355. if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
  2356. for k,v in dict_pack_tenderer_money.items():
  2357. v[0].money = list(v[1])[0]
  2358. # print('k,v in dict_pack_tenderer_money.items', k, v)
  2359. # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
  2360. for pack in PackDict.keys():
  2361. for i in range(len(PackDict[pack]["roleList"])):
  2362. if PackDict[pack]["tendereeMoney"] > 0:
  2363. # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
  2364. if float(PackDict[pack]["roleList"][i].money) >10000000 and \
  2365. float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
  2366. PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
  2367. # print('招标金额校正中标金额')
  2368. # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别)
  2369. for pack in PackDict.keys():
  2370. for i in range(len(PackDict[pack]["roleList"])):
  2371. if PackDict[pack]["tendereeMoney"] > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
  2372. if float(PackDict[pack]["roleList"][i].money) < 1000 and \
  2373. float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \
  2374. float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000:
  2375. PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) * 10000
  2376. # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
  2377. for pack in PackDict.keys():
  2378. tmp_moneys = []
  2379. for i in range(len(PackDict[pack]["roleList"])):
  2380. if float(PackDict[pack]["roleList"][i].money) >100000:
  2381. tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
  2382. if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
  2383. for i in range(len(PackDict[pack]["roleList"])):
  2384. if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
  2385. PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
  2386. # print('通过其他中标人投标金额校正中标金额')
  2387. for item in list_pop:
  2388. PackDict.pop(item)
  2389. # 公告中只有"招标人"且无"联系人"链接时
  2390. if len(PackDict)==1:
  2391. k = list(PackDict.keys())[0]
  2392. if len(PackDict[k]["roleList"])==1:
  2393. if PackDict[k]["roleList"][0].role_name == "tenderee":
  2394. if not PackDict[k]["roleList"][0].linklist:
  2395. get_contacts = False
  2396. if not get_contacts:
  2397. # 根据大纲Outline类召回联系人
  2398. for outline in list_outline:
  2399. if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary):
  2400. for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
  2401. if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
  2402. t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
  2403. if t_person.person_phone:
  2404. _phone = [p.entity_text for p in t_person.person_phone]
  2405. for _p in _phone:
  2406. PackDict[k]["roleList"][0].linklist.append((t_person.entity_text, _p))
  2407. get_contacts = True
  2408. break
  2409. elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \
  2410. words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
  2411. break
  2412. if not get_contacts:
  2413. sentence_phone = phone.findall(outline.outline_text)
  2414. if sentence_phone:
  2415. PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
  2416. get_contacts = True
  2417. break
  2418. if not get_contacts:
  2419. # 直接取文中倒数第一个联系人
  2420. for _entity in temporary_list2[::-1]:
  2421. if _entity.entity_type=='person' and _entity.label==3:
  2422. if _entity.person_phone:
  2423. _phone = [p.entity_text for p in _entity.person_phone]
  2424. for _p in _phone:
  2425. PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
  2426. get_contacts = True
  2427. break
  2428. if not get_contacts:
  2429. # 如果文中只有一个“phone”实体,则直接取为联系人电话
  2430. if len(phone_entitys) == 1:
  2431. PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
  2432. get_contacts = True
  2433. if not get_contacts:
  2434. # 通过大纲Outline类直接取电话
  2435. if len(new_split_list) > 1:
  2436. for _start, _end in new_split_list:
  2437. temp_sentence = _content[_start:_end]
  2438. sentence_outline = temp_sentence.split(",::")[0]
  2439. if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
  2440. sentence_phone = phone.findall(temp_sentence)
  2441. if sentence_phone:
  2442. PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
  2443. get_contacts = True
  2444. break
  2445. if not get_contacts:
  2446. # 通过正则提取句子段落进行提取电话
  2447. contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?"
  2448. tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}"
  2449. contact_pattern_list = [tenderee_pattern + contacts_person,
  2450. "(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person,
  2451. "(?:项目|采购)[^。,]{0,4}" + contacts_person,
  2452. "(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}" + contacts_person, ]
  2453. for _pattern in contact_pattern_list:
  2454. get_tenderee_contacts = False
  2455. for regular_match in re.finditer(_pattern, _content):
  2456. match_text = _content[regular_match.end():regular_match.end() + 40]
  2457. match_text = match_text.split("。")[0]
  2458. sentence_phone = phone.findall(match_text)
  2459. if sentence_phone:
  2460. PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
  2461. get_tenderee_contacts = True
  2462. break
  2463. if get_tenderee_contacts:
  2464. break
  2465. for pack in PackDict.keys():
  2466. for i in range(len(PackDict[pack]["roleList"])):
  2467. PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
  2468. return PackDict
  2469. def initPackageAttr(RoleList,PackageSet):
  2470. '''
  2471. @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
  2472. '''
  2473. packDict = dict()
  2474. packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}
  2475. for item in list(PackageSet):
  2476. packDict[item] = {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}
  2477. for item in RoleList:
  2478. if packDict[item.packageName]["code"] =="":
  2479. packDict[item.packageName]["code"] = item.packageCode
  2480. # packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
  2481. packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
  2482. return packDict
  2483. def getPackageRoleMoney(list_sentence,list_entity,list_outline):
  2484. '''
  2485. @param:
  2486. list_sentence:文章的句子list
  2487. list_entity:文章的实体list
  2488. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  2489. '''
  2490. # print("=1")
  2491. theRole = getRoleList(list_sentence,list_entity)
  2492. if not theRole:
  2493. return []
  2494. RoleList,RoleSet,PackageList,PackageSet = theRole
  2495. '''
  2496. for item in PackageList:
  2497. # print(item)
  2498. '''
  2499. PackDict = initPackageAttr(RoleList, PackageSet)
  2500. PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline)
  2501. return PackDict
  2502. def turnBidWay(bidway):
  2503. if bidway in ("邀请招标","采购方式:邀请"):
  2504. return "邀请招标"
  2505. elif bidway in ("询价","询单","询比","采购方式:询价"):
  2506. return "询价"
  2507. elif bidway in ("竞谈","竞争性谈判","公开竞谈"):
  2508. return "竞争性谈判"
  2509. elif bidway in ("竞争性磋商","磋商"):
  2510. return "竞争性磋商"
  2511. elif bidway in ("竞价","竞标","电子竞价","以电子竞价","电子书面竞投"):
  2512. return "竞价"
  2513. elif bidway in ("公开招标","网上电子投标","网上招标","采购方式:公开","招标为其他"):
  2514. return "公开招标"
  2515. elif bidway in ("单一来源"):
  2516. return "单一来源"
  2517. elif bidway in ("比选"):
  2518. return "比选"
  2519. else:
  2520. return "其他"
  2521. my_time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
  2522. import time
  2523. def my_timeFormat(_time):
  2524. current_year = time.strftime("%Y",time.localtime())
  2525. all_match = re.finditer(my_time_format_pattern,_time)
  2526. time_list = []
  2527. for _match in all_match:
  2528. if len(_match.group())>0:
  2529. legal = True
  2530. year = ""
  2531. month = ""
  2532. day = ""
  2533. for k,v in _match.groupdict().items():
  2534. if k=="year":
  2535. year = v
  2536. if k=="month":
  2537. month = v
  2538. if k=="day":
  2539. day = v
  2540. if year!="":
  2541. if len(year)==2:
  2542. year = "20"+year
  2543. if int(year)>int(current_year):
  2544. legal = False
  2545. else:
  2546. legal = False
  2547. if month!="":
  2548. if int(month)>12:
  2549. legal = False
  2550. else:
  2551. legal = False
  2552. if day!="":
  2553. if int(day)>31:
  2554. legal = False
  2555. else:
  2556. legal = False
  2557. if legal:
  2558. # return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
  2559. time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
  2560. return time_list
  2561. def getTimeAttributes(list_entity,list_sentence):
  2562. time_entitys = [i for i in list_entity if i.entity_type=='time']
  2563. time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
  2564. list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
  2565. dict_time = {
  2566. "time_release": [], # 1 发布时间
  2567. "time_bidopen": [], # 2 开标时间
  2568. "time_bidclose": [], # 3 截标时间
  2569. 'time_bidstart': [], # 12 投标(开始)时间、响应文件接收(开始)时间
  2570. 'time_publicityStart': [], # 4 公示开始时间(公示时间、公示期)
  2571. 'time_publicityEnd': [], # 5 公示截止时间
  2572. 'time_getFileStart': [], # 6 文件获取开始时间(文件获取时间)
  2573. 'time_getFileEnd': [], # 7 文件获取截止时间
  2574. 'time_registrationStart': [], # 8 报名开始时间(报名时间)
  2575. 'time_registrationEnd': [], # 9 报名截止时间
  2576. 'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间)
  2577. 'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间
  2578. 'time_commencement':[] , #13 开工日期
  2579. 'time_completion': [] # 14 竣工日期
  2580. }
  2581. last_sentence_index = 0
  2582. last_time_type = ""
  2583. last_time_index = {
  2584. 'time_bidstart':"time_bidclose",
  2585. 'time_publicityStart':"time_publicityEnd",
  2586. 'time_getFileStart':"time_getFileEnd",
  2587. 'time_registrationStart':"time_registrationEnd",
  2588. 'time_earnestMoneyStart':"time_earnestMoneyEnd",
  2589. 'time_commencement':"time_completion",
  2590. }
  2591. for entity in time_entitys:
  2592. sentence_text = list_sentence[entity.sentence_index].sentence_text
  2593. entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
  2594. entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
  2595. label_prob = entity.values[entity.label]
  2596. entity_text = entity.entity_text
  2597. extract_time = my_timeFormat(entity_text)
  2598. if extract_time:
  2599. if re.search("至|到", entity_left):
  2600. if entity.sentence_index == last_sentence_index:
  2601. time_type = last_time_index.get(last_time_type)
  2602. if time_type:
  2603. dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10))
  2604. last_time_type = ""
  2605. continue
  2606. if entity.label!=0:
  2607. if entity.label==1 and label_prob>0.5:
  2608. dict_time['time_release'].append((extract_time[0],label_prob))
  2609. last_time_type = 'time_release'
  2610. elif entity.label==2 and label_prob>0.5:
  2611. dict_time['time_bidopen'].append((extract_time[0],label_prob))
  2612. last_time_type = 'time_bidopen'
  2613. elif entity.label==3 and label_prob>0.5:
  2614. dict_time['time_bidclose'].append((extract_time[0],label_prob))
  2615. last_time_type = 'time_bidclose'
  2616. elif entity.label==12 and label_prob>0.5:
  2617. if len(extract_time)==1:
  2618. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  2619. dict_time['time_bidclose'].append((extract_time[0], label_prob))
  2620. last_time_type = 'time_bidclose'
  2621. else:
  2622. dict_time['time_bidstart'].append((extract_time[0], label_prob))
  2623. last_time_type = 'time_bidstart'
  2624. else:
  2625. dict_time['time_bidstart'].append((extract_time[0],label_prob))
  2626. dict_time['time_bidclose'].append((extract_time[1],label_prob))
  2627. last_time_type = ''
  2628. elif entity.label==4 and label_prob>0.5:
  2629. if len(extract_time)==1:
  2630. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  2631. dict_time['time_publicityEnd'].append((extract_time[0], label_prob))
  2632. last_time_type = 'time_publicityEnd'
  2633. else:
  2634. dict_time['time_publicityStart'].append((extract_time[0], label_prob))
  2635. last_time_type = 'time_publicityStart'
  2636. else:
  2637. dict_time['time_publicityStart'].append((extract_time[0],label_prob))
  2638. dict_time['time_publicityEnd'].append((extract_time[1],label_prob))
  2639. last_time_type = ''
  2640. elif entity.label==5 and label_prob>0.5:
  2641. if len(extract_time)==1:
  2642. dict_time['time_publicityEnd'].append((extract_time[0], label_prob))
  2643. last_time_type = 'time_publicityEnd'
  2644. else:
  2645. dict_time['time_publicityStart'].append((extract_time[0],label_prob))
  2646. dict_time['time_publicityEnd'].append((extract_time[1],label_prob))
  2647. last_time_type = ''
  2648. elif entity.label==6 and label_prob>0.5:
  2649. if len(extract_time)==1:
  2650. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  2651. dict_time['time_getFileEnd'].append((extract_time[0], label_prob))
  2652. last_time_type = 'time_getFileEnd'
  2653. else:
  2654. dict_time['time_getFileStart'].append((extract_time[0], label_prob))
  2655. last_time_type = 'time_getFileStart'
  2656. else:
  2657. dict_time['time_getFileStart'].append((extract_time[0],label_prob))
  2658. dict_time['time_getFileEnd'].append((extract_time[1],label_prob))
  2659. last_time_type = ''
  2660. elif entity.label==7 and label_prob>0.5:
  2661. if len(extract_time)==1:
  2662. dict_time['time_getFileEnd'].append((extract_time[0], label_prob))
  2663. last_time_type = 'time_getFileEnd'
  2664. else:
  2665. dict_time['time_getFileStart'].append((extract_time[0],label_prob))
  2666. dict_time['time_getFileEnd'].append((extract_time[1],label_prob))
  2667. last_time_type = ''
  2668. elif entity.label==8 and label_prob>0.5:
  2669. if len(extract_time)==1:
  2670. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  2671. dict_time['time_registrationEnd'].append((extract_time[0], label_prob))
  2672. last_time_type = 'time_registrationEnd'
  2673. else:
  2674. dict_time['time_registrationStart'].append((extract_time[0], label_prob))
  2675. last_time_type = 'time_registrationStart'
  2676. else:
  2677. dict_time['time_registrationStart'].append((extract_time[0],label_prob))
  2678. dict_time['time_registrationEnd'].append((extract_time[1],label_prob))
  2679. last_time_type = ''
  2680. elif entity.label==9 and label_prob>0.5:
  2681. if len(extract_time)==1:
  2682. dict_time['time_registrationEnd'].append((extract_time[0], label_prob))
  2683. last_time_type = 'time_registrationEnd'
  2684. else:
  2685. dict_time['time_registrationStart'].append((extract_time[0],label_prob))
  2686. dict_time['time_registrationEnd'].append((extract_time[1],label_prob))
  2687. last_time_type = ''
  2688. elif entity.label==10 and label_prob>0.5:
  2689. if len(extract_time)==1:
  2690. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  2691. dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob))
  2692. last_time_type = 'time_earnestMoneyEnd'
  2693. else:
  2694. dict_time['time_earnestMoneyStart'].append((extract_time[0], label_prob))
  2695. last_time_type = 'time_earnestMoneyStart'
  2696. else:
  2697. dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob))
  2698. dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob))
  2699. last_time_type = ''
  2700. elif entity.label==11 and label_prob>0.5:
  2701. if len(extract_time)==1:
  2702. dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob))
  2703. last_time_type = 'time_earnestMoneyEnd'
  2704. else:
  2705. dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob))
  2706. dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob))
  2707. last_time_type = ''
  2708. elif entity.label==13 and label_prob>0.5:
  2709. if len(extract_time)==1:
  2710. if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
  2711. dict_time['time_completion'].append((extract_time[0], label_prob))
  2712. last_time_type = 'time_completion'
  2713. else:
  2714. dict_time['time_commencement'].append((extract_time[0], label_prob))
  2715. last_time_type = 'time_commencement'
  2716. else:
  2717. dict_time['time_commencement'].append((extract_time[0],label_prob))
  2718. dict_time['time_completion'].append((extract_time[1],label_prob))
  2719. last_time_type = ''
  2720. elif entity.label==14 and label_prob>0.5:
  2721. if len(extract_time)==1:
  2722. dict_time['time_completion'].append((extract_time[0], label_prob))
  2723. last_time_type = 'time_completion'
  2724. else:
  2725. dict_time['time_commencement'].append((extract_time[0],label_prob))
  2726. dict_time['time_completion'].append((extract_time[1],label_prob))
  2727. last_time_type = ''
  2728. else:
  2729. last_time_type = ""
  2730. else:
  2731. last_time_type = ""
  2732. else:
  2733. last_time_type = ""
  2734. last_sentence_index = entity.sentence_index
  2735. result_dict = dict((key,"") for key in dict_time.keys())
  2736. for time_type,value in dict_time.items():
  2737. list_time = dict_time[time_type]
  2738. if list_time:
  2739. list_time.sort(key=lambda x:x[1],reverse=True)
  2740. result_dict[time_type] = list_time[0][0]
  2741. return result_dict
  2742. def getOtherAttributes(list_entity):
  2743. dict_other = {"moneysource":"",
  2744. "person_review":[],
  2745. "serviceTime":"",
  2746. "product":[],
  2747. "total_tendereeMoney":0,
  2748. "total_tendereeMoneyUnit":''}
  2749. for entity in list_entity:
  2750. if entity.entity_type == 'bidway':
  2751. dict_other["bidway"] = turnBidWay(entity.entity_text)
  2752. elif entity.entity_type=='moneysource':
  2753. dict_other["moneysource"] = entity.entity_text
  2754. elif entity.entity_type=='serviceTime':
  2755. dict_other["serviceTime"] = entity.entity_text
  2756. elif entity.entity_type=="person" and entity.label ==4:
  2757. dict_other["person_review"].append(entity.entity_text)
  2758. elif entity.entity_type=='product':
  2759. dict_other["product"].append(entity.entity_text)
  2760. elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
  2761. dict_other["total_tendereeMoney"] = float(entity.entity_text)
  2762. dict_other["total_tendereeMoneyUnit"] = entity.money_unit
  2763. dict_other["product"] = list(set(dict_other["product"]))
  2764. return dict_other
  2765. def getMoneyRange(RoleList):
  2766. pass
  2767. def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
  2768. '''
  2769. @param:
  2770. list_sentence:所有文章的句子list
  2771. list_entity:所有文章的实体list
  2772. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  2773. '''
  2774. result = []
  2775. for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
  2776. RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
  2777. result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
  2778. **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
  2779. "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
  2780. "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
  2781. return result
if __name__=="__main__":
    # Disabled legacy debugging script.  Everything from here to the closing
    # quotes is two adjacent string literals, i.e. a no-op expression: it is
    # never executed.  The quoted code fetched (doc_id, prem) rows from a
    # database, called getPackageRoleMoney per document and dumped the results
    # into an HTML table ("getAttribute.html").
    # NOTE(review): the quoted call getPackageRoleMoney(doc_id) does not match
    # the three-argument signature defined in this file — confirm before reviving.
    '''
    conn = getConnection()
    cursor = conn.cursor()
    #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
    sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
    result = []
    cursor.execute(sql)
    rows = cursor.fetchall()
    count = 0
    for row in rows:
        count += 1
        # print(count)
        doc_id = row[0]
        roleList = getPackageRoleMoney(doc_id)
        result.append([doc_id,str(roleList),row[1]])
    ''''''
    with codecs.open("getAttribute.html","w",encoding="utf8") as f:
        f.write('<html><head>\
        <meta http-equiv="Content-Type"\
        content="text/html; charset=UTF-8">\
        </head>\
        <body bgcolor="#FFFFFF">\
        <table border="1">\
        <tr>\
        <td>doc_id</td>\
        <td>角色</td>\
        </tr>')
        for item in result:
            f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
        f.write("</table></body>")
    '''