getAttributes.py 111 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022
  1. from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
  2. from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
  3. import re
  4. import copy
  5. import math
  6. import pandas as pd
  7. import os
  8. from scipy.optimize import linear_sum_assignment
  9. from BiddingKG.dl.interface.Entitys import Match
  10. import numpy as np
  11. def getTheRole(entity,role_list):
  12. '''
  13. @summary:根据实体名称拿到index
  14. @param:
  15. entity:实体名称
  16. role_list:角色list
  17. @return:该实体所在下标
  18. '''
  19. for role_index in range(len(role_list)):
  20. if entity in role_list[role_index]:
  21. return role_index
  22. return None
  23. dict_role_id = {"0":"tenderee",
  24. "1":"agency",
  25. "2":"win_tenderer",
  26. "3":"second_tenderer",
  27. "4":"third_tenderer"}
  28. def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
  29. '''
  30. @param:
  31. packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
  32. sentence_index:实体所在的句子
  33. begin_index:实体所在句子的起始位置
  34. @return:公司实体所属的包
  35. @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
  36. '''
  37. '''
  38. if len(packageList)==0:
  39. return None
  40. before_index = None
  41. after_index = None
  42. equal_index = None
  43. equal_count = 0
  44. for pack_index in range(len(packageList)):
  45. if packageList[pack_index][1]>sentence_index and after_index is None:
  46. after_index = pack_index
  47. if packageList[pack_index][1]<sentence_index:
  48. before_index = pack_index
  49. if packageList[pack_index][1]==sentence_index and equal_index is None:
  50. equal_index = pack_index
  51. #当前句子和之前句子未找到包
  52. if before_index is None and equal_index is None:
  53. return None
  54. else:
  55. if after_index is None:
  56. end_index = len(packageList)
  57. else:
  58. end_index = after_index
  59. #只在当前句子找到一个包号
  60. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  61. return packageList[end_index-1][0]
  62. else:
  63. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  64. if packageList[i][2]>int(begin_index):
  65. if packageList[i-1][4]:
  66. return packageList[i-1][0]
  67. else:
  68. if packageList[i][4]:
  69. return packageList[i-1][0]
  70. else:
  71. return packageList[i][0]
  72. return packageList[end_index-1][0]
  73. '''
  74. if len(packageList)==0:
  75. return None,False
  76. list_legalPack = []
  77. for pack_index in range(len(packageList)):
  78. if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
  79. continue
  80. if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
  81. continue
  82. if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index)) and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
  83. if MAX_DIS is not None:
  84. if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
  85. list_legalPack.append(pack_index)
  86. else:
  87. list_legalPack.append(pack_index)
  88. # if (packageList[pack_index]["scope"][0][0] < sentence_index
  89. # or (packageList[pack_index]["scope"][0][0] == sentence_index
  90. # and packageList[pack_index]["scope"][0][1] <= begin_index))
  91. # and (packageList[pack_index]["scope"][1][0] > sentence_index
  92. # or (packageList[pack_index]["scope"][1][0] == sentence_index
  93. # and packageList[pack_index]["scope"][1][1] >= begin_index)):
  94. # pass
  95. _flag = True
  96. for _index in list_legalPack:
  97. if roleid in packageList[_index]["hit"]:
  98. continue
  99. else:
  100. _flag = False
  101. packageList[_index]["hit"].add(roleid)
  102. return packageList[_index]["pointer"],_flag
  103. if len(list_legalPack)>0:
  104. return packageList[0]["pointer"],_flag
  105. return None,False
  106. # Generate the legal combinations
  # NOTE(review): this listing has lost its indentation (every line sits at the
  # same column behind a viewer line number), so the nesting described in the
  # remarks below is inferred from the statements themselves -- confirm it
  # against the repository copy before relying on it.
  # get_legal_comba looks up dict_role_combination[packageName] for every
  # package, hands that value to circle_package, optionally prunes the result,
  # joins the per-package results through circle_pageages and finally returns
  # _list_real_comba, a list of dicts whose keys look like "<package>$$<role>".
  107. def get_legal_comba(list_entity,dict_role_combination):
  108. # Collect every legal combination within one package
  # circle_package walks the roles of one package and the candidate entities of
  # each role; the empty string "" is skipped as a candidate.
  109. def circle_package(_dict_legal_combination):
  110. list_dict_role_first = []
  111. for _role in _dict_legal_combination:
  112. if len(list_dict_role_first)==0:
  113. for _entity in _dict_legal_combination[_role]:
  114. if _entity !="":
  115. list_dict_role_first.append({_role:_entity})
  116. else:
  117. list_dict_role_after = []
  118. _find_count = 0
  119. for _entity in _dict_legal_combination[_role]:
  120. if _entity !="":
  121. for _dict in list_dict_role_first:
  122. _flag = True
  123. for _key1 in _dict:
  124. if _entity==_dict[_key1]:
  125. # Changed: the tenderee and the agency may be the same entity
  126. if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
  127. _flag = True
  128. else:
  129. _flag = False
  130. if _flag:
  131. _find_count += 1
  132. _new_dict = copy.copy(_dict)
  133. _new_dict[_role] = _entity
  # Growth guard: stop adding once more than 100000 combinations are collected.
  134. if len(list_dict_role_after)>100000:
  135. break
  136. list_dict_role_after.append(_new_dict)
  137. else:
  138. # 2021/5/25 update: same entity (entity_text) with different roles
  139. if len(list_dict_role_after) > 100000:
  140. break
  # NOTE(review): _new_dict is built here, but the append below stores
  # {_role:_entity} rather than _new_dict -- confirm this is intended.
  141. for _dict in list_dict_role_first:
  142. for _key1 in _dict:
  143. if _entity == _dict[_key1]:
  144. _new_dict = copy.copy(_dict)
  145. _new_dict.pop(_key1)
  146. _new_dict[_role] = _entity
  147. list_dict_role_after.append({_role:_entity})
  148. if len(list_dict_role_after)==0:
  149. pass
  150. else:
  151. list_dict_role_first.extend(list_dict_role_after)
  152. return list_dict_role_first
  # recursive_package is only referenced by a commented-out call further down
  # and by its own recursion.
  153. def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
  154. last_layer = False
  155. # An empty combination returns an empty result
  156. if len(_dict_legal_combination.keys())==0:
  157. return []
  158. # Flip the flag once the recursion reaches the last layer
  159. if len(_dict_legal_combination.keys())==1:
  160. last_layer = True
  161. # Take one role and iterate over its candidates
  162. _key_role = list(_dict_legal_combination.keys())[0]
  163. for item in _dict_legal_combination[_key_role]:
  164. copy_dict_one_selution = copy.copy(dict_one_selution)
  165. copy_dict_legal_combination = {}
  166. copy_set_legal_entity = copy.copy(set_legal_entity)
  167. # Copy all remaining roles for the next round of recursion
  168. for _key in _dict_legal_combination.keys():
  169. if _key!=_key_role:
  170. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  171. # Changed: the tenderee and the agency may be the same entity
  172. if item !="":
  173. _flag = True
  174. if str(_key_role) in ["0","1"]:
  175. for _key_flag in copy_dict_one_selution:
  176. if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
  177. _flag = False
  178. else:
  179. for _key_flag in copy_dict_one_selution:
  180. if copy_dict_one_selution[_key_flag]==item:
  181. _flag = False
  182. if _flag:
  183. copy_dict_one_selution[_key_role] = item
  184. '''
  185. if item not in copy_set_legal_entity:
  186. if item !="":
  187. copy_dict_one_selution[_key_role] = item
  188. '''
  189. copy_set_legal_entity.add(item)
  190. if last_layer:
  191. list_all_selution.append(copy_dict_one_selution)
  192. else:
  193. recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
  194. # Recursively combine the results of the individual packages
  # recursive_packages is likewise only referenced by a commented-out call and
  # by its own recursion.
  195. def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
  196. last_layer = False
  197. if len(_dict_legal_combination.keys())==0:
  198. return []
  199. if len(_dict_legal_combination.keys())==1:
  200. last_layer = True
  201. _key_pack = list(_dict_legal_combination.keys())[0]
  202. for item in _dict_legal_combination[_key_pack]:
  203. copy_dict_one_selution = copy.copy(dict_one_selution)
  204. copy_dict_legal_combination = {}
  205. for _key in _dict_legal_combination.keys():
  206. if _key!=_key_pack:
  207. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  208. for _key_role in item.keys():
  209. copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
  210. if last_layer:
  211. list_all_selution.append(copy_dict_one_selution)
  212. else:
  213. recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
  214. return list_all_selution
  215. # Iteratively build all cross-package combinations
  # Each per-package combination is re-keyed as "<package>$$<role>" and merged
  # with every combination accumulated from the packages handled before it.
  216. def circle_pageages(_dict_legal_combination):
  217. list_all_selution = []
  218. for _key_pack in _dict_legal_combination.keys():
  219. list_key_selution = []
  220. for item in _dict_legal_combination[_key_pack]:
  221. _dict = dict()
  222. for _key_role in item.keys():
  223. _dict[_key_pack+"$$"+_key_role] = item[_key_role]
  224. list_key_selution.append(_dict)
  225. if len(list_all_selution)==0:
  226. list_all_selution = list_key_selution
  227. else:
  228. _list_all_selution = []
  229. for item_1 in list_all_selution:
  230. for item_2 in list_key_selution:
  231. _list_all_selution.append(dict(item_1,**item_2))
  232. list_all_selution = _list_all_selution
  233. return list_all_selution
  234. # Collect the parsed result of each package
  235. _dict_legal_combination = {}
  236. for packageName in dict_role_combination.keys():
  237. _list_all_selution = []
  238. # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
  239. _list_all_selution = circle_package(dict_role_combination[packageName])
  240. '''
  241. print("===1")
  242. print(packageName)
  243. for item in _list_all_selution:
  244. print(item)
  245. print("===2")
  246. '''
  247. # Remove combinations that are subsets of another one
  248. list_all_selution_simple = []
  249. _list_set_all_selution = []
  250. for item_selution in _list_all_selution:
  251. item_set_selution = set()
  252. for _key in item_selution.keys():
  253. item_set_selution.add((_key,item_selution[_key]))
  254. _list_set_all_selution.append(item_set_selution)
  # With more than 1000 candidates the pairwise subset pruning is skipped and
  # the unpruned list is stored for this package.
  255. if len(_list_set_all_selution)>1000:
  256. _dict_legal_combination[packageName] = _list_all_selution
  257. continue
  258. for i in range(len(_list_set_all_selution)):
  259. be_included = False
  260. for j in range(len(_list_set_all_selution)):
  261. if i!=j:
  # Set i is fully contained in set j and the two differ in size, i.e. i is a
  # proper subset of j.
  262. if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
  263. be_included = True
  264. if not be_included:
  265. list_all_selution_simple.append(_list_all_selution[i])
  266. _dict_legal_combination[packageName] = list_all_selution_simple
  267. _list_final_comba = []
  268. # Permute/combine the per-package results
  # _comba_count is the product of the per-package candidate counts.
  269. _comba_count = 1
  270. for _key in _dict_legal_combination.keys():
  271. _comba_count *= len(_dict_legal_combination[_key])
  272. # If the count is too large, keep only the most probable one per package
  273. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  274. if _comba_count>250:
  275. new_dict_legal_combination = dict()
  276. for _key_pack in _dict_legal_combination.keys():
  277. MAX_PROB = -1000
  278. _MAX_PROB_COMBA = None
  279. for item in _dict_legal_combination[_key_pack]:
  280. # print(_key_pack,item)
  281. _dict = dict()
  282. for _key in item.keys():
  283. _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
  284. _prob = getSumExpectation(dict_pack_entity_prob, _dict)
  285. if _prob>MAX_PROB:
  286. MAX_PROB = _prob
  287. _MAX_PROB_COMBA = [item]
  288. if _MAX_PROB_COMBA is not None:
  289. new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
  290. _dict_legal_combination = new_dict_legal_combination
  291. #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
  292. _list_final_comba = circle_pageages(_dict_legal_combination)
  293. # Apart from the Project package (tenderee and agency), other packages cannot conflict
  294. # Check whether an entity appears in both the Project package and another package; if so, split the combination
  295. _list_real_comba = []
  296. for dict_item in _list_final_comba:
  297. set_project = set()
  298. set_other = set()
  299. for _key in list(dict_item.keys()):
  300. if _key.split("$$")[0]=="Project":
  301. set_project.add(dict_item[_key])
  302. else:
  303. set_other.add(dict_item[_key])
  304. set_common = set_project&set_other
  # When some entity is shared, two variants are emitted: dict_project keeps the
  # shared entity on its "Project" keys, dict_not_project keeps it on the other
  # packages' keys; entries without a conflict are copied into both.
  305. if len(set_common)>0:
  306. dict_project = {}
  307. dict_not_project = {}
  308. for _key in list(dict_item.keys()):
  309. if dict_item[_key] in set_common:
  310. if str(_key.split("$$")[0])=="Project":
  311. dict_project[_key] = dict_item[_key]
  312. else:
  313. dict_not_project[_key] = dict_item[_key]
  314. else:
  315. dict_project[_key] = dict_item[_key]
  316. dict_not_project[_key] = dict_item[_key]
  317. _list_real_comba.append(dict_project)
  318. _list_real_comba.append(dict_not_project)
  319. else:
  320. _list_real_comba.append(dict_item)
  321. return _list_real_comba
  322. def get_dict_entity_prob(list_entity,on_value=0.5):
  323. dict_pack_entity_prob = {}
  324. for entity in list_entity:
  325. if entity.entity_type in ['org','company']:
  326. values = entity.values
  327. role_prob = float(values[int(entity.label)])
  328. _key = entity.packageName+"$$"+str(entity.label)
  329. if role_prob>=on_value and str(entity.label)!="5":
  330. _key_prob = _key+"$text$"+entity.entity_text
  331. if _key_prob in dict_pack_entity_prob:
  332. if role_prob>dict_pack_entity_prob[_key_prob][1]:
  333. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  334. else:
  335. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  336. return dict_pack_entity_prob
  337. #计算合计期望
  338. def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
  339. '''
  340. expect = 0
  341. for entity in list_entity:
  342. if entity.entity_type in ['org','company']:
  343. values = entity.values
  344. role_prob = float(values[int(entity.label)])
  345. _key = entity.packageName+"$$"+str(entity.label)
  346. if role_prob>on_value and str(entity.label)!="5":
  347. if _key in combination.keys() and combination[_key]==entity.entity_text:
  348. expect += math.pow(role_prob,4)
  349. else:
  350. expect -= math.pow(role_prob,4)
  351. '''
  352. #修改为同一个实体只取对应包-角色的最大的概率值
  353. expect = 0
  354. dict_entity_prob = {}
  355. for _key_pack_entity in dict_pack_entity_prob:
  356. _key_pack = _key_pack_entity.split("$text$")[0]
  357. role_prob = dict_pack_entity_prob[_key_pack_entity][1]
  358. if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
  359. if _key_pack_entity in dict_entity_prob.keys():
  360. if dict_entity_prob[_key_pack_entity]<role_prob:
  361. dict_entity_prob[_key_pack_entity] = role_prob
  362. else:
  363. dict_entity_prob[_key_pack_entity] = role_prob
  364. else:
  365. if _key_pack_entity in dict_entity_prob.keys():
  366. if dict_entity_prob[_key_pack_entity]>-role_prob:
  367. dict_entity_prob[_key_pack_entity] = -role_prob
  368. else:
  369. dict_entity_prob[_key_pack_entity] = -role_prob
  370. # for entity in list_entity:
  371. # if entity.entity_type in ['org','company']:
  372. # values = entity.values
  373. # role_prob = float(values[int(entity.label)])
  374. # _key = entity.packageName+"$$"+str(entity.label)
  375. # if role_prob>=on_value and str(entity.label)!="5":
  376. # if _key in combination.keys() and combination[_key]==entity.entity_text:
  377. # _key_prob = _key+entity.entity_text
  378. # if _key_prob in dict_entity_prob.keys():
  379. # if dict_entity_prob[_key_prob]<role_prob:
  380. # dict_entity_prob[_key_prob] = role_prob
  381. # else:
  382. # dict_entity_prob[_key_prob] = role_prob
  383. # else:
  384. # _key_prob = _key+entity.entity_text
  385. # if _key_prob in dict_entity_prob.keys():
  386. # if dict_entity_prob[_key_prob]>-role_prob:
  387. # dict_entity_prob[_key_prob] = -role_prob
  388. # else:
  389. # dict_entity_prob[_key_prob] = -role_prob
  390. for _key in dict_entity_prob.keys():
  391. symbol = 1 if dict_entity_prob[_key]>0 else -1
  392. expect += symbol*math.pow(dict_entity_prob[_key],2)
  393. return expect
  # NOTE(review): this listing has lost its indentation (every line sits at the
  # same column behind a viewer line number); the nesting assumed by the
  # remarks below is inferred and should be confirmed against the repository.
  394. def getRoleList(list_sentence,list_entity,on_value = 0.5):
  395. '''
  396. @summary: Search tree: obtain every non-contradictory role combination and return the one with the largest summed expectation
  397. @param:
  398. list_sentence: all sentences of the article
  399. list_entity: all entities of the article
  400. on_value: probability threshold
  401. @return: the role list of the article
  402. '''
  # getPackagesFromArticle returns None for an empty sentence list; that None
  # is passed straight back to the caller.
  403. pack = getPackagesFromArticle(list_sentence,list_entity)
  404. if pack is None:
  405. return None
  406. PackageList,PackageSet,dict_PackageCode = pack
  407. # Collect all possible cases
  408. dict_role_combination = {}
  409. # Work out each entity's packageName and packageCode
  410. for entity in list_entity:
  411. if entity.entity_type in ['org','company']:
  412. # Filter out short entities (the check below drops entity_text of 3 characters or fewer)
  413. if len(entity.entity_text)<=3:
  414. continue
  415. values = entity.values
  416. role_prob = float(values[int(entity.label)])
  417. if role_prob>=on_value and str(entity.label)!="5":
  # Labels "0" and "1" always go to the "Project" package.
  418. if str(entity.label) in ["0","1"]:
  419. packageName = "Project"
  420. else:
  421. if len(PackageSet)>0:
  # getPackage is called with its default MAX_DIS/DIRECT; its second return
  # value is discarded.
  422. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"role-"+str(entity.label))
  423. if packagePointer is None:
  424. #continue
  425. packageName = "Project"
  426. else:
  427. #add pointer_pack
  428. entity.pointer_pack = packagePointer
  429. packageName = packagePointer.entity_text
  430. else:
  431. packageName = "Project"
  # find_flag is assigned here but never read in this function.
  432. find_flag = False
  433. if packageName in dict_PackageCode.keys():
  434. packageCode = dict_PackageCode[packageName]
  435. else:
  436. packageCode = ""
  437. entity.packageCode = packageCode
  438. role_name = dict_role_id.get(str(entity.label))
  439. entity.roleName = role_name
  440. entity.packageName = packageName
  441. if entity.packageName in dict_role_combination.keys():
  442. if str(entity.label) in dict_role_combination[entity.packageName].keys():
  443. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  444. else:
  445. dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
  446. else:
  447. dict_role_combination[entity.packageName] = {}
  448. # Initialise every role with the empty placeholder
  449. roleIds = [0,1,2,3,4]
  450. for _roleId in roleIds:
  451. dict_role_combination[entity.packageName][str(_roleId)] = set([""])
  452. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  453. list_real_comba = get_legal_comba(list_entity,dict_role_combination)
  454. # Pick the combination with the largest expectation
  # max_expect starts at -100, so index 0 is kept unless some combination
  # scores above that value.
  455. max_index = 0
  456. max_expect = -100
  457. _index = 0
  458. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  459. for item_combination in list_real_comba:
  460. expect = getSumExpectation(dict_pack_entity_prob, item_combination)
  461. if expect>max_expect:
  462. max_index = _index
  463. max_expect = expect
  464. _index += 1
  465. RoleList = []
  466. RoleSet = set()
  467. if len(list_real_comba)>0:
  # Keys of the chosen combination have the form "<packageName>$$<label>".
  468. for _key in list_real_comba[max_index].keys():
  469. packageName = _key.split("$$")[0]
  470. label = _key.split("$$")[1]
  471. role_name = dict_role_id.get(str(label))
  472. entity_text = list_real_comba[max_index][_key]
  473. if packageName in dict_PackageCode.keys():
  474. packagecode = dict_PackageCode.get(packageName)
  475. else:
  476. packagecode = ""
  # NOTE(review): the meaning of the trailing PREM arguments (0, 0, 0.0, []) is
  # not visible in this file -- check the PREM definition.
  477. RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
  478. RoleSet.add(entity_text)
  479. # Use the best tree to correct the role-to-package links in list_entity
  # pointer_pack is cleared when no PREM in RoleList has both the same package
  # name and the same entity text.
  480. for _entity in list_entity:
  481. if _entity.pointer_pack is not None:
  482. _pack_name = _entity.pointer_pack.entity_text
  483. _find_flag = False
  484. for _prem in RoleList:
  485. if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
  486. _find_flag = True
  487. if not _find_flag:
  488. _entity.pointer_pack = None
  489. return RoleList,RoleSet,PackageList,PackageSet
  490. def getPackageScopePattern():
  491. '''
  492. @summary: 获取包的作用域关键词
  493. '''
  494. df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
  495. pattern = "("
  496. for item in df["list_word"]:
  497. item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
  498. pattern += item+"|"
  499. pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
  500. return pattern
  501. pattern_packageScope = getPackageScopePattern()
  502. def getPackagesFromArticle(list_sentence,list_entity):
  503. '''
  504. @param:
  505. list_sentence:文章的句子list
  506. @summary: 将包的信息插入list_entity中
  507. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  508. '''
  509. if len(list_sentence)==0:
  510. return None
  511. PackageList = []
  512. PackageList_scope = []
  513. PackageSet = set()
  514. dict_packageCode = dict()
  515. package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
  516. package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
  517. package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
  518. # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
  519. other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
  520. win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整
  521. model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整
  522. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
  523. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
  524. # 纯数字类型的包号统一,例如:'01','1'
  525. re_digital = re.compile("^\d+$")
  526. def changeIndexFromWordToWords(tokens,word_index):
  527. '''
  528. @summary:转换某个字的字偏移为词偏移
  529. '''
  530. before_index = 0
  531. after_index = 0
  532. for i in range(len(tokens)):
  533. after_index = after_index+len(tokens[i])
  534. if before_index<=word_index and after_index>=word_index:
  535. return i
  536. before_index = after_index
  537. package_names = []
  538. def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
  539. '''
  540. @summary:抽取包附近的标段号
  541. @param:
  542. tokens:包所在句子的分词
  543. word_index:包所在字偏移
  544. size:左右各取多少个词
  545. pattern:提取标段号的正则
  546. @return: type:string,meaning:标段号
  547. '''
  548. index = changeIndexFromWordToWords(tokens,word_index)
  549. if index<size:
  550. begin = index
  551. else:
  552. begin = index-size
  553. if index+size>len(tokens):
  554. end = len(tokens)
  555. else:
  556. end = index+size
  557. #拿到左右两边的词语组成短语
  558. text = "".join(tokens[begin:end])
  559. #在短语中的字偏移
  560. new_word_index = word_index-len("".join(tokens[:begin]))
  561. min_distance = len(text)
  562. packageCode = None
  563. for the_iter in re.finditer(pattern,text):
  564. #算出最小距离
  565. distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
  566. if distance<min_distance:
  567. min_distance = distance
  568. packageCode = the_iter.group(1)
  569. return packageCode
  570. #从标段介绍表格中提取包名和包号
  571. for i in range(len(list_sentence)):
  572. content = list_sentence[i].sentence_text
  573. names = re.findall(package_name_pattern,content)
  574. if names == []:
  575. names = re.findall(other_package_pattern, content)
  576. N_names = re.findall(package_N_name_pattern,content)
  577. if len(names)==1 and len(N_names)==1:
  578. package_names.append([names[0][-1],N_names[0][-1]])
  579. for i in range(len(list_sentence)):
  580. PackageList_item = []
  581. PackageList_item_scope = []
  582. content = list_sentence[i].sentence_text
  583. tokens = list_sentence[i].tokens
  584. _names = []
  585. # 2021/6/23 包名称去重
  586. for name in package_names:
  587. if name not in _names:
  588. _names.append(name)
  589. # for name in package_names[:20]:
  590. for name in _names[:20]:
  591. for index in findAllIndex(name[0],content):
  592. temp_package_number = re.findall(number_pattern,name[1])[0]
  593. if re.search(re_digital,temp_package_number):
  594. temp_package_number = str(int(temp_package_number))
  595. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
  596. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
  597. code = extractPackageCode(tokens, index)
  598. if code is not None:
  599. dict_packageCode[temp_package_number] = code
  600. PackageSet.add(temp_package_number)
  601. for iter in re.finditer(package_number_pattern,content):
  602. temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
  603. if re.search(re_digital, temp_package_number):
  604. temp_package_number = str(int(temp_package_number))
  605. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  606. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  607. code = extractPackageCode(tokens, iter.span()[0])
  608. if code is not None:
  609. dict_packageCode[temp_package_number] = code
  610. PackageSet.add(temp_package_number)
  611. #识别packageScope
  612. for iter in re.finditer(pattern_packageScope,content):
  613. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  614. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  615. PackageList_item_scope = PackageList_item +PackageList_item_scope
  616. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  617. PackageList_scope = PackageList_scope+PackageList_item_scope
  618. PackageList_item.sort(key=lambda x:x["sentence_index"])
  619. #PackageList = PackageList+PackageList_item
  620. #不作为包
  621. # if len(PackageSet)==0:
  622. # for i in range(len(list_sentence)):
  623. # PackageList_item = []
  624. # PackageList_item_scope = []
  625. # content = list_sentence[i].sentence_text
  626. # tokens = list_sentence[i].tokens
  627. # for iter in re.finditer(other_package_pattern,content):
  628. # temp_package_number = iter.group(2)
  629. # PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  630. # # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  631. # code = extractPackageCode(tokens, iter.span()[0])
  632. # if code is not None:
  633. # dict_packageCode[temp_package_number] = code
  634. # PackageSet.add(temp_package_number)
  635. # #识别packageScope
  636. # for iter in re.finditer(pattern_packageScope,content):
  637. # PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  638. # # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  639. # PackageList_item_scope = PackageList_item +PackageList_item_scope
  640. # PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  641. # PackageList_scope = PackageList_scope+PackageList_item_scope
  642. # PackageList_item.sort(key=lambda x:x["sentence_index"])
  643. # 2020/11/23 大网站规则 调整
  644. if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
  645. for i in range(len(list_sentence)):
  646. PackageList_item = []
  647. PackageList_item_scope = []
  648. content = list_sentence[i].sentence_text
  649. tokens = list_sentence[i].tokens
  650. names = re.findall(other_package_pattern, content)
  651. N_names = re.findall(win_tenderer_pattern, content)
  652. if len(names) != 1 or len(N_names) != 1:
  653. continue
  654. for iter in re.finditer(other_package_pattern,content):
  655. temp_package_number = iter.group(4)
  656. xinghao = re.search(model_pattern, content)
  657. if xinghao:
  658. temp_package_number = temp_package_number + '+' + xinghao.group(2)
  659. # print('新正则采购包名补充',temp_package_number)
  660. if re.search(re_digital,temp_package_number):
  661. temp_package_number = str(int(temp_package_number))
  662. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  663. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  664. code = extractPackageCode(tokens, iter.span()[0])
  665. if code is not None:
  666. dict_packageCode[temp_package_number] = code
  667. PackageSet.add(temp_package_number)
  668. #识别packageScope
  669. for iter in re.finditer(pattern_packageScope,content):
  670. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  671. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  672. PackageList_item_scope = PackageList_item +PackageList_item_scope
  673. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  674. PackageList_scope = PackageList_scope+PackageList_item_scope
  675. PackageList_item.sort(key=lambda x:x["sentence_index"])
  676. pattern_punctuation = "[::()\(\),,。;;]"
  677. for i in range(len(list_sentence)):
  678. for j in range(len(PackageList_scope)):
  679. if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
  680. _flag = False
  681. left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
  682. right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
  683. _left_find = re.findall(pattern_punctuation,left_str)
  684. _right_find = re.findall(pattern_punctuation,right_str)
  685. #print(left_str)
  686. if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
  687. continue
  688. if re.search("划分",right_str[:10]) is not None:
  689. continue
  690. if len(_left_find)>0 and _left_find[-1] in [":",":"]:
  691. _flag = True
  692. if len(_right_find)>0 and _right_find[0] in [":",":"]:
  693. _flag = True
  694. if _flag:
  695. scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
  696. else:
  697. if j==0:
  698. scope_begin = [0,0]
  699. else:
  700. scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
  701. if j==len(PackageList_scope)-1:
  702. scope_end = [PackageList_scope[j]["sentence_index"],changeIndexFromWordToWords(list_sentence[i].tokens, len(list_sentence[i].sentence_text))]
  703. else:
  704. scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
  705. if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
  706. continue
  707. #add package to entity
  708. _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"])
  709. list_entity.append(_pack_entity)
  710. copy_pack = copy.copy(PackageList_scope[j])
  711. copy_pack["scope"] = [scope_begin,scope_end]
  712. copy_pack["hit"] = set()
  713. copy_pack["pointer"] = _pack_entity
  714. PackageList.append(copy_pack)
  715. return PackageList,PackageSet,dict_packageCode
  716. from BiddingKG.dl.relation_extraction.model import Relation_extraction
  717. relationExtraction_model = Relation_extraction()
  718. def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,list_sentence,on_value = 0.5,on_value_person=0.5,sentence_len=4):
  719. '''
  720. @param:
  721. PackDict:文章包dict
  722. roleSet:文章所有角色的公司名称
  723. PackageList:文章的包信息
  724. PackageSet:文章所有包的名称
  725. list_entity:文章所有经过模型处理的实体
  726. on_value:金额模型的阈值
  727. on_value_person:联系人模型的阈值
  728. sentence_len:公司和属性间隔句子的最大长度
  729. @return:添加了属性信息的角色list
  730. '''
  731. #根据roleid添加金额到rolelist中
  def addMoneyByRoleid(packDict, packageName, roleid, money, money_prob):
      """Store a money value on the roles of one package that match a role id.

      The role id is converted with ``str`` and translated to a role name
      through the module-level ``dict_role_id`` mapping.  A matching role is
      only overwritten when ``money_prob`` is strictly greater than the
      probability the role already carries.

      @param packDict: mapping of package name -> dict with a "roleList" entry
      @param packageName: key of the package inside ``packDict`` to update
      @param roleid: role id used for the ``dict_role_id`` lookup
      @param money: money value written to ``role.money``
      @param money_prob: probability written to ``role.money_prob``
      @return: ``packDict`` itself (the roles are modified in place)
      """
      # The target role name does not depend on the loop variable; resolve it once.
      target_role_name = dict_role_id.get(str(roleid))
      for role in packDict[packageName]["roleList"]:
          if role.role_name == target_role_name and money_prob > role.money_prob:
              role.money = money
              role.money_prob = money_prob
      return packDict
  739. #根据实体名称添加金额到rolelist中
  def addMoneyByEntity(packDict, packageName, entity, money, money_prob):
      """Store a money value on the roles of one package that match an entity text.

      Every role in ``packDict[packageName]["roleList"]`` whose ``entity_text``
      equals ``entity`` is updated, but only when ``money_prob`` is strictly
      greater than the probability the role already carries.

      @param packDict: mapping of package name -> dict with a "roleList" entry
      @param packageName: key of the package inside ``packDict`` to update
      @param entity: entity text compared against ``role.entity_text``
      @param money: money value written to ``role.money``
      @param money_prob: probability written to ``role.money_prob``
      @return: ``packDict`` itself (the roles are modified in place)
      """
      for role in packDict[packageName]["roleList"]:
          if role.entity_text == entity and money_prob > role.money_prob:
              role.money = money
              role.money_prob = money_prob
      return packDict
  747. #根据实体名称得到角色
  def getRoleWithText(packDict, entity_text):
      """Look up the role name attached to an entity text.

      Packages are scanned in the iteration order of ``packDict`` and, inside
      each package, roles are scanned in "roleList" order; the first role whose
      ``entity_text`` equals the argument decides the result.

      @param packDict: mapping of package name -> dict with a "roleList" entry
      @param entity_text: entity text to search for
      @return: ``role_name`` of the first matching role, or None when no role matches
      """
      for pack in packDict.values():
          for role in pack["roleList"]:
              if role.entity_text == entity_text:
                  return role.role_name
      return None
  def doesEntityOrLinkedEntity_inRoleSet(entity, RoleSet):
      """Tell whether an entity, or any entity linked to it, is named in ``RoleSet``.

      @param entity: object exposing ``entity_text`` and a list ``linked_entitys``
      @param RoleSet: container of entity texts, tested with ``in``
      @return: True when the text of ``entity`` or of one of its linked entities
               is in ``RoleSet``; False otherwise (previously None on a miss,
               which is equally falsy for existing callers)
      """
      candidates = [entity] + entity.linked_entitys
      return any(candidate.entity_text in RoleSet for candidate in candidates)
  758. p_entity = 0
  759. #遍历所有实体
  760. while(p_entity<len(list_entity)):
  761. entity = list_entity[p_entity]
  762. '''
  763. #招标金额从后往前找
  764. if entity.entity_type=="money":
  765. if entity.values[entity.label]>=on_value:
  766. if str(entity.label)=="0":
  767. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  768. if packagePointer is None:
  769. packageName = "Project"
  770. else:
  771. packageName = packagePointer.entity_text
  772. addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
  773. '''
  774. ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
  775. if entity.entity_type=="person":
  776. if entity.values[entity.label]>=on_value_person:
  777. if str(entity.label)=="1":
  778. for i in range(len(PackDict["Project"]["roleList"])):
  779. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  780. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  781. # add pointer_person
  782. for _entity in list_entity:
  783. if dict_role_id.get(str(_entity.label))=="tenderee":
  784. for i in range(len(PackDict["Project"]["roleList"])):
  785. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  786. _entity.pointer_person = entity
  787. elif str(entity.label)=="2":
  788. for i in range(len(PackDict["Project"]["roleList"])):
  789. if PackDict["Project"]["roleList"][i].role_name=="agency":
  790. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  791. # add pointer_person
  792. for _entity in list_entity:
  793. if dict_role_id.get(str(_entity.label))=="agency":
  794. for i in range(len(PackDict["Project"]["roleList"])):
  795. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  796. _entity.pointer_person = entity
  797. '''
  798. #金额往前找实体
  799. if entity.entity_type=="money":
  800. if entity.values[entity.label]>=on_value:
  801. p_entity_money= p_entity
  802. entity_money = list_entity[p_entity_money]
  803. if len(PackageSet)>0:
  804. packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
  805. if packagePointer is None:
  806. packageName_entity = "Project"
  807. else:
  808. packageName_entity = packagePointer.entity_text
  809. else:
  810. packageName_entity = "Project"
  811. while(p_entity_money>0):
  812. entity_before = list_entity[p_entity_money]
  813. if entity_before.entity_type in ['org','company']:
  814. if str(entity_before.label)=="1":
  815. addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
  816. #add pointer_money
  817. entity_before.pointer_money = entity_money
  818. break
  819. p_entity_money -= 1
  820. #如果实体属于角色集合,则往后找属性
  821. if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
  822. p_entity += 1
  823. #循环查找符合的属性
  824. while(p_entity<len(list_entity)):
  825. entity_after = list_entity[p_entity]
  826. if entity_after.sentence_index-entity.sentence_index>=sentence_len:
  827. p_entity -= 1
  828. break
  829. #若是遇到公司实体,则跳出循环
  830. if entity_after.entity_type in ['org','company']:
  831. p_entity -= 1
  832. break
  833. if entity_after.values is not None:
  834. if entity_after.entity_type=="money":
  835. if entity_after.values[entity_after.label]>=on_value:
  836. '''
  837. #招标金额从后往前找
  838. if str(entity_after.label)=="0":
  839. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  840. if packagePointer is None:
  841. packageName = "Project"
  842. else:
  843. packageName = packagePointer.entity_text
  844. addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
  845. '''
  846. if str(entity_after.label)=="1":
  847. #print(entity_after.entity_text,entity.entity_text)
  848. _list_entitys = [entity]+entity.linked_entitys
  849. if len(PackageSet)>0:
  850. packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
  851. if packagePointer is None:
  852. packageName_entity = "Project"
  853. else:
  854. packageName_entity = packagePointer.entity_text
  855. else:
  856. packageName_entity = "Project"
  857. if str(entity.label) in ["2","3","4"]:
  858. addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
  859. #add pointer_money
  860. entity.pointer_money = entity_after
  861. '''
  862. if entity_after.entity_type=="person":
  863. if entity_after.values[entity_after.label]>=on_value_person:
  864. if str(entity_after.label)=="1":
  865. for i in range(len(roleList)):
  866. if roleList[i].role_name=="tenderee":
  867. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  868. elif str(entity_after.label)=="2":
  869. for i in range(len(roleList)):
  870. if roleList[i].role_name=="agency":
  871. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  872. elif str(entity_after.label)=="3":
  873. _list_entitys = [entity]+entity.linked_entitys
  874. for _entity in _list_entitys:
  875. for i in range(len(roleList)):
  876. if roleList[i].entity_text==_entity.entity_text:
  877. if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
  878. break
  879. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  880. '''
  881. p_entity += 1
  882. p_entity += 1
  883. ''''''
  884. # 通过模型分类的招标/代理联系人
  885. list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
  886. person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
  887. tenderee_contact = set()
  888. tenderee_phone = set()
  889. agency_contact = set()
  890. agency_phone = set()
  891. for _person in person_list:
  892. if _person.label == 1:
  893. tenderee_contact.add(_person.entity_text)
  894. if _person.label == 2:
  895. agency_contact.add(_person.entity_text)
  896. # 正则匹配无 '主体/联系人' 的电话
  897. # 例:"采购人联系方式:0833-5226788,"
  898. re_tenderee_phone = re.compile(
  899. "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
  900. # 电话号码
  901. "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
  902. # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
  903. re_tenderee_phone2 = re.compile(
  904. "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
  905. # 电话号码
  906. "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
  907. re_agent_phone = re.compile(
  908. "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
  909. # 电话号码
  910. "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
  911. re_agent_phone2 = re.compile(
  912. "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
  913. # 电话号码
  914. "(1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,7}|0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|0\d{2,3}[-—-]?[1-9]\d{6,7}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|[1-9]\d{6,7})(?:[^\.]|$)")
  915. content = ""
  916. for _sentence in list_sentence:
  917. content += "".join(_sentence.tokens)
  918. _content = copy.deepcopy(content)
  919. while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content):
  920. content_words = list(content)
  921. for i in re.finditer("(.)(,)([^0-9])", content):
  922. content_words[i.span(2)[0]] = ""
  923. for i in re.finditer("([^0-9])(,)(.)", content):
  924. content_words[i.span(2)[0]] = ""
  925. content = "".join(content_words)
  926. content = re.sub("[::]|[\((]|[\))]", "", content)
  927. _tenderee_phone = re.findall(re_tenderee_phone, content)
  928. # 更新正则确定的角色属性
  929. for i in range(len(PackDict["Project"]["roleList"])):
  930. if PackDict["Project"]["roleList"][i].role_name == "tenderee":
  931. _tenderee_phone = re.findall(re_tenderee_phone, content)
  932. if _tenderee_phone:
  933. for _phone in _tenderee_phone:
  934. PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
  935. tenderee_phone.add(_phone)
  936. _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
  937. if _tenderee_phone2:
  938. for _phone in _tenderee_phone2:
  939. PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
  940. tenderee_phone.add(_phone)
  941. if PackDict["Project"]["roleList"][i].role_name == "agency":
  942. _agent_phone = re.findall(re_agent_phone, content)
  943. if _agent_phone:
  944. for _phone in _agent_phone:
  945. PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
  946. agency_phone.add(_phone)
  947. _agent_phone2 = re.findall(re_agent_phone2, content)
  948. if _agent_phone2:
  949. for _phone in _agent_phone2:
  950. PackDict["Project"]["roleList"][i].linklist.append(("", _phone))
  951. agency_phone.add(_phone)
  952. # km配对方法
  def dispatch(match_list):
      """Choose a one-to-one pairing of main roles and attributes with maximal total weight.

      Each element of ``match_list`` must expose ``main_role``, ``attribute``
      and ``value``.  A weight matrix (main roles x attributes) is filled with
      ``value + 10000`` for every listed pair and 0 elsewhere, and the
      assignment problem is solved with ``linear_sum_assignment``.

      @param match_list: iterable collection of candidate matches
      @return: list of ``(main_role, attribute)`` tuples chosen by the
               assignment; assigned cells whose weight is 0 are left out.
               An empty ``match_list`` yields an empty list.
      """
      if not match_list:
          return []
      # dict.fromkeys de-duplicates while keeping first-seen order, so row/column
      # order (and therefore tie-breaking) no longer depends on hash order.
      main_roles = list(dict.fromkeys(match.main_role for match in match_list))
      attributes = list(dict.fromkeys(match.attribute for match in match_list))
      # Index maps replace repeated list.index() scans inside the fill loop.
      role_index = {role: idx for idx, role in enumerate(main_roles)}
      attribute_index = {attribute: idx for idx, attribute in enumerate(attributes)}
      label = np.zeros(shape=(len(main_roles), len(attributes)))
      for match in match_list:
          # NOTE(review): the 10000 offset presumably lifts listed pairs above the
          # 0 of unlisted pairs -- confirm against the value range callers produce.
          label[role_index[match.main_role], attribute_index[match.attribute]] = match.value + 10000
      # linear_sum_assignment minimises total cost, so negate to maximise weight.
      cost = -label
      rows, cols = linear_sum_assignment(cost)
      return [
          (main_roles[row], attributes[col])
          for row, col in zip(rows, cols)
          if cost[row, col]
      ]
  969. # 正则提取电话号码实体
  970. key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
  971. phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
  972. '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
  973. '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
  974. '0[^0]\d{1,2}[-—-―]\d{7,8}转\d{1,4}|'
  975. '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
  976. '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
  977. '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
  978. '0[^0]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
  979. '[\(|\(]0[^0]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
  980. '[2-9]\d{6,7}')
  981. phone_entitys = []
  982. for _sentence in list_sentence:
  983. sentence_text = _sentence.sentence_text
  984. list_tokenbegin = []
  985. begin = 0
  986. for i in range(0, len(_sentence.tokens)):
  987. list_tokenbegin.append(begin)
  988. begin += len(str(_sentence.tokens[i]))
  989. list_tokenbegin.append(begin + 1)
  990. res_set = set()
  991. for i in re.finditer(phone, sentence_text):
  992. res_set.add((i.group(), i.start(), i.end()))
  993. # for i in re.finditer(key_word, sentence_text):
  994. # res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
  995. for item in list(res_set):
  996. phone_left = sentence_text[max(0, item[1] - 10):item[1]]
  997. phone_right = sentence_text[item[2]:item[2] + 8]
  998. # 排除“传真号”和其它错误项
  999. if re.search("传,?真|信,?箱|邮,?箱", phone_left):
  1000. if not re.search("电,?话", phone_left):
  1001. continue
  1002. if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
  1003. continue
  1004. if re.search("[.,]\d{2,}", phone_right):
  1005. continue
  1006. for j in range(len(list_tokenbegin)):
  1007. if list_tokenbegin[j] == item[1]:
  1008. begin_index = j
  1009. break
  1010. elif list_tokenbegin[j] > item[1]:
  1011. begin_index = j - 1
  1012. break
  1013. for j in range(begin_index, len(list_tokenbegin)):
  1014. if list_tokenbegin[j] >= item[2]:
  1015. end_index = j - 1
  1016. break
  1017. _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
  1018. item[2])
  1019. phone_entitys.append(_entity)
  def is_company(entity, text):
      """Decide whether a company/org entity is kept or treated as an address/place mention.

      Decision order:
        1. label is not 5 and the score for that label exceeds 0.5 -> True
        2. ``entity.is_tail`` is set -> False
        3. otherwise look at the text just before the entity: if the last five
           non-punctuation characters contain "地址" or "地点" -> False, else True

      @param entity: object exposing ``label``, ``values``, ``is_tail`` and
                     ``wordOffset_begin``
      @param text: text of the sentence containing the entity
      @return: True when the entity is kept, False when it is rejected
      """
      if entity.label != 5 and entity.values[entity.label] > 0.5:
          return True
      # Use the parameter, not the enclosing loop variable `ent`; the old code only
      # worked because the sole caller happened to pass that same loop variable.
      if entity.is_tail:
          return False
      left_context = text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
      # The old pattern had no [...] and so matched only one literal character
      # sequence; a character class strips each punctuation mark individually
      # (ASCII and full-width forms).
      left_context = re.sub(r"[,,()()::]", "", left_context)
      return re.search("地址|地点", left_context[-5:]) is None
  1033. pre_entity = []
  1034. for ent in list_entity:
  1035. if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \
  1036. or (ent.entity_type=='location' and len(ent.entity_text)>5):
  1037. pre_entity.append(ent)
  1038. text_data,pre_data = relationExtraction_model.get_predata(pre_entity+phone_entitys,list_sentence)
  1039. # print(pre_data)
  1040. maxlen = 512
  1041. relation_list = []
  1042. if 0<len(text_data)<=maxlen:
  1043. relation_list = relationExtraction_model.predict(text_data,pre_data)
  1044. else:
  1045. # 公告大于maxlen时,分段预测
  1046. start = 0
  1047. while start<len(pre_data):
  1048. _pre_data = pre_data[start:start+maxlen]
  1049. _text_data = text_data[start:start+maxlen]
  1050. relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
  1051. start = start + maxlen - 100
  1052. # 去重结果
  1053. relation_list = list(set(relation_list))
  1054. # print(relation_list)
  1055. tokens_num_dict = dict()
  1056. last_tokens_num = 0
  1057. for sentence in list_sentence:
  1058. _index = sentence.sentence_index
  1059. if _index == 0:
  1060. tokens_num_dict[_index] = 0
  1061. else:
  1062. tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
  1063. last_tokens_num = len(sentence.tokens)
  1064. right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
  1065. linked_company = set()
  1066. linked_person = set()
  1067. for predicate in ["rel_address","rel_phone","rel_person"]:
  1068. _match_list = []
  1069. _match_combo = []
  1070. for relation in relation_list:
  1071. _subject = relation[0]
  1072. _object = relation[2]
  1073. if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
  1074. if relation[1]==predicate:
  1075. if predicate=="rel_person":
  1076. if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
  1077. continue
  1078. distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
  1079. tokens_num_dict[_subject.sentence_index] + _subject.end_index)
  1080. if distance>0:
  1081. value = (-1 / 2 * (distance ** 2))/10000
  1082. else:
  1083. distance = abs(distance)
  1084. value = (-1 / 2 * (distance ** 2))
  1085. _match_list.append(Match(_subject,_object,value))
  1086. _match_combo.append((_subject,_object))
  1087. match_result = dispatch(_match_list)
  1088. error_list = []
  1089. for mat in list(set(_match_combo)-set(match_result)):
  1090. for temp in match_result:
  1091. if mat[1]==temp[1] and mat[0]!=temp[0]:
  1092. error_list.append(mat)
  1093. break
  1094. result = list(set(_match_combo)-set(error_list))
  1095. if predicate=='rel_person':
  1096. # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接)
  1097. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1098. for combo in result:
  1099. is_continue = False
  1100. if not combo[0].pointer_person:
  1101. combo[0].pointer_person = []
  1102. if combo[1].begin_index<combo[0].begin_index:
  1103. if combo[0].pointer_person:
  1104. for temp in combo[0].pointer_person:
  1105. if temp.begin_index>combo[0].begin_index:
  1106. is_continue = True
  1107. break
  1108. if is_continue: continue
  1109. combo[0].pointer_person.append(combo[1])
  1110. linked_company.add(combo[0])
  1111. linked_person.add(combo[1])
  1112. # print(1,combo[0].entity_text,combo[1].entity_text)
  1113. if predicate=='rel_address':
  1114. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1115. for combo in result:
  1116. if combo[0].pointer_address:
  1117. continue
  1118. combo[0].pointer_address = combo[1]
  1119. # print(2,combo[0].entity_text,combo[1].entity_text)
  1120. if predicate=='rel_phone':
  1121. result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
  1122. for combo in result:
  1123. is_continue = False
  1124. if not combo[0].person_phone:
  1125. combo[0].person_phone = []
  1126. if combo[1].begin_index<combo[0].begin_index:
  1127. if combo[0].person_phone:
  1128. for temp in combo[0].person_phone:
  1129. if temp.begin_index>combo[0].begin_index:
  1130. is_continue = True
  1131. break
  1132. if is_continue: continue
  1133. combo[0].person_phone.append(combo[1])
  1134. if combo[0].label in [1,2]:
  1135. if PackDict.get("Project"):
  1136. for i in range(len(PackDict["Project"]["roleList"])):
  1137. if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \
  1138. or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'):
  1139. PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
  1140. break
  1141. # print(3,combo[0].entity_text,combo[1].entity_text)
  1142. # 更新 PackDict
  1143. for link_p in list(linked_company):
  1144. for k in PackDict.keys():
  1145. for i in range(len(PackDict[k]["roleList"])):
  1146. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1147. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text or link_p.label == 0:
  1148. for per in link_p.pointer_person:
  1149. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1150. if not person_phone:
  1151. if per.entity_text not in agency_contact:
  1152. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1153. continue
  1154. for _p in person_phone:
  1155. if per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
  1156. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1157. elif PackDict[k]["roleList"][i].role_name == "agency":
  1158. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text or link_p.label == 1:
  1159. for per in link_p.pointer_person:
  1160. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1161. if not person_phone:
  1162. if per.entity_text not in tenderee_contact:
  1163. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1164. continue
  1165. for _p in person_phone:
  1166. if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone:
  1167. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1168. else:
  1169. if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
  1170. for per in link_p.pointer_person:
  1171. person_phone = [phone for phone in per.person_phone] if per.person_phone else []
  1172. if not person_phone:
  1173. if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
  1174. PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
  1175. continue
  1176. for _p in person_phone:
  1177. if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
  1178. per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
  1179. PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
  1180. re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
  1181. split_list = [0] * 16
  1182. split_dict = {
  1183. "一、": 1,
  1184. "二、": 2,
  1185. "三、": 3,
  1186. "四、": 4,
  1187. "五、": 5,
  1188. "六、": 6,
  1189. "七、": 7,
  1190. "八、": 8,
  1191. "九、": 9,
  1192. "十、": 10,
  1193. "十一、": 11,
  1194. "十二、": 12,
  1195. "十三、": 13,
  1196. "十四、": 14,
  1197. "十五、": 15
  1198. }
  1199. for item in re.finditer(re_split, _content):
  1200. _index = split_dict.get(item.group()[1:])
  1201. if not split_list[_index]:
  1202. split_list[_index] = item.span()[0] + 1
  1203. split_list = [i for i in split_list if i != 0]
  1204. start = 0
  1205. new_split_list = []
  1206. for idx in split_list:
  1207. new_split_list.append((start, idx))
  1208. start = idx
  1209. new_split_list.append((start, len(_content)))
  1210. # 实体列表按照“公告分段”分组
  1211. words_num_dict = dict()
  1212. last_words_num = 0
  1213. for sentence in list_sentence:
  1214. _index = sentence.sentence_index
  1215. if _index == 0:
  1216. words_num_dict[_index] = 0
  1217. else:
  1218. words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
  1219. last_words_num = len(sentence.sentence_text)
  1220. # 公司-联系人连接(km算法)
  1221. re_phone = re.compile('1[3-9][0-9][-—-]?\d{4}[-—-]?\d{4}|'
  1222. '0\d{2,3}[-—-][1-9]\d{6,7}/[1-9]\d{6,10}|'
  1223. '0\d{2,3}[-—-][1-9]\d{6,7}转\d{1,4}|'
  1224. '0\d{2,3}[-—-]?[1-9]\d{6,7}|'
  1225. '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
  1226. '[1-9]\d{6,7}')
  1227. key_phone = re.compile("联系方式|电话|联系人|负责人")
  1228. temporary_list2 = []
  1229. for entity in list_entity:
  1230. # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False:
  1231. if entity.entity_type in ['org', 'company', 'person']:
  1232. temporary_list2.append(entity)
  1233. temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index))
  1234. new_temporary_list2 = []
  1235. for _split in new_split_list:
  1236. temp_list = []
  1237. for _entity in temporary_list2:
  1238. if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
  1239. _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
  1240. temp_list.append(_entity)
  1241. elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
  1242. break
  1243. new_temporary_list2.append(temp_list)
  1244. # print(new_temporary_list2)
  1245. match_list2 = []
  1246. for split_index in range(len(new_temporary_list2)):
  1247. split_entitys = new_temporary_list2[split_index]
  1248. is_skip = False
  1249. for index in range(len(split_entitys)):
  1250. entity = split_entitys[index]
  1251. if is_skip:
  1252. is_skip = False
  1253. continue
  1254. else:
  1255. if entity.entity_type in ['org', 'company']:
  1256. if entity.label != 5 or entity.entity_text in roleSet:
  1257. match_nums = 0
  1258. for after_index in range(index + 1, min(len(split_entitys), index + 4)):
  1259. after_entity = split_entitys[after_index]
  1260. if after_entity.entity_type in ['person']:
  1261. # 实体为中标人/候选人,联系人已确定类别【1,2】
  1262. if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
  1263. break
  1264. if after_entity.label in [1, 2, 3]:
  1265. distance = (tokens_num_dict[
  1266. after_entity.sentence_index] + after_entity.begin_index) - (
  1267. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1268. sentence_distance = after_entity.sentence_index - entity.sentence_index
  1269. if sentence_distance == 0:
  1270. if distance < 100:
  1271. if (entity.label == 0 and after_entity.label == 1) or (
  1272. entity.label == 1 and after_entity.label == 2):
  1273. distance = distance / 100
  1274. value = (-1 / 2 * (distance ** 2)) / 10000
  1275. match_list2.append(Match(entity, after_entity, value))
  1276. match_nums += 1
  1277. else:
  1278. if distance < 60:
  1279. if (entity.label == 0 and after_entity.label == 1) or (
  1280. entity.label == 1 and after_entity.label == 2):
  1281. distance = distance / 100
  1282. value = (-1 / 2 * (distance ** 2)) / 10000
  1283. match_list2.append(Match(entity, after_entity, value))
  1284. match_nums += 1
  1285. if after_entity.entity_type in ['org', 'company']:
  1286. # 解决在‘地址’中识别出org/company的问题
  1287. # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]:
  1288. if entity.label != 5 and after_index == index + 1 and (
  1289. after_entity.label == entity.label or after_entity.label == 5):
  1290. distance = (tokens_num_dict[
  1291. after_entity.sentence_index] + after_entity.begin_index) - (
  1292. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1293. if distance < 20:
  1294. after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0,
  1295. after_entity.begin_index - 10):after_entity.begin_index]
  1296. after_entity_right = list_sentence[after_entity.sentence_index].tokens[
  1297. after_entity.end_index + 1:after_entity.end_index + 6]
  1298. after_entity_left = "".join(after_entity_left)
  1299. if len(after_entity_left) > 20:
  1300. after_entity_left = after_entity_left[-20:]
  1301. after_entity_right = "".join(after_entity_right)[:10]
  1302. if re.search("地,?址", after_entity_left):
  1303. is_skip = True
  1304. continue
  1305. if re.search("\(|(", after_entity_left) and re.search("\)|)",
  1306. after_entity_right):
  1307. is_skip = True
  1308. continue
  1309. if entity.label in [0, 1] and after_entity.label in [0,
  1310. 1] and entity.label == after_entity.label:
  1311. break
  1312. if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
  1313. index + 1].entity_type == "person":
  1314. break
  1315. if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
  1316. break
  1317. if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
  1318. break
  1319. # 搜索没有联系人的电话
  1320. mid_tokens = []
  1321. is_same_sentence = False
  1322. if index == len(split_entitys) - 1:
  1323. for i in range(entity.sentence_index, len(list_sentence)):
  1324. mid_tokens += list_sentence[i].tokens
  1325. mid_tokens = mid_tokens[entity.end_index + 1:]
  1326. mid_sentence = "".join(mid_tokens)
  1327. have_phone = re.findall(re_phone, mid_sentence)
  1328. if have_phone:
  1329. if re.findall(re_phone, mid_sentence.split("。")[0]):
  1330. is_same_sentence = True
  1331. _phone = have_phone[0]
  1332. phone_begin = mid_sentence.find(_phone)
  1333. if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \
  1334. new_split_list[split_index][1]:
  1335. mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
  1336. if re.search(key_phone, mid_sentence):
  1337. distance = 1
  1338. if is_same_sentence:
  1339. if phone_begin <= 200:
  1340. value = (-1 / 2 * (distance ** 2)) / 10000
  1341. match_list2.append(Match(entity, (entity, _phone), value))
  1342. match_nums += 1
  1343. else:
  1344. if phone_begin <= 60:
  1345. value = (-1 / 2 * (distance ** 2)) / 10000
  1346. match_list2.append(Match(entity, (entity, _phone), value))
  1347. match_nums += 1
  1348. else:
  1349. next_entity = split_entitys[index + 1]
  1350. if entity.sentence_index == next_entity.sentence_index:
  1351. mid_tokens += list_sentence[entity.sentence_index].tokens[
  1352. entity.end_index + 1:next_entity.begin_index]
  1353. else:
  1354. sentence_index = entity.sentence_index
  1355. while sentence_index <= next_entity.sentence_index:
  1356. mid_tokens += list_sentence[sentence_index].tokens
  1357. sentence_index += 1
  1358. mid_tokens = mid_tokens[entity.end_index + 1:-(len(
  1359. list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1]
  1360. mid_sentence = "".join(mid_tokens)
  1361. have_phone = re.findall(re_phone, mid_sentence)
  1362. if have_phone:
  1363. if re.findall(re_phone, mid_sentence.split("。")[0]):
  1364. is_same_sentence = True
  1365. _phone = have_phone[0]
  1366. phone_begin = mid_sentence.find(_phone)
  1367. mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
  1368. if re.search(key_phone, mid_sentence):
  1369. p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
  1370. if next_entity.entity_type == 'person' and _phone in p_phone:
  1371. pass
  1372. else:
  1373. distance = (tokens_num_dict[
  1374. next_entity.sentence_index] + next_entity.begin_index) - (
  1375. tokens_num_dict[entity.sentence_index] + entity.end_index)
  1376. distance = distance / 2
  1377. if is_same_sentence:
  1378. if phone_begin <= 200:
  1379. value = (-1 / 2 * (distance ** 2)) / 10000
  1380. match_list2.append(Match(entity, (entity, _phone), value))
  1381. match_nums += 1
  1382. else:
  1383. if phone_begin <= 60:
  1384. value = (-1 / 2 * (distance ** 2)) / 10000
  1385. match_list2.append(Match(entity, (entity, _phone), value))
  1386. match_nums += 1
  1387. # 实体无匹配时,尝试前向查找匹配
  1388. if not match_nums:
  1389. if entity.label != 5 and entity.values[entity.label] > 0.5 and index != 0:
  1390. previous_entity = split_entitys[index - 1]
  1391. if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
  1392. if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
  1393. continue
  1394. if previous_entity.sentence_index == entity.sentence_index:
  1395. distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
  1396. tokens_num_dict[
  1397. previous_entity.sentence_index] + previous_entity.end_index)
  1398. if distance < 20:
  1399. # 距离相等时,前向添加处罚值
  1400. # distance += 1
  1401. # 前向 没有 /10000
  1402. value = (-1 / 2 * (distance ** 2))
  1403. match_list2.append(Match(entity, previous_entity, value))
  1404. # print(match_list2)
  1405. match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person]
  1406. # print(match_list2)
  1407. # km算法分配求解
  1408. result2 = dispatch(match_list2)
  1409. # print(result2)
  1410. linked_person = []
  1411. linked_persons_with = []
  1412. for match in result2:
  1413. entity = match[0]
  1414. # print(entity.entity_text)
  1415. # print(match.attribute)
  1416. entity_index = list_entity.index(entity)
  1417. is_update = False
  1418. if isinstance(match[1], tuple):
  1419. person_ = ''
  1420. phone_ = [match[1][1]]
  1421. else:
  1422. person_ = match[1].entity_text
  1423. phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
  1424. for k in PackDict.keys():
  1425. for i in range(len(PackDict[k]["roleList"])):
  1426. if PackDict[k]["roleList"][i].role_name == "tenderee":
  1427. if not PackDict[k]["roleList"][i].linklist:
  1428. if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
  1429. if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0:
  1430. if not phone_:
  1431. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  1432. for p in phone_:
  1433. PackDict[k]["roleList"][i].linklist.append((person_, p))
  1434. is_update = True
  1435. elif PackDict[k]["roleList"][i].role_name == "agency":
  1436. if not PackDict[k]["roleList"][i].linklist:
  1437. if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1:
  1438. if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
  1439. if not phone_:
  1440. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  1441. for p in phone_:
  1442. PackDict[k]["roleList"][i].linklist.append((person_, p))
  1443. is_update = True
  1444. else:
  1445. if PackDict[k]["roleList"][i].entity_text == entity.entity_text:
  1446. if not PackDict[k]["roleList"][i].linklist:
  1447. if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \
  1448. person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0:
  1449. if not phone_:
  1450. PackDict[k]["roleList"][i].linklist.append((person_, ""))
  1451. for p in phone_:
  1452. PackDict[k]["roleList"][i].linklist.append((person_, p))
  1453. is_update = True
  1454. if not person_:
  1455. is_update = False
  1456. if is_update:
  1457. # 更新 list_entity
  1458. if not list_entity[entity_index].pointer_person:
  1459. list_entity[entity_index].pointer_person = []
  1460. list_entity[entity_index].pointer_person.append(match[1])
  1461. linked_person.append(match[1])
  1462. linked_persons_with.append(entity)
  1463. # 一个公司对应多个联系人的补充
  1464. person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
  1465. person_entitys = person_entitys[::-1]
  1466. for index in range(len(person_entitys)):
  1467. entity = person_entitys[index]
  1468. prepare_link = []
  1469. if entity not in linked_person:
  1470. prepare_link.append(entity)
  1471. last_person = entity
  1472. for after_index in range(index + 1, min(len(person_entitys), index + 5)):
  1473. after_entity = person_entitys[after_index]
  1474. if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5:
  1475. if after_entity in linked_person:
  1476. _index = linked_person.index(after_entity)
  1477. with_company = linked_persons_with[_index]
  1478. for i in range(len(PackDict["Project"]["roleList"])):
  1479. if PackDict["Project"]["roleList"][i].role_name == "tenderee":
  1480. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0:
  1481. for item in prepare_link:
  1482. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  1483. for _p in person_phone:
  1484. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  1485. with_company.pointer_person.append(item)
  1486. linked_person.append(item)
  1487. elif PackDict["Project"]["roleList"][i].role_name == "agency":
  1488. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1:
  1489. for item in prepare_link:
  1490. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  1491. for _p in person_phone:
  1492. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  1493. with_company.pointer_person.append(item)
  1494. linked_person.append(item)
  1495. else:
  1496. if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text:
  1497. for item in prepare_link:
  1498. person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
  1499. for _p in person_phone:
  1500. PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
  1501. with_company.pointer_person.append(item)
  1502. linked_person.append(item)
  1503. break
  1504. else:
  1505. prepare_link.append(after_entity)
  1506. last_person = after_entity
  1507. continue
  1508. # 统一同类角色的属性
  1509. if PackDict.get("Project"):
  1510. for i in range(len(PackDict["Project"]["roleList"])):
  1511. # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
  1512. for _entity in list_entity:
  1513. if _entity.entity_type in ['org','company']:
  1514. is_similar = False
  1515. # entity_text相同
  1516. if _entity.entity_text==PackDict["Project"]["roleList"][i].entity_text:
  1517. is_similar = True
  1518. # entity.label为【0,1】
  1519. if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict["Project"]["roleList"][i].role_name:
  1520. is_similar = True
  1521. if is_similar:
  1522. linked_entitys = _entity.linked_entitys
  1523. if linked_entitys:
  1524. for linked_entity in linked_entitys:
  1525. pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else []
  1526. for _pointer_person in pointer_person:
  1527. _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
  1528. for _p in _phone:
  1529. if (_pointer_person.entity_text,_p) not in PackDict["Project"]["roleList"][i].linklist:
  1530. PackDict["Project"]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))
  1531. # "roleList"中联系人电话去重
  1532. for i in range(len(PackDict["Project"]["roleList"])):
  1533. # print(123, PackDict["Project"]["roleList"][i].linklist)
  1534. # 带有联系人的电话
  1535. with_person = [person_phone[1] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[0]]
  1536. # 带有电话的联系人
  1537. with_phone = [person_phone[0] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[1]]
  1538. remove_list = []
  1539. for item in PackDict["Project"]["roleList"][i].linklist:
  1540. if not item[0]:
  1541. if item[1] in with_person:
  1542. # 删除重复的无联系人电话
  1543. remove_list.append(item)
  1544. elif not item[1]:
  1545. if item[0] in with_phone:
  1546. remove_list.append(item)
  1547. for _item in remove_list:
  1548. PackDict["Project"]["roleList"][i].linklist.remove(_item)
  1549. # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
  1550. # temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
  1551. # other_person = [] # 阈值以上的联系人列表
  1552. # link_person = [] # 有电话没联系上角色的person列表
  1553. # other_ent = []
  1554. # link_ent = []
  1555. # found_person = False
  1556. # ent_list = []
  1557. # for entity in list_entity:
  1558. # if entity.entity_type in ['org','company','person']:
  1559. # ent_list.append(entity)
  1560. # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
  1561. # #for list_index in range(len(ent_list)):
  1562. # #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
  1563. # #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
  1564. # #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
  1565. # # 2020/11/25增加确定角色联系人判断
  1566. # sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
  1567. # # 招标/代理在同一句中交叉情况的处理
  1568. # for index in range(len(ent_list)):
  1569. # entity = ent_list[index]
  1570. # if entity.entity_text in roleSet and entity.label in [0, 1] and index+3<len(ent_list):
  1571. # if entity.sentence_index==ent_list[index+1].sentence_index==ent_list[index+2].sentence_index==ent_list[index+3].sentence_index:
  1572. # if ent_list[index+1].begin_index - entity.end_index < 30:
  1573. # if ent_list[index+1].entity_text in roleSet and ent_list[index+1].label in [0, 1] and entity.label!=ent_list[index+1].label:
  1574. # if ent_list[index+2].entity_type=="person" and ent_list[index+3].entity_type=="person" and \
  1575. # ent_list[index+2].label==3 and ent_list[index+3].label==3:
  1576. # ent_list[index + 1], ent_list[index + 2] = ent_list[index + 2], ent_list[index + 1]
  1577. #
  1578. #
  1579. # for index in range(len(ent_list)):
  1580. # entity = ent_list[index]
  1581. # if entity.entity_type=="person":
  1582. # if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
  1583. # continue
  1584. # if entity.values[entity.label]>on_value_person:
  1585. # if str(entity.label)=="1":
  1586. # for i in range(len(PackDict["Project"]["roleList"])):
  1587. # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  1588. # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  1589. # link_person.append(entity.entity_text)
  1590. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  1591. # # add pointer_person
  1592. # for _entity in list_entity:
  1593. # if dict_role_id.get(str(_entity.label))=="tenderee":
  1594. # for i in range(len(PackDict["Project"]["roleList"])):
  1595. # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  1596. # _entity.pointer_person = entity
  1597. # elif str(entity.label)=="2":
  1598. # for i in range(len(PackDict["Project"]["roleList"])):
  1599. # if PackDict["Project"]["roleList"][i].role_name=="agency":
  1600. # PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  1601. # link_person.append(entity.entity_text)
  1602. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  1603. # # add pointer_person
  1604. # for _entity in list_entity:
  1605. # if dict_role_id.get(str(_entity.label))=="agency":
  1606. # for i in range(len(PackDict["Project"]["roleList"])):
  1607. # if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  1608. # _entity.pointer_person = entity
  1609. # elif str(entity.label)=="3":
  1610. # if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
  1611. # continue
  1612. # #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
  1613. # other_person.append(entity.entity_text)
  1614. # temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
  1615. #
  1616. # #if entity.entity_text in roleSet:
  1617. # if entity.entity_text in roleSet:
  1618. # if entity.label in [0,1]:
  1619. # other_ent.append(entity.entity_text)
  1620. # temp_ent_list.append((entity.entity_text, entity.label,entity))
  1621. # for behind_index in range(index+1, len(ent_list)):
  1622. # entity_after = ent_list[behind_index]
  1623. # if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
  1624. # break
  1625. # if entity_after.values is not None:
  1626. # if entity_after.entity_type=="person":
  1627. # if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
  1628. # break
  1629. # if entity_after.values[entity_after.label]>on_value_person:
  1630. # if str(entity_after.label)=="1":
  1631. # for i in range(len(PackDict["Project"]["roleList"])):
  1632. # if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  1633. # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  1634. # link_person.append(entity_after.entity_text)
  1635. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  1636. # elif str(entity_after.label)=="2":
  1637. # for i in range(len(PackDict["Project"]["roleList"])):
  1638. # if PackDict["Project"]["roleList"][i].role_name=="agency":
  1639. # PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  1640. # link_person.append(entity_after.entity_text)
  1641. # link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  1642. # elif str(entity_after.label)=="3":
  1643. # if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
  1644. # break
  1645. # elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
  1646. # break
  1647. # for pack in PackDict.keys():
  1648. # for i in range(len(PackDict[pack]["roleList"])):
  1649. # if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
  1650. # #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
  1651. # #break
  1652. # PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  1653. # link_person.append(entity_after.entity_text)
  1654. # #add pointer_person
  1655. # entity.pointer_person = entity_after
  1656. #
  1657. # not_link_person = [person for person in other_person if person not in link_person]
  1658. # not_link_ent = [ent for ent in other_ent if ent not in link_ent]
  1659. # if len(not_link_person) > 0 and len(not_link_ent) > 0 :
  1660. # item = temp_ent_list
  1661. # for i in range(len(item)):
  1662. # if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
  1663. # if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
  1664. # item[i+1], item[i+2] = item[i+2], item[i+1]
  1665. # for i in range(len(item)-1, -1, -1):
  1666. # if item[i][0] in not_link_ent:
  1667. # for pack in PackDict.keys():
  1668. # for role in PackDict[pack]["roleList"]:
  1669. # if role.entity_text == item[i][0] and len(role.linklist) < 1:
  1670. # for j in range(i+1, len(item)):
  1671. # if item[j][0] in not_link_person:
  1672. # role.linklist.append(item[j][:2])
  1673. # #add pointer_person
  1674. # item[i][2].pointer_person = item[j][2]
  1675. # break
  1676. # else:
  1677. # break
  1678. # # 电话没有联系人的处理
  1679. # role_with_no_phone = []
  1680. # for i in range(len(PackDict["Project"]["roleList"])):
  1681. # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
  1682. # if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人
  1683. # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
  1684. # else:
  1685. # phone_nums = 0
  1686. # for link in PackDict["Project"]["roleList"][i].linklist:
  1687. # if link[1]:
  1688. # phone_nums += 1
  1689. # break
  1690. # if not phone_nums:
  1691. # role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
  1692. # if role_with_no_phone:
  1693. # phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"]
  1694. # # phone_with_person = [phone for phone in phone_with_person if phone]
  1695. #
  1696. # dict_index_sentence = {}
  1697. # for _sentence in list_sentence:
  1698. # dict_index_sentence[_sentence.sentence_index] = _sentence
  1699. # new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
  1700. # for index in range(len(new_entity_list)):
  1701. # entity = new_entity_list[index]
  1702. # if entity.entity_text in role_with_no_phone:
  1703. # e_sentence = dict_index_sentence[entity.sentence_index]
  1704. # entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40]
  1705. # entity_right = "".join(entity_right)
  1706. # if index+1<len(new_entity_list) and entity_right.find(new_entity_list[index+1].entity_text)>-1:
  1707. # entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)]
  1708. # have_phone = re.findall(phone,entity_right)
  1709. # if have_phone:
  1710. # _phone = have_phone[0]
  1711. # phone_begin = entity_right.find(_phone)
  1712. # if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]):
  1713. # # entity.person_phone = _phone
  1714. # for i in range(len(PackDict["Project"]["roleList"])):
  1715. # if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text:
  1716. # PackDict["Project"]["roleList"][i].linklist.append(('', _phone))
  1717. #寻找多标段招标金额
  1718. p_entity = len(list_entity)-1
  1719. set_tenderer_money = set()
  1720. #遍历所有实体
  1721. while(p_entity>=0):
  1722. entity = list_entity[p_entity]
  1723. if entity.entity_type=="money":
  1724. if entity.values[entity.label]>=on_value:
  1725. if str(entity.label)=="1":
  1726. set_tenderer_money.add(float(entity.entity_text))
  1727. if str(entity.label)=="0":
  1728. '''
  1729. if p_entity>0:
  1730. p_before = list_entity[p_entity-1]
  1731. if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
  1732. p_entity -= 1
  1733. continue
  1734. '''
  1735. packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
  1736. if packagePointer is None:
  1737. packageName = "Project"
  1738. else:
  1739. packageName = packagePointer.entity_text
  1740. if packageName=="Project":
  1741. if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  1742. PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
  1743. else:
  1744. PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
  1745. #add pointer_tendereeMoney
  1746. packagePointer.pointer_tendereeMoney = entity
  1747. p_entity -= 1
  1748. #删除一个机构有多个角色的数据
  1749. #删除重复人、概率不回传
  1750. final_roleList = []
  1751. list_pop = []
  1752. set_tenderer_role = set()
  1753. dict_pack_tenderer_money = dict()
  1754. for pack in PackDict.keys():
  1755. #删除无效包
  1756. if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
  1757. list_pop.append(pack)
  1758. for i in range(len(PackDict[pack]["roleList"])):
  1759. if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
  1760. if PackDict[pack]["roleList"][i].money==0:
  1761. set_tenderer_role.add(PackDict[pack]["roleList"][i])
  1762. dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
  1763. #找到包的中投标金额
  1764. for _index in range(len(PackageList)):
  1765. if "hit" in PackageList[_index]:
  1766. for _hit in list(PackageList[_index]["hit"]):
  1767. _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
  1768. if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
  1769. dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
  1770. #只找到一个中标人和中标金额
  1771. if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
  1772. list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
  1773. #找到一个中标人和多个招标金额
  1774. if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
  1775. _maxMoney = 0
  1776. _sumMoney = 0
  1777. for _m in list(set_tenderer_money):
  1778. _sumMoney += _m
  1779. if _m>_maxMoney:
  1780. _maxMoney = _m
  1781. if _sumMoney/_maxMoney==2:
  1782. list(set_tenderer_role)[0].money = _maxMoney
  1783. else:
  1784. list(set_tenderer_role)[0].money = _maxMoney
  1785. #每个包都只找到一个金额
  1786. _flag_pack_money = True
  1787. for k,v in dict_pack_tenderer_money.items():
  1788. if len(v[1])!=1:
  1789. _flag_pack_money = False
  1790. if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
  1791. for k,v in dict_pack_tenderer_money.items():
  1792. v[0].money = list(v[1])[0]
  1793. for pack in PackDict.keys():
  1794. for i in range(len(PackDict[pack]["roleList"])):
  1795. PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
  1796. for item in list_pop:
  1797. PackDict.pop(item)
  1798. return PackDict
  1799. def initPackageAttr(RoleList,PackageSet):
  1800. '''
  1801. @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
  1802. '''
  1803. packDict = dict()
  1804. packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]}
  1805. for item in list(PackageSet):
  1806. packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]}
  1807. for item in RoleList:
  1808. if packDict[item.packageName]["code"] =="":
  1809. packDict[item.packageName]["code"] = item.packageCode
  1810. packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
  1811. return packDict
  1812. def getPackageRoleMoney(list_sentence,list_entity):
  1813. '''
  1814. @param:
  1815. list_sentence:文章的句子list
  1816. list_entity:文章的实体list
  1817. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  1818. '''
  1819. # print("=1")
  1820. theRole = getRoleList(list_sentence,list_entity)
  1821. if not theRole:
  1822. return []
  1823. RoleList,RoleSet,PackageList,PackageSet = theRole
  1824. '''
  1825. for item in PackageList:
  1826. print(item)
  1827. '''
  1828. # print("=2")
  1829. PackDict = initPackageAttr(RoleList, PackageSet)
  1830. # print("=3")
  1831. PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity, list_sentence)
  1832. # print("=4")
  1833. return PackDict
  1834. def turnBidWay(bidway):
  1835. if bidway in ("邀请招标","采购方式:邀请"):
  1836. return "邀请招标"
  1837. elif bidway in ("询价","询单","询比","采购方式:询价"):
  1838. return "询价"
  1839. elif bidway in ("竞谈","竞争性谈判","公开竞谈"):
  1840. return "竞争性谈判"
  1841. elif bidway in ("竞争性磋商","磋商"):
  1842. return "竞争性磋商"
  1843. elif bidway in ("竞价","竞标","电子竞价","以电子竞价","电子书面竞投"):
  1844. return "竞价"
  1845. elif bidway in ("公开招标","网上电子投标","网上招标","采购方式:公开","招标为其他"):
  1846. return "公开招标"
  1847. elif bidway in ("单一来源"):
  1848. return "单一来源"
  1849. elif bidway in ("比选"):
  1850. return "比选"
  1851. else:
  1852. return "其他"
  1853. def getOtherAttributes(list_entity):
  1854. dict_other = {"bidway":"",
  1855. "moneysource":"",
  1856. "person_review":[],
  1857. "time_release":"",
  1858. "time_bidopen":"",
  1859. "time_bidclose":"",
  1860. "serviceTime":"",
  1861. "product":[]}
  1862. for entity in list_entity:
  1863. if entity.entity_type == 'bidway':
  1864. dict_other["bidway"] = turnBidWay(entity.entity_text)
  1865. elif entity.entity_type=='moneysource':
  1866. dict_other["moneysource"] = entity.entity_text
  1867. elif entity.entity_type=='serviceTime':
  1868. dict_other["serviceTime"] = entity.entity_text
  1869. elif entity.entity_type == 'time' and entity.label==1:
  1870. dict_other["time_release"] = timeFormat(entity.entity_text)
  1871. elif entity.entity_type == 'time' and entity.label==2:
  1872. dict_other["time_bidopen"] = timeFormat(entity.entity_text)
  1873. elif entity.entity_type == 'time' and entity.label == 3:
  1874. dict_other["time_bidclose"] = timeFormat(entity.entity_text)
  1875. elif entity.entity_type=="person" and entity.label ==4:
  1876. dict_other["person_review"].append(entity.entity_text)
  1877. elif entity.entity_type=='product':
  1878. dict_other["product"].append(entity.entity_text)
  1879. dict_other["product"] = list(set(dict_other["product"]))
  1880. return dict_other
  1881. def getMoneyRange(RoleList):
  1882. pass
  1883. def getPREMs(list_sentences,list_entitys,list_articles):
  1884. '''
  1885. @param:
  1886. list_sentence:所有文章的句子list
  1887. list_entity:所有文章的实体list
  1888. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  1889. '''
  1890. result = []
  1891. for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
  1892. RoleList = getPackageRoleMoney(list_sentence,list_entity)
  1893. result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),
  1894. **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
  1895. "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
  1896. "attachmentTypes":list_article.attachmentTypes}))
  1897. return result
if __name__=="__main__":
    # The entire body of this guard is two adjacent triple-quoted string
    # literals (the first closes and the second opens on the `''''''` line),
    # so it is a no-op expression: running the module as a script does nothing.
    #
    # The disabled code is an old evaluation script: it reads doc ids and
    # reference "prem" values from the database, runs getPackageRoleMoney on
    # each, and dumps the comparison to getAttribute.html.
    #
    # NOTE(review): it is stale and cannot simply be re-enabled:
    #   * it calls getPackageRoleMoney(doc_id) with a single argument, whereas
    #     getPREMs in this file calls it with (list_sentence, list_entity);
    #   * the HTML header row declares two columns (doc_id, 角色) but each data
    #     row writes three cells;
    #   * the closing markup written at the end has no </html> tag.
    '''
    conn = getConnection()
    cursor = conn.cursor()
    #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
    sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
    result = []
    cursor.execute(sql)
    rows = cursor.fetchall()
    count = 0
    for row in rows:
    count += 1
    print(count)
    doc_id = row[0]
    roleList = getPackageRoleMoney(doc_id)
    result.append([doc_id,str(roleList),row[1]])
    ''''''
    with codecs.open("getAttribute.html","w",encoding="utf8") as f:
    f.write('<html><head>\
    <meta http-equiv="Content-Type"\
    content="text/html; charset=UTF-8">\
    </head>\
    <body bgcolor="#FFFFFF">\
    <table border="1">\
    <tr>\
    <td>doc_id</td>\
    <td>角色</td>\
    </tr>')
    for item in result:
    f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
    f.write("</table></body>")
    '''