# getAttributes.py
  1. from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
  2. from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
  3. import re
  4. import copy
  5. import math
  6. import pandas as pd
  7. import os
  8. def getTheRole(entity,role_list):
  9. '''
  10. @summary:根据实体名称拿到index
  11. @param:
  12. entity:实体名称
  13. role_list:角色list
  14. @return:该实体所在下标
  15. '''
  16. for role_index in range(len(role_list)):
  17. if entity in role_list[role_index]:
  18. return role_index
  19. return None
  20. dict_role_id = {"0":"tenderee",
  21. "1":"agency",
  22. "2":"win_tenderer",
  23. "3":"second_tenderer",
  24. "4":"third_tenderer"}
  25. def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
  26. '''
  27. @param:
  28. packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
  29. sentence_index:实体所在的句子
  30. begin_index:实体所在句子的起始位置
  31. @return:公司实体所属的包
  32. @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
  33. '''
  34. '''
  35. if len(packageList)==0:
  36. return None
  37. before_index = None
  38. after_index = None
  39. equal_index = None
  40. equal_count = 0
  41. for pack_index in range(len(packageList)):
  42. if packageList[pack_index][1]>sentence_index and after_index is None:
  43. after_index = pack_index
  44. if packageList[pack_index][1]<sentence_index:
  45. before_index = pack_index
  46. if packageList[pack_index][1]==sentence_index and equal_index is None:
  47. equal_index = pack_index
  48. #当前句子和之前句子未找到包
  49. if before_index is None and equal_index is None:
  50. return None
  51. else:
  52. if after_index is None:
  53. end_index = len(packageList)
  54. else:
  55. end_index = after_index
  56. #只在当前句子找到一个包号
  57. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  58. return packageList[end_index-1][0]
  59. else:
  60. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  61. if packageList[i][2]>int(begin_index):
  62. if packageList[i-1][4]:
  63. return packageList[i-1][0]
  64. else:
  65. if packageList[i][4]:
  66. return packageList[i-1][0]
  67. else:
  68. return packageList[i][0]
  69. return packageList[end_index-1][0]
  70. '''
  71. if len(packageList)==0:
  72. return None,False
  73. list_legalPack = []
  74. for pack_index in range(len(packageList)):
  75. if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
  76. continue
  77. if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
  78. continue
  79. if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index)) and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
  80. if MAX_DIS is not None:
  81. if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
  82. list_legalPack.append(pack_index)
  83. else:
  84. list_legalPack.append(pack_index)
  85. _flag = True
  86. for _index in list_legalPack:
  87. if roleid in packageList[_index]["hit"]:
  88. continue
  89. else:
  90. _flag = False
  91. packageList[_index]["hit"].add(roleid)
  92. return packageList[_index]["pointer"],_flag
  93. if len(list_legalPack)>0:
  94. return packageList[0]["pointer"],_flag
  95. return None,False
  96. #生成合法的组合
  97. def get_legal_comba(list_entity,dict_role_combination):
  98. #拿到一个包中所有合法的组合
  99. def circle_package(_dict_legal_combination):
  100. list_dict_role_first = []
  101. for _role in _dict_legal_combination:
  102. if len(list_dict_role_first)==0:
  103. for _entity in _dict_legal_combination[_role]:
  104. if _entity !="":
  105. list_dict_role_first.append({_role:_entity})
  106. else:
  107. list_dict_role_after = []
  108. _find_count = 0
  109. for _entity in _dict_legal_combination[_role]:
  110. if _entity !="":
  111. for _dict in list_dict_role_first:
  112. _flag = True
  113. for _key1 in _dict:
  114. if _entity==_dict[_key1]:
  115. #修改为招标人和代理人可以为同一个
  116. if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
  117. _flag = True
  118. else:
  119. _flag = False
  120. if _flag:
  121. _find_count += 1
  122. _new_dict = copy.copy(_dict)
  123. _new_dict[_role] = _entity
  124. if len(list_dict_role_after)>100000:
  125. break
  126. list_dict_role_after.append(_new_dict)
  127. if len(list_dict_role_after)==0:
  128. pass
  129. else:
  130. list_dict_role_first.extend(list_dict_role_after)
  131. return list_dict_role_first
  132. def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
  133. last_layer = False
  134. #若是空组合则放回空
  135. if len(_dict_legal_combination.keys())==0:
  136. return []
  137. #递归到最后一层则修改状态
  138. if len(_dict_legal_combination.keys())==1:
  139. last_layer = True
  140. #取一个角色开始进行遍历
  141. _key_role = list(_dict_legal_combination.keys())[0]
  142. for item in _dict_legal_combination[_key_role]:
  143. copy_dict_one_selution = copy.copy(dict_one_selution)
  144. copy_dict_legal_combination = {}
  145. copy_set_legal_entity = copy.copy(set_legal_entity)
  146. #复制余下的所有角色,进行下一轮递归
  147. for _key in _dict_legal_combination.keys():
  148. if _key!=_key_role:
  149. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  150. #修改为招标人和代理人可以为同一个
  151. if item !="":
  152. _flag = True
  153. if str(_key_role) in ["0","1"]:
  154. for _key_flag in copy_dict_one_selution:
  155. if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
  156. _flag = False
  157. else:
  158. for _key_flag in copy_dict_one_selution:
  159. if copy_dict_one_selution[_key_flag]==item:
  160. _flag = False
  161. if _flag:
  162. copy_dict_one_selution[_key_role] = item
  163. '''
  164. if item not in copy_set_legal_entity:
  165. if item !="":
  166. copy_dict_one_selution[_key_role] = item
  167. '''
  168. copy_set_legal_entity.add(item)
  169. if last_layer:
  170. list_all_selution.append(copy_dict_one_selution)
  171. else:
  172. recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
  173. #递归匹配各个包的结果
  174. def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
  175. last_layer = False
  176. if len(_dict_legal_combination.keys())==0:
  177. return []
  178. if len(_dict_legal_combination.keys())==1:
  179. last_layer = True
  180. _key_pack = list(_dict_legal_combination.keys())[0]
  181. for item in _dict_legal_combination[_key_pack]:
  182. copy_dict_one_selution = copy.copy(dict_one_selution)
  183. copy_dict_legal_combination = {}
  184. for _key in _dict_legal_combination.keys():
  185. if _key!=_key_pack:
  186. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  187. for _key_role in item.keys():
  188. copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
  189. if last_layer:
  190. list_all_selution.append(copy_dict_one_selution)
  191. else:
  192. recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
  193. return list_all_selution
  194. #循环获取所有包组合
  195. def circle_pageages(_dict_legal_combination):
  196. list_all_selution = []
  197. for _key_pack in _dict_legal_combination.keys():
  198. list_key_selution = []
  199. for item in _dict_legal_combination[_key_pack]:
  200. _dict = dict()
  201. for _key_role in item.keys():
  202. _dict[_key_pack+"$$"+_key_role] = item[_key_role]
  203. list_key_selution.append(_dict)
  204. if len(list_all_selution)==0:
  205. list_all_selution = list_key_selution
  206. else:
  207. _list_all_selution = []
  208. for item_1 in list_all_selution:
  209. for item_2 in list_key_selution:
  210. _list_all_selution.append(dict(item_1,**item_2))
  211. list_all_selution = _list_all_selution
  212. return list_all_selution
  213. #拿到各个包解析之后的结果
  214. _dict_legal_combination = {}
  215. for packageName in dict_role_combination.keys():
  216. _list_all_selution = []
  217. # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
  218. _list_all_selution = circle_package(dict_role_combination[packageName])
  219. '''
  220. print("===1")
  221. print(packageName)
  222. for item in _list_all_selution:
  223. print(item)
  224. print("===2")
  225. '''
  226. #去除包含子集
  227. list_all_selution_simple = []
  228. _list_set_all_selution = []
  229. for item_selution in _list_all_selution:
  230. item_set_selution = set()
  231. for _key in item_selution.keys():
  232. item_set_selution.add((_key,item_selution[_key]))
  233. _list_set_all_selution.append(item_set_selution)
  234. if len(_list_set_all_selution)>1000:
  235. _dict_legal_combination[packageName] = _list_all_selution
  236. continue
  237. for i in range(len(_list_set_all_selution)):
  238. be_included = False
  239. for j in range(len(_list_set_all_selution)):
  240. if i!=j:
  241. if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
  242. be_included = True
  243. if not be_included:
  244. list_all_selution_simple.append(_list_all_selution[i])
  245. _dict_legal_combination[packageName] = list_all_selution_simple
  246. _list_final_comba = []
  247. #对各个包的结果进行排列组合
  248. _comba_count = 1
  249. for _key in _dict_legal_combination.keys():
  250. _comba_count *= len(_dict_legal_combination[_key])
  251. #如果过大,则每个包只取概率最大的那个
  252. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  253. if _comba_count>250:
  254. new_dict_legal_combination = dict()
  255. for _key_pack in _dict_legal_combination.keys():
  256. MAX_PROB = -1000
  257. _MAX_PROB_COMBA = None
  258. for item in _dict_legal_combination[_key_pack]:
  259. # print(_key_pack,item)
  260. _dict = dict()
  261. for _key in item.keys():
  262. _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
  263. _prob = getSumExpectation(dict_pack_entity_prob, _dict)
  264. if _prob>MAX_PROB:
  265. MAX_PROB = _prob
  266. _MAX_PROB_COMBA = [item]
  267. if _MAX_PROB_COMBA is not None:
  268. new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
  269. _dict_legal_combination = new_dict_legal_combination
  270. #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
  271. _list_final_comba = circle_pageages(_dict_legal_combination)
  272. #除了Project包(招标人和代理人),其他包是不会有冲突的
  273. #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪
  274. _list_real_comba = []
  275. for dict_item in _list_final_comba:
  276. set_project = set()
  277. set_other = set()
  278. for _key in list(dict_item.keys()):
  279. if _key.split("$$")[0]=="Project":
  280. set_project.add(dict_item[_key])
  281. else:
  282. set_other.add(dict_item[_key])
  283. set_common = set_project&set_other
  284. if len(set_common)>0:
  285. dict_project = {}
  286. dict_not_project = {}
  287. for _key in list(dict_item.keys()):
  288. if dict_item[_key] in set_common:
  289. if str(_key.split("$$")[0])=="Project":
  290. dict_project[_key] = dict_item[_key]
  291. else:
  292. dict_not_project[_key] = dict_item[_key]
  293. else:
  294. dict_project[_key] = dict_item[_key]
  295. dict_not_project[_key] = dict_item[_key]
  296. _list_real_comba.append(dict_project)
  297. _list_real_comba.append(dict_not_project)
  298. else:
  299. _list_real_comba.append(dict_item)
  300. return _list_real_comba
  301. def get_dict_entity_prob(list_entity,on_value=0.5):
  302. dict_pack_entity_prob = {}
  303. for entity in list_entity:
  304. if entity.entity_type in ['org','company']:
  305. values = entity.values
  306. role_prob = float(values[int(entity.label)])
  307. _key = entity.packageName+"$$"+str(entity.label)
  308. if role_prob>=on_value and str(entity.label)!="5":
  309. _key_prob = _key+"$text$"+entity.entity_text
  310. if _key_prob in dict_pack_entity_prob:
  311. if role_prob>dict_pack_entity_prob[_key_prob][1]:
  312. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  313. else:
  314. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  315. return dict_pack_entity_prob
  316. #计算合计期望
  317. def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
  318. '''
  319. expect = 0
  320. for entity in list_entity:
  321. if entity.entity_type in ['org','company']:
  322. values = entity.values
  323. role_prob = float(values[int(entity.label)])
  324. _key = entity.packageName+"$$"+str(entity.label)
  325. if role_prob>on_value and str(entity.label)!="5":
  326. if _key in combination.keys() and combination[_key]==entity.entity_text:
  327. expect += math.pow(role_prob,4)
  328. else:
  329. expect -= math.pow(role_prob,4)
  330. '''
  331. #修改为同一个实体只取对应包-角色的最大的概率值
  332. expect = 0
  333. dict_entity_prob = {}
  334. for _key_pack_entity in dict_pack_entity_prob:
  335. _key_pack = _key_pack_entity.split("$text$")[0]
  336. role_prob = dict_pack_entity_prob[_key_pack_entity][1]
  337. if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
  338. if _key_pack_entity in dict_entity_prob.keys():
  339. if dict_entity_prob[_key_pack_entity]<role_prob:
  340. dict_entity_prob[_key_pack_entity] = role_prob
  341. else:
  342. dict_entity_prob[_key_pack_entity] = role_prob
  343. else:
  344. if _key_pack_entity in dict_entity_prob.keys():
  345. if dict_entity_prob[_key_pack_entity]>-role_prob:
  346. dict_entity_prob[_key_pack_entity] = -role_prob
  347. else:
  348. dict_entity_prob[_key_pack_entity] = -role_prob
  349. # for entity in list_entity:
  350. # if entity.entity_type in ['org','company']:
  351. # values = entity.values
  352. # role_prob = float(values[int(entity.label)])
  353. # _key = entity.packageName+"$$"+str(entity.label)
  354. # if role_prob>=on_value and str(entity.label)!="5":
  355. # if _key in combination.keys() and combination[_key]==entity.entity_text:
  356. # _key_prob = _key+entity.entity_text
  357. # if _key_prob in dict_entity_prob.keys():
  358. # if dict_entity_prob[_key_prob]<role_prob:
  359. # dict_entity_prob[_key_prob] = role_prob
  360. # else:
  361. # dict_entity_prob[_key_prob] = role_prob
  362. # else:
  363. # _key_prob = _key+entity.entity_text
  364. # if _key_prob in dict_entity_prob.keys():
  365. # if dict_entity_prob[_key_prob]>-role_prob:
  366. # dict_entity_prob[_key_prob] = -role_prob
  367. # else:
  368. # dict_entity_prob[_key_prob] = -role_prob
  369. for _key in dict_entity_prob.keys():
  370. symbol = 1 if dict_entity_prob[_key]>0 else -1
  371. expect += symbol*math.pow(dict_entity_prob[_key],2)
  372. return expect
  373. def getRoleList(list_sentence,list_entity,on_value = 0.5):
  374. '''
  375. @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回
  376. @param:
  377. list_sentence:文章所有的sentence
  378. list_entity:文章所有的实体
  379. on_value:概率阈值
  380. @return:文章的角色list
  381. '''
  382. pack = getPackagesFromArticle(list_sentence,list_entity)
  383. if pack is None:
  384. return None
  385. PackageList,PackageSet,dict_PackageCode = pack
  386. #拿到所有可能的情况
  387. dict_role_combination = {}
  388. #拿到各个实体的packageName,packageCode
  389. for entity in list_entity:
  390. if entity.entity_type in ['org','company']:
  391. #过滤掉字数小于3个的实体
  392. if len(entity.entity_text)<=3:
  393. continue
  394. values = entity.values
  395. role_prob = float(values[int(entity.label)])
  396. if role_prob>=on_value and str(entity.label)!="5":
  397. if str(entity.label) in ["0","1"]:
  398. packageName = "Project"
  399. else:
  400. if len(PackageSet)>0:
  401. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.end_index,"role-"+str(entity.label))
  402. if packagePointer is None:
  403. #continue
  404. packageName = "Project"
  405. else:
  406. #add pointer_pack
  407. entity.pointer_pack = packagePointer
  408. packageName = packagePointer.entity_text
  409. else:
  410. packageName = "Project"
  411. find_flag = False
  412. if packageName in dict_PackageCode.keys():
  413. packageCode = dict_PackageCode[packageName]
  414. else:
  415. packageCode = ""
  416. entity.packageCode = packageCode
  417. role_name = dict_role_id.get(str(entity.label))
  418. entity.roleName = role_name
  419. entity.packageName = packageName
  420. if entity.packageName in dict_role_combination.keys():
  421. if str(entity.label) in dict_role_combination[entity.packageName].keys():
  422. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  423. else:
  424. dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
  425. else:
  426. dict_role_combination[entity.packageName] = {}
  427. #初始化空值
  428. roleIds = [0,1,2,3,4]
  429. for _roleId in roleIds:
  430. dict_role_combination[entity.packageName][str(_roleId)] = set([""])
  431. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  432. list_real_comba = get_legal_comba(list_entity,dict_role_combination)
  433. #拿到最大期望值的组合
  434. max_index = 0
  435. max_expect = -100
  436. _index = 0
  437. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  438. for item_combination in list_real_comba:
  439. expect = getSumExpectation(dict_pack_entity_prob, item_combination)
  440. if expect>max_expect:
  441. max_index = _index
  442. max_expect = expect
  443. _index += 1
  444. RoleList = []
  445. RoleSet = set()
  446. if len(list_real_comba)>0:
  447. for _key in list_real_comba[max_index].keys():
  448. packageName = _key.split("$$")[0]
  449. label = _key.split("$$")[1]
  450. role_name = dict_role_id.get(str(label))
  451. entity_text = list_real_comba[max_index][_key]
  452. if packageName in dict_PackageCode.keys():
  453. packagecode = dict_PackageCode.get(packageName)
  454. else:
  455. packagecode = ""
  456. RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
  457. RoleSet.add(entity_text)
  458. #根据最优树来修正list_entity中角色对包的连接
  459. for _entity in list_entity:
  460. if _entity.pointer_pack is not None:
  461. _pack_name = _entity.pointer_pack.entity_text
  462. _find_flag = False
  463. for _prem in RoleList:
  464. if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
  465. _find_flag = True
  466. if not _find_flag:
  467. _entity.pointer_pack = None
  468. return RoleList,RoleSet,PackageList,PackageSet
  469. def getPackageScopePattern():
  470. '''
  471. @summary: 获取包的作用域关键词
  472. '''
  473. df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
  474. pattern = "("
  475. for item in df["list_word"]:
  476. item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
  477. pattern += item+"|"
  478. pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
  479. return pattern
# Package-scope keyword regex, built once when the module is imported
# (getPackageScopePattern reads end.xls from this module's directory).
pattern_packageScope = getPackageScopePattern()
  481. def getPackagesFromArticle(list_sentence,list_entity):
  482. '''
  483. @param:
  484. list_sentence:文章的句子list
  485. @summary: 将包的信息插入list_entity中
  486. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  487. '''
  488. if len(list_sentence)==0:
  489. return None
  490. PackageList = []
  491. PackageList_scope = []
  492. PackageSet = set()
  493. dict_packageCode = dict()
  494. package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
  495. package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
  496. package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
  497. # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
  498. other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
  499. win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整
  500. model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整
  501. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
  502. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
  503. def changeIndexFromWordToWords(tokens,word_index):
  504. '''
  505. @summary:转换某个字的字偏移为词偏移
  506. '''
  507. before_index = 0
  508. after_index = 0
  509. for i in range(len(tokens)):
  510. after_index = after_index+len(tokens[i])
  511. if before_index<=word_index and after_index>=word_index:
  512. return i
  513. before_index = after_index
  514. package_names = []
  515. def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
  516. '''
  517. @summary:抽取包附近的标段号
  518. @param:
  519. tokens:包所在句子的分词
  520. word_index:包所在字偏移
  521. size:左右各取多少个词
  522. pattern:提取标段号的正则
  523. @return: type:string,meaning:标段号
  524. '''
  525. index = changeIndexFromWordToWords(tokens,word_index)
  526. if index<size:
  527. begin = index
  528. else:
  529. begin = index-size
  530. if index+size>len(tokens):
  531. end = len(tokens)
  532. else:
  533. end = index+size
  534. #拿到左右两边的词语组成短语
  535. text = "".join(tokens[begin:end])
  536. #在短语中的字偏移
  537. new_word_index = word_index-len("".join(tokens[:begin]))
  538. min_distance = len(text)
  539. packageCode = None
  540. for the_iter in re.finditer(pattern,text):
  541. #算出最小距离
  542. distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
  543. if distance<min_distance:
  544. min_distance = distance
  545. packageCode = the_iter.group(1)
  546. return packageCode
  547. #从标段介绍表格中提取包名和包号
  548. for i in range(len(list_sentence)):
  549. content = list_sentence[i].sentence_text
  550. names = re.findall(package_name_pattern,content)
  551. if names == []:
  552. names = re.findall(other_package_pattern, content)
  553. N_names = re.findall(package_N_name_pattern,content)
  554. if len(names)==1 and len(N_names)==1:
  555. package_names.append([names[0][-1],N_names[0][-1]])
  556. for i in range(len(list_sentence)):
  557. PackageList_item = []
  558. PackageList_item_scope = []
  559. content = list_sentence[i].sentence_text
  560. tokens = list_sentence[i].tokens
  561. for name in package_names[:20]:
  562. for index in findAllIndex(name[0],content):
  563. temp_package_number = re.findall(number_pattern,name[1])[0]
  564. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
  565. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
  566. code = extractPackageCode(tokens, index)
  567. if code is not None:
  568. dict_packageCode[temp_package_number] = code
  569. PackageSet.add(temp_package_number)
  570. for iter in re.finditer(package_number_pattern,content):
  571. temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
  572. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  573. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  574. code = extractPackageCode(tokens, iter.span()[0])
  575. if code is not None:
  576. dict_packageCode[temp_package_number] = code
  577. PackageSet.add(temp_package_number)
  578. #识别packageScope
  579. for iter in re.finditer(pattern_packageScope,content):
  580. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  581. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  582. PackageList_item_scope = PackageList_item +PackageList_item_scope
  583. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  584. PackageList_scope = PackageList_scope+PackageList_item_scope
  585. PackageList_item.sort(key=lambda x:x["sentence_index"])
  586. #PackageList = PackageList+PackageList_item
  587. #不作为包
  588. # if len(PackageSet)==0:
  589. # for i in range(len(list_sentence)):
  590. # PackageList_item = []
  591. # PackageList_item_scope = []
  592. # content = list_sentence[i].sentence_text
  593. # tokens = list_sentence[i].tokens
  594. # for iter in re.finditer(other_package_pattern,content):
  595. # temp_package_number = iter.group(2)
  596. # PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  597. # # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  598. # code = extractPackageCode(tokens, iter.span()[0])
  599. # if code is not None:
  600. # dict_packageCode[temp_package_number] = code
  601. # PackageSet.add(temp_package_number)
  602. # #识别packageScope
  603. # for iter in re.finditer(pattern_packageScope,content):
  604. # PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  605. # # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  606. # PackageList_item_scope = PackageList_item +PackageList_item_scope
  607. # PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  608. # PackageList_scope = PackageList_scope+PackageList_item_scope
  609. # PackageList_item.sort(key=lambda x:x["sentence_index"])
  610. # 2020/11/23 大网站规则 调整
  611. if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
  612. for i in range(len(list_sentence)):
  613. PackageList_item = []
  614. PackageList_item_scope = []
  615. content = list_sentence[i].sentence_text
  616. tokens = list_sentence[i].tokens
  617. names = re.findall(other_package_pattern, content)
  618. N_names = re.findall(win_tenderer_pattern, content)
  619. if len(names) != 1 or len(N_names) != 1:
  620. continue
  621. for iter in re.finditer(other_package_pattern,content):
  622. temp_package_number = iter.group(4)
  623. xinghao = re.search(model_pattern, content)
  624. if xinghao:
  625. temp_package_number = temp_package_number + '+' + xinghao.group(2)
  626. # print('新正则采购包名补充',temp_package_number)
  627. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  628. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  629. code = extractPackageCode(tokens, iter.span()[0])
  630. if code is not None:
  631. dict_packageCode[temp_package_number] = code
  632. PackageSet.add(temp_package_number)
  633. #识别packageScope
  634. for iter in re.finditer(pattern_packageScope,content):
  635. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  636. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  637. PackageList_item_scope = PackageList_item +PackageList_item_scope
  638. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  639. PackageList_scope = PackageList_scope+PackageList_item_scope
  640. PackageList_item.sort(key=lambda x:x["sentence_index"])
  641. pattern_punctuation = "[::()\(\),,。;;]"
  642. for i in range(len(list_sentence)):
  643. for j in range(len(PackageList_scope)):
  644. if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
  645. _flag = False
  646. left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
  647. right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
  648. _left_find = re.findall(pattern_punctuation,left_str)
  649. _right_find = re.findall(pattern_punctuation,right_str)
  650. #print(left_str)
  651. if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
  652. continue
  653. if re.search("划分",right_str[:10]) is not None:
  654. continue
  655. if len(_left_find)>0 and _left_find[-1] in [":",":"]:
  656. _flag = True
  657. if len(_right_find)>0 and _right_find[0] in [":",":"]:
  658. _flag = True
  659. if _flag:
  660. scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
  661. else:
  662. if j==0:
  663. scope_begin = [0,0]
  664. else:
  665. scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
  666. if j==len(PackageList_scope)-1:
  667. scope_end = [PackageList_scope[j]["sentence_index"],changeIndexFromWordToWords(list_sentence[i].tokens, len(list_sentence[i].sentence_text))]
  668. else:
  669. scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
  670. if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
  671. continue
  672. #add package to entity
  673. _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"])
  674. list_entity.append(_pack_entity)
  675. copy_pack = copy.copy(PackageList_scope[j])
  676. copy_pack["scope"] = [scope_begin,scope_end]
  677. copy_pack["hit"] = set()
  678. copy_pack["pointer"] = _pack_entity
  679. PackageList.append(copy_pack)
  680. return PackageList,PackageSet,dict_packageCode
  681. def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4):
  682. '''
  683. @param:
  684. PackDict:文章包dict
  685. roleSet:文章所有角色的公司名称
  686. PackageList:文章的包信息
  687. PackageSet:文章所有包的名称
  688. list_entity:文章所有经过模型处理的实体
  689. on_value:金额模型的阈值
  690. on_value_person:联系人模型的阈值
  691. sentence_len:公司和属性间隔句子的最大长度
  692. @return:添加了属性信息的角色list
  693. '''
  694. #根据roleid添加金额到rolelist中
  695. def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
  696. for i in range(len(packDict[packageName]["roleList"])):
  697. if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
  698. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  699. packDict[packageName]["roleList"][i].money = money
  700. packDict[packageName]["roleList"][i].money_prob = money_prob
  701. return packDict
  702. #根据实体名称添加金额到rolelist中
  703. def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
  704. for i in range(len(packDict[packageName]["roleList"])):
  705. if packDict[packageName]["roleList"][i].entity_text==entity:
  706. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  707. packDict[packageName]["roleList"][i].money = money
  708. packDict[packageName]["roleList"][i].money_prob = money_prob
  709. return packDict
  710. #根据实体名称得到角色
  711. def getRoleWithText(packDict,entity_text):
  712. for pack in packDict.keys():
  713. for i in range(len(packDict[pack]["roleList"])):
  714. if packDict[pack]["roleList"][i].entity_text==entity_text:
  715. return packDict[pack]["roleList"][i].role_name
  716. def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
  717. _list_entitys = [entity]+entity.linked_entitys
  718. for _entity in _list_entitys:
  719. if _entity.entity_text in RoleSet:
  720. return True
  721. p_entity = 0
  722. #遍历所有实体
  723. while(p_entity<len(list_entity)):
  724. entity = list_entity[p_entity]
  725. '''
  726. #招标金额从后往前找
  727. if entity.entity_type=="money":
  728. if entity.values[entity.label]>=on_value:
  729. if str(entity.label)=="0":
  730. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  731. if packagePointer is None:
  732. packageName = "Project"
  733. else:
  734. packageName = packagePointer.entity_text
  735. addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
  736. '''
  737. ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
  738. if entity.entity_type=="person":
  739. if entity.values[entity.label]>=on_value_person:
  740. if str(entity.label)=="1":
  741. for i in range(len(PackDict["Project"]["roleList"])):
  742. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  743. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  744. # add pointer_person
  745. for _entity in list_entity:
  746. if dict_role_id.get(str(_entity.label))=="tenderee":
  747. for i in range(len(PackDict["Project"]["roleList"])):
  748. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  749. _entity.pointer_person = entity
  750. elif str(entity.label)=="2":
  751. for i in range(len(PackDict["Project"]["roleList"])):
  752. if PackDict["Project"]["roleList"][i].role_name=="agency":
  753. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  754. # add pointer_person
  755. for _entity in list_entity:
  756. if dict_role_id.get(str(_entity.label))=="agency":
  757. for i in range(len(PackDict["Project"]["roleList"])):
  758. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  759. _entity.pointer_person = entity
  760. '''
  761. #金额往前找实体
  762. if entity.entity_type=="money":
  763. if entity.values[entity.label]>=on_value:
  764. p_entity_money= p_entity
  765. entity_money = list_entity[p_entity_money]
  766. if len(PackageSet)>0:
  767. packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
  768. if packagePointer is None:
  769. packageName_entity = "Project"
  770. else:
  771. packageName_entity = packagePointer.entity_text
  772. else:
  773. packageName_entity = "Project"
  774. while(p_entity_money>0):
  775. entity_before = list_entity[p_entity_money]
  776. if entity_before.entity_type in ['org','company']:
  777. if str(entity_before.label)=="1":
  778. addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
  779. #add pointer_money
  780. entity_before.pointer_money = entity_money
  781. break
  782. p_entity_money -= 1
  783. #如果实体属于角色集合,则往后找属性
  784. if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
  785. p_entity += 1
  786. #循环查找符合的属性
  787. while(p_entity<len(list_entity)):
  788. entity_after = list_entity[p_entity]
  789. if entity_after.sentence_index-entity.sentence_index>=sentence_len:
  790. p_entity -= 1
  791. break
  792. #若是遇到公司实体,则跳出循环
  793. if entity_after.entity_type in ['org','company']:
  794. p_entity -= 1
  795. break
  796. if entity_after.values is not None:
  797. if entity_after.entity_type=="money":
  798. if entity_after.values[entity_after.label]>=on_value:
  799. '''
  800. #招标金额从后往前找
  801. if str(entity_after.label)=="0":
  802. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  803. if packagePointer is None:
  804. packageName = "Project"
  805. else:
  806. packageName = packagePointer.entity_text
  807. addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
  808. '''
  809. if str(entity_after.label)=="1":
  810. #print(entity_after.entity_text,entity.entity_text)
  811. _list_entitys = [entity]+entity.linked_entitys
  812. if len(PackageSet)>0:
  813. packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
  814. if packagePointer is None:
  815. packageName_entity = "Project"
  816. else:
  817. packageName_entity = packagePointer.entity_text
  818. else:
  819. packageName_entity = "Project"
  820. if str(entity.label) in ["2","3","4"]:
  821. addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
  822. #add pointer_money
  823. entity.pointer_money = entity_after
  824. '''
  825. if entity_after.entity_type=="person":
  826. if entity_after.values[entity_after.label]>=on_value_person:
  827. if str(entity_after.label)=="1":
  828. for i in range(len(roleList)):
  829. if roleList[i].role_name=="tenderee":
  830. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  831. elif str(entity_after.label)=="2":
  832. for i in range(len(roleList)):
  833. if roleList[i].role_name=="agency":
  834. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  835. elif str(entity_after.label)=="3":
  836. _list_entitys = [entity]+entity.linked_entitys
  837. for _entity in _list_entitys:
  838. for i in range(len(roleList)):
  839. if roleList[i].entity_text==_entity.entity_text:
  840. if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
  841. break
  842. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  843. '''
  844. p_entity += 1
  845. p_entity += 1
  846. ''''''
  847. # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
  848. temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
  849. other_person = [] # 阈值以上的联系人列表
  850. link_person = [] # 有电话没联系上角色的person列表
  851. other_ent = []
  852. link_ent = []
  853. found_person = False
  854. ent_list = []
  855. for entity in list_entity:
  856. if entity.entity_type in ['org','company','person']:
  857. ent_list.append(entity)
  858. #for list_index in range(len(ent_list)):
  859. #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
  860. #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
  861. #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
  862. # 2020/11/25增加确定角色联系人判断
  863. sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
  864. for index in range(len(ent_list)):
  865. entity = ent_list[index]
  866. if entity.entity_type=="person":
  867. if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
  868. continue
  869. if entity.values[entity.label]>on_value_person:
  870. if str(entity.label)=="1":
  871. for i in range(len(PackDict["Project"]["roleList"])):
  872. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  873. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  874. link_person.append(entity.entity_text)
  875. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  876. # add pointer_person
  877. for _entity in list_entity:
  878. if dict_role_id.get(str(_entity.label))=="tenderee":
  879. for i in range(len(PackDict["Project"]["roleList"])):
  880. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  881. _entity.pointer_person = entity
  882. elif str(entity.label)=="2":
  883. for i in range(len(PackDict["Project"]["roleList"])):
  884. if PackDict["Project"]["roleList"][i].role_name=="agency":
  885. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  886. link_person.append(entity.entity_text)
  887. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  888. # add pointer_person
  889. for _entity in list_entity:
  890. if dict_role_id.get(str(_entity.label))=="agency":
  891. for i in range(len(PackDict["Project"]["roleList"])):
  892. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  893. _entity.pointer_person = entity
  894. elif str(entity.label)=="3":
  895. if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
  896. continue
  897. #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
  898. other_person.append(entity.entity_text)
  899. temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
  900. #if entity.entity_text in roleSet:
  901. if entity.entity_text in roleSet:
  902. if entity.label in [0,1]:
  903. other_ent.append(entity.entity_text)
  904. temp_ent_list.append((entity.entity_text, entity.label,entity))
  905. for behind_index in range(index+1, len(ent_list)):
  906. entity_after = ent_list[behind_index]
  907. if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
  908. break
  909. if entity_after.values is not None:
  910. if entity_after.entity_type=="person":
  911. if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
  912. break
  913. if entity_after.values[entity_after.label]>on_value_person:
  914. if str(entity_after.label)=="1":
  915. for i in range(len(PackDict["Project"]["roleList"])):
  916. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  917. PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  918. link_person.append(entity_after.entity_text)
  919. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  920. elif str(entity_after.label)=="2":
  921. for i in range(len(PackDict["Project"]["roleList"])):
  922. if PackDict["Project"]["roleList"][i].role_name=="agency":
  923. PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  924. link_person.append(entity_after.entity_text)
  925. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  926. elif str(entity_after.label)=="3":
  927. if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
  928. break
  929. elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
  930. break
  931. for pack in PackDict.keys():
  932. for i in range(len(PackDict[pack]["roleList"])):
  933. if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
  934. #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
  935. #break
  936. PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  937. link_person.append(entity_after.entity_text)
  938. #add pointer_person
  939. entity.pointer_person = entity_after
  940. not_link_person = [person for person in other_person if person not in link_person]
  941. not_link_ent = [ent for ent in other_ent if ent not in link_ent]
  942. if len(not_link_person) > 0 and len(not_link_ent) > 0 :
  943. item = temp_ent_list
  944. for i in range(len(item)):
  945. if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
  946. if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
  947. item[i+1], item[i+2] = item[i+2], item[i+1]
  948. for i in range(len(item)-1, -1, -1):
  949. if item[i][0] in not_link_ent:
  950. for pack in PackDict.keys():
  951. for role in PackDict[pack]["roleList"]:
  952. if role.entity_text == item[i][0] and len(role.linklist) < 1:
  953. for j in range(i+1, len(item)):
  954. if item[j][0] in not_link_person:
  955. role.linklist.append(item[j][:2])
  956. #add pointer_person
  957. item[i][2].pointer_person = item[j][2]
  958. break
  959. else:
  960. break
  961. #寻找多标段招标金额
  962. p_entity = len(list_entity)-1
  963. set_tenderer_money = set()
  964. #遍历所有实体
  965. while(p_entity>=0):
  966. entity = list_entity[p_entity]
  967. if entity.entity_type=="money":
  968. if entity.values[entity.label]>=on_value:
  969. if str(entity.label)=="1":
  970. set_tenderer_money.add(float(entity.entity_text))
  971. if str(entity.label)=="0":
  972. '''
  973. if p_entity>0:
  974. p_before = list_entity[p_entity-1]
  975. if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
  976. p_entity -= 1
  977. continue
  978. '''
  979. packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
  980. if packagePointer is None:
  981. packageName = "Project"
  982. else:
  983. packageName = packagePointer.entity_text
  984. if packageName=="Project":
  985. if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  986. PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
  987. else:
  988. PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
  989. #add pointer_tendereeMoney
  990. packagePointer.pointer_tendereeMoney = entity
  991. p_entity -= 1
  992. #删除一个机构有多个角色的数据
  993. #删除重复人、概率不回传
  994. final_roleList = []
  995. list_pop = []
  996. set_tenderer_role = set()
  997. dict_pack_tenderer_money = dict()
  998. for pack in PackDict.keys():
  999. #删除无效包
  1000. if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
  1001. list_pop.append(pack)
  1002. for i in range(len(PackDict[pack]["roleList"])):
  1003. if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
  1004. if PackDict[pack]["roleList"][i].money==0:
  1005. set_tenderer_role.add(PackDict[pack]["roleList"][i])
  1006. dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
  1007. #找到包的中投标金额
  1008. for _index in range(len(PackageList)):
  1009. if "hit" in PackageList[_index]:
  1010. for _hit in list(PackageList[_index]["hit"]):
  1011. _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
  1012. if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
  1013. dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
  1014. #只找到一个中标人和中标金额
  1015. if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
  1016. list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
  1017. #找到一个中标人和多个招标金额
  1018. if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
  1019. _maxMoney = 0
  1020. _sumMoney = 0
  1021. for _m in list(set_tenderer_money):
  1022. _sumMoney += _m
  1023. if _m>_maxMoney:
  1024. _maxMoney = _m
  1025. if _sumMoney/_maxMoney==2:
  1026. list(set_tenderer_role)[0].money = _maxMoney
  1027. else:
  1028. list(set_tenderer_role)[0].money = _maxMoney
  1029. #每个包都只找到一个金额
  1030. _flag_pack_money = True
  1031. for k,v in dict_pack_tenderer_money.items():
  1032. if len(v[1])!=1:
  1033. _flag_pack_money = False
  1034. if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
  1035. for k,v in dict_pack_tenderer_money.items():
  1036. v[0].money = list(v[1])[0]
  1037. for pack in PackDict.keys():
  1038. for i in range(len(PackDict[pack]["roleList"])):
  1039. PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
  1040. for item in list_pop:
  1041. PackDict.pop(item)
  1042. return PackDict
  1043. def initPackageAttr(RoleList,PackageSet):
  1044. '''
  1045. @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
  1046. '''
  1047. packDict = dict()
  1048. packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]}
  1049. for item in list(PackageSet):
  1050. packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]}
  1051. for item in RoleList:
  1052. if packDict[item.packageName]["code"] =="":
  1053. packDict[item.packageName]["code"] = item.packageCode
  1054. packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
  1055. return packDict
  1056. def getPackageRoleMoney(list_sentence,list_entity):
  1057. '''
  1058. @param:
  1059. list_sentence:文章的句子list
  1060. list_entity:文章的实体list
  1061. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  1062. '''
  1063. # print("=1")
  1064. theRole = getRoleList(list_sentence,list_entity)
  1065. if not theRole:
  1066. return []
  1067. RoleList,RoleSet,PackageList,PackageSet = theRole
  1068. '''
  1069. for item in PackageList:
  1070. print(item)
  1071. '''
  1072. # print("=2")
  1073. PackDict = initPackageAttr(RoleList, PackageSet)
  1074. # print("=3")
  1075. PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity)
  1076. # print("=4")
  1077. return PackDict
  1078. def turnBidWay(bidway):
  1079. if bidway in ("邀请招标","采购方式:邀请"):
  1080. return "邀请招标"
  1081. elif bidway in ("询价","询单","询比","采购方式:询价"):
  1082. return "询价"
  1083. elif bidway in ("竞谈","竞争性谈判","公开竞谈"):
  1084. return "竞争性谈判"
  1085. elif bidway in ("竞争性磋商","磋商"):
  1086. return "竞争性磋商"
  1087. elif bidway in ("竞价","竞标","电子竞价","以电子竞价","电子书面竞投"):
  1088. return "竞价"
  1089. elif bidway in ("公开招标","网上电子投标","网上招标","采购方式:公开","招标为其他"):
  1090. return "公开招标"
  1091. elif bidway in ("单一来源"):
  1092. return "单一来源"
  1093. elif bidway in ("比选"):
  1094. return "比选"
  1095. else:
  1096. return "其他"
  1097. def getOtherAttributes(list_entity):
  1098. dict_other = {"bidway":"",
  1099. "moneysource":"",
  1100. "person_review":[],
  1101. "time_release":"",
  1102. "time_bidopen":"",
  1103. "time_bidclose":"",
  1104. "serviceTime":"",
  1105. "product":[]}
  1106. for entity in list_entity:
  1107. if entity.entity_type == 'bidway':
  1108. dict_other["bidway"] = turnBidWay(entity.entity_text)
  1109. elif entity.entity_type=='moneysource':
  1110. dict_other["moneysource"] = entity.entity_text
  1111. elif entity.entity_type=='serviceTime':
  1112. dict_other["serviceTime"] = entity.entity_text
  1113. elif entity.entity_type == 'time' and entity.label==1:
  1114. dict_other["time_release"] = timeFormat(entity.entity_text)
  1115. elif entity.entity_type == 'time' and entity.label==2:
  1116. dict_other["time_bidopen"] = timeFormat(entity.entity_text)
  1117. elif entity.entity_type == 'time' and entity.label == 3:
  1118. dict_other["time_bidclose"] = timeFormat(entity.entity_text)
  1119. elif entity.entity_type=="person" and entity.label ==4:
  1120. dict_other["person_review"].append(entity.entity_text)
  1121. elif entity.entity_type=='product':
  1122. dict_other["product"].append(entity.entity_text)
  1123. dict_other["product"] = list(set(dict_other["product"]))
  1124. return dict_other
  1125. def getMoneyRange(RoleList):
  1126. pass
  1127. def getPREMs(list_sentences,list_entitys,list_articles):
  1128. '''
  1129. @param:
  1130. list_sentence:所有文章的句子list
  1131. list_entity:所有文章的实体list
  1132. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  1133. '''
  1134. result = []
  1135. for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
  1136. RoleList = getPackageRoleMoney(list_sentence,list_entity)
  1137. result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),
  1138. **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
  1139. "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
  1140. "attachmentTypes":list_article.attachmentTypes}))
  1141. return result
  1142. if __name__=="__main__":
  1143. '''
  1144. conn = getConnection()
  1145. cursor = conn.cursor()
  1146. #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
  1147. sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
  1148. result = []
  1149. cursor.execute(sql)
  1150. rows = cursor.fetchall()
  1151. count = 0
  1152. for row in rows:
  1153. count += 1
  1154. print(count)
  1155. doc_id = row[0]
  1156. roleList = getPackageRoleMoney(doc_id)
  1157. result.append([doc_id,str(roleList),row[1]])
  1158. ''''''
  1159. with codecs.open("getAttribute.html","w",encoding="utf8") as f:
  1160. f.write('<html><head>\
  1161. <meta http-equiv="Content-Type"\
  1162. content="text/html; charset=UTF-8">\
  1163. </head>\
  1164. <body bgcolor="#FFFFFF">\
  1165. <table border="1">\
  1166. <tr>\
  1167. <td>doc_id</td>\
  1168. <td>角色</td>\
  1169. </tr>')
  1170. for item in result:
  1171. f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
  1172. f.write("</table></body>")
  1173. '''