# getAttributes.py
  1. from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
  2. from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
  3. from decimal import Decimal
  4. import re
  5. import copy
  6. import math
  7. import pandas as pd
  8. import os
  9. def getTheRole(entity,role_list):
  10. '''
  11. @summary:根据实体名称拿到index
  12. @param:
  13. entity:实体名称
  14. role_list:角色list
  15. @return:该实体所在下标
  16. '''
  17. for role_index in range(len(role_list)):
  18. if entity in role_list[role_index]:
  19. return role_index
  20. return None
  21. dict_role_id = {"0":"tenderee",
  22. "1":"agency",
  23. "2":"win_tenderer",
  24. "3":"second_tenderer",
  25. "4":"third_tenderer"}
  26. def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
  27. '''
  28. @param:
  29. packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
  30. sentence_index:实体所在的句子
  31. begin_index:实体所在句子的起始位置
  32. @return:公司实体所属的包
  33. @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
  34. '''
  35. '''
  36. if len(packageList)==0:
  37. return None
  38. before_index = None
  39. after_index = None
  40. equal_index = None
  41. equal_count = 0
  42. for pack_index in range(len(packageList)):
  43. if packageList[pack_index][1]>sentence_index and after_index is None:
  44. after_index = pack_index
  45. if packageList[pack_index][1]<sentence_index:
  46. before_index = pack_index
  47. if packageList[pack_index][1]==sentence_index and equal_index is None:
  48. equal_index = pack_index
  49. #当前句子和之前句子未找到包
  50. if before_index is None and equal_index is None:
  51. return None
  52. else:
  53. if after_index is None:
  54. end_index = len(packageList)
  55. else:
  56. end_index = after_index
  57. #只在当前句子找到一个包号
  58. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  59. return packageList[end_index-1][0]
  60. else:
  61. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  62. if packageList[i][2]>int(begin_index):
  63. if packageList[i-1][4]:
  64. return packageList[i-1][0]
  65. else:
  66. if packageList[i][4]:
  67. return packageList[i-1][0]
  68. else:
  69. return packageList[i][0]
  70. return packageList[end_index-1][0]
  71. '''
  72. if len(packageList)==0:
  73. return None,False
  74. list_legalPack = []
  75. for pack_index in range(len(packageList)):
  76. if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
  77. continue
  78. if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
  79. continue
  80. if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index)) and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
  81. if MAX_DIS is not None:
  82. if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
  83. list_legalPack.append(pack_index)
  84. else:
  85. list_legalPack.append(pack_index)
  86. _flag = True
  87. for _index in list_legalPack:
  88. if roleid in packageList[_index]["hit"]:
  89. continue
  90. else:
  91. _flag = False
  92. packageList[_index]["hit"].add(roleid)
  93. return packageList[_index]["pointer"],_flag
  94. if len(list_legalPack)>0:
  95. return packageList[0]["pointer"],_flag
  96. return None,False
  97. #生成合法的组合
  98. def get_legal_comba(list_entity,dict_role_combination):
  99. #拿到一个包中所有合法的组合
  100. def circle_package(_dict_legal_combination):
  101. list_dict_role_first = []
  102. for _role in _dict_legal_combination:
  103. if len(list_dict_role_first)==0:
  104. for _entity in _dict_legal_combination[_role]:
  105. if _entity !="":
  106. list_dict_role_first.append({_role:_entity})
  107. else:
  108. list_dict_role_after = []
  109. _find_count = 0
  110. for _entity in _dict_legal_combination[_role]:
  111. if _entity !="":
  112. for _dict in list_dict_role_first:
  113. _flag = True
  114. for _key1 in _dict:
  115. if _entity==_dict[_key1]:
  116. #修改为招标人和代理人可以为同一个
  117. if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
  118. _flag = True
  119. else:
  120. _flag = False
  121. if _flag:
  122. _find_count += 1
  123. _new_dict = copy.copy(_dict)
  124. _new_dict[_role] = _entity
  125. if len(list_dict_role_after)>100000:
  126. break
  127. list_dict_role_after.append(_new_dict)
  128. if len(list_dict_role_after)==0:
  129. pass
  130. else:
  131. list_dict_role_first.extend(list_dict_role_after)
  132. return list_dict_role_first
  133. def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
  134. last_layer = False
  135. #若是空组合则放回空
  136. if len(_dict_legal_combination.keys())==0:
  137. return []
  138. #递归到最后一层则修改状态
  139. if len(_dict_legal_combination.keys())==1:
  140. last_layer = True
  141. #取一个角色开始进行遍历
  142. _key_role = list(_dict_legal_combination.keys())[0]
  143. for item in _dict_legal_combination[_key_role]:
  144. copy_dict_one_selution = copy.copy(dict_one_selution)
  145. copy_dict_legal_combination = {}
  146. copy_set_legal_entity = copy.copy(set_legal_entity)
  147. #复制余下的所有角色,进行下一轮递归
  148. for _key in _dict_legal_combination.keys():
  149. if _key!=_key_role:
  150. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  151. #修改为招标人和代理人可以为同一个
  152. if item !="":
  153. _flag = True
  154. if str(_key_role) in ["0","1"]:
  155. for _key_flag in copy_dict_one_selution:
  156. if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
  157. _flag = False
  158. else:
  159. for _key_flag in copy_dict_one_selution:
  160. if copy_dict_one_selution[_key_flag]==item:
  161. _flag = False
  162. if _flag:
  163. copy_dict_one_selution[_key_role] = item
  164. '''
  165. if item not in copy_set_legal_entity:
  166. if item !="":
  167. copy_dict_one_selution[_key_role] = item
  168. '''
  169. copy_set_legal_entity.add(item)
  170. if last_layer:
  171. list_all_selution.append(copy_dict_one_selution)
  172. else:
  173. recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
  174. #递归匹配各个包的结果
  175. def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
  176. last_layer = False
  177. if len(_dict_legal_combination.keys())==0:
  178. return []
  179. if len(_dict_legal_combination.keys())==1:
  180. last_layer = True
  181. _key_pack = list(_dict_legal_combination.keys())[0]
  182. for item in _dict_legal_combination[_key_pack]:
  183. copy_dict_one_selution = copy.copy(dict_one_selution)
  184. copy_dict_legal_combination = {}
  185. for _key in _dict_legal_combination.keys():
  186. if _key!=_key_pack:
  187. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  188. for _key_role in item.keys():
  189. copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
  190. if last_layer:
  191. list_all_selution.append(copy_dict_one_selution)
  192. else:
  193. recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
  194. return list_all_selution
  195. #循环获取所有包组合
  196. def circle_pageages(_dict_legal_combination):
  197. list_all_selution = []
  198. for _key_pack in _dict_legal_combination.keys():
  199. list_key_selution = []
  200. for item in _dict_legal_combination[_key_pack]:
  201. _dict = dict()
  202. for _key_role in item.keys():
  203. _dict[_key_pack+"$$"+_key_role] = item[_key_role]
  204. list_key_selution.append(_dict)
  205. if len(list_all_selution)==0:
  206. list_all_selution = list_key_selution
  207. else:
  208. _list_all_selution = []
  209. for item_1 in list_all_selution:
  210. for item_2 in list_key_selution:
  211. _list_all_selution.append(dict(item_1,**item_2))
  212. list_all_selution = _list_all_selution
  213. return list_all_selution
  214. #拿到各个包解析之后的结果
  215. _dict_legal_combination = {}
  216. for packageName in dict_role_combination.keys():
  217. _list_all_selution = []
  218. # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
  219. _list_all_selution = circle_package(dict_role_combination[packageName])
  220. '''
  221. # print("===1")
  222. # print(packageName)
  223. for item in _list_all_selution:
  224. # print(item)
  225. # print("===2")
  226. '''
  227. #去除包含子集
  228. list_all_selution_simple = []
  229. _list_set_all_selution = []
  230. for item_selution in _list_all_selution:
  231. item_set_selution = set()
  232. for _key in item_selution.keys():
  233. item_set_selution.add((_key,item_selution[_key]))
  234. _list_set_all_selution.append(item_set_selution)
  235. if len(_list_set_all_selution)>1000:
  236. _dict_legal_combination[packageName] = _list_all_selution
  237. continue
  238. for i in range(len(_list_set_all_selution)):
  239. be_included = False
  240. for j in range(len(_list_set_all_selution)):
  241. if i!=j:
  242. if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
  243. be_included = True
  244. if not be_included:
  245. list_all_selution_simple.append(_list_all_selution[i])
  246. _dict_legal_combination[packageName] = list_all_selution_simple
  247. _list_final_comba = []
  248. #对各个包的结果进行排列组合
  249. _comba_count = 1
  250. for _key in _dict_legal_combination.keys():
  251. _comba_count *= len(_dict_legal_combination[_key])
  252. #如果过大,则每个包只取概率最大的那个
  253. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  254. if _comba_count>250:
  255. new_dict_legal_combination = dict()
  256. for _key_pack in _dict_legal_combination.keys():
  257. MAX_PROB = -1000
  258. _MAX_PROB_COMBA = None
  259. for item in _dict_legal_combination[_key_pack]:
  260. # print(_key_pack,item)
  261. _dict = dict()
  262. for _key in item.keys():
  263. _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
  264. _prob = getSumExpectation(dict_pack_entity_prob, _dict)
  265. if _prob>MAX_PROB:
  266. MAX_PROB = _prob
  267. _MAX_PROB_COMBA = [item]
  268. if _MAX_PROB_COMBA is not None:
  269. new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
  270. _dict_legal_combination = new_dict_legal_combination
  271. #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
  272. _list_final_comba = circle_pageages(_dict_legal_combination)
  273. #除了Project包(招标人和代理人),其他包是不会有冲突的
  274. #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪
  275. _list_real_comba = []
  276. for dict_item in _list_final_comba:
  277. set_project = set()
  278. set_other = set()
  279. for _key in list(dict_item.keys()):
  280. if _key.split("$$")[0]=="Project":
  281. set_project.add(dict_item[_key])
  282. else:
  283. set_other.add(dict_item[_key])
  284. set_common = set_project&set_other
  285. if len(set_common)>0:
  286. dict_project = {}
  287. dict_not_project = {}
  288. for _key in list(dict_item.keys()):
  289. if dict_item[_key] in set_common:
  290. if str(_key.split("$$")[0])=="Project":
  291. dict_project[_key] = dict_item[_key]
  292. else:
  293. dict_not_project[_key] = dict_item[_key]
  294. else:
  295. dict_project[_key] = dict_item[_key]
  296. dict_not_project[_key] = dict_item[_key]
  297. _list_real_comba.append(dict_project)
  298. _list_real_comba.append(dict_not_project)
  299. else:
  300. _list_real_comba.append(dict_item)
  301. return _list_real_comba
  302. def get_dict_entity_prob(list_entity,on_value=0.5):
  303. dict_pack_entity_prob = {}
  304. for entity in list_entity:
  305. if entity.entity_type in ['org','company']:
  306. values = entity.values
  307. role_prob = float(values[int(entity.label)])
  308. _key = entity.packageName+"$$"+str(entity.label)
  309. if role_prob>=on_value and str(entity.label)!="5":
  310. _key_prob = _key+"$text$"+entity.entity_text
  311. if _key_prob in dict_pack_entity_prob:
  312. if role_prob>dict_pack_entity_prob[_key_prob][1]:
  313. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  314. else:
  315. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  316. return dict_pack_entity_prob
  317. #计算合计期望
  318. def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
  319. '''
  320. expect = 0
  321. for entity in list_entity:
  322. if entity.entity_type in ['org','company']:
  323. values = entity.values
  324. role_prob = float(values[int(entity.label)])
  325. _key = entity.packageName+"$$"+str(entity.label)
  326. if role_prob>on_value and str(entity.label)!="5":
  327. if _key in combination.keys() and combination[_key]==entity.entity_text:
  328. expect += math.pow(role_prob,4)
  329. else:
  330. expect -= math.pow(role_prob,4)
  331. '''
  332. #修改为同一个实体只取对应包-角色的最大的概率值
  333. expect = 0
  334. dict_entity_prob = {}
  335. for _key_pack_entity in dict_pack_entity_prob:
  336. _key_pack = _key_pack_entity.split("$text$")[0]
  337. role_prob = dict_pack_entity_prob[_key_pack_entity][1]
  338. if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
  339. if _key_pack_entity in dict_entity_prob.keys():
  340. if dict_entity_prob[_key_pack_entity]<role_prob:
  341. dict_entity_prob[_key_pack_entity] = role_prob
  342. else:
  343. dict_entity_prob[_key_pack_entity] = role_prob
  344. else:
  345. if _key_pack_entity in dict_entity_prob.keys():
  346. if dict_entity_prob[_key_pack_entity]>-role_prob:
  347. dict_entity_prob[_key_pack_entity] = -role_prob
  348. else:
  349. dict_entity_prob[_key_pack_entity] = -role_prob
  350. # for entity in list_entity:
  351. # if entity.entity_type in ['org','company']:
  352. # values = entity.values
  353. # role_prob = float(values[int(entity.label)])
  354. # _key = entity.packageName+"$$"+str(entity.label)
  355. # if role_prob>=on_value and str(entity.label)!="5":
  356. # if _key in combination.keys() and combination[_key]==entity.entity_text:
  357. # _key_prob = _key+entity.entity_text
  358. # if _key_prob in dict_entity_prob.keys():
  359. # if dict_entity_prob[_key_prob]<role_prob:
  360. # dict_entity_prob[_key_prob] = role_prob
  361. # else:
  362. # dict_entity_prob[_key_prob] = role_prob
  363. # else:
  364. # _key_prob = _key+entity.entity_text
  365. # if _key_prob in dict_entity_prob.keys():
  366. # if dict_entity_prob[_key_prob]>-role_prob:
  367. # dict_entity_prob[_key_prob] = -role_prob
  368. # else:
  369. # dict_entity_prob[_key_prob] = -role_prob
  370. for _key in dict_entity_prob.keys():
  371. symbol = 1 if dict_entity_prob[_key]>0 else -1
  372. expect += symbol*math.pow(dict_entity_prob[_key],2)
  373. return expect
  374. def getRoleList(list_sentence,list_entity,on_value = 0.5):
  375. '''
  376. @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回
  377. @param:
  378. list_sentence:文章所有的sentence
  379. list_entity:文章所有的实体
  380. on_value:概率阈值
  381. @return:文章的角色list
  382. '''
  383. pack = getPackagesFromArticle(list_sentence,list_entity)
  384. if pack is None:
  385. return None
  386. PackageList,PackageSet,dict_PackageCode = pack
  387. #拿到所有可能的情况
  388. dict_role_combination = {}
  389. #拿到各个实体的packageName,packageCode
  390. for entity in list_entity:
  391. if entity.entity_type in ['org','company']:
  392. #过滤掉字数小于3个的实体
  393. if len(entity.entity_text)<=3:
  394. continue
  395. values = entity.values
  396. role_prob = float(values[int(entity.label)])
  397. if role_prob>=on_value and str(entity.label)!="5":
  398. if str(entity.label) in ["0","1"]:
  399. packageName = "Project"
  400. else:
  401. if len(PackageSet)>0:
  402. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.end_index,"role-"+str(entity.label))
  403. if packagePointer is None:
  404. #continue
  405. packageName = "Project"
  406. else:
  407. #add pointer_pack
  408. entity.pointer_pack = packagePointer
  409. packageName = packagePointer.entity_text
  410. else:
  411. packageName = "Project"
  412. find_flag = False
  413. if packageName in dict_PackageCode.keys():
  414. packageCode = dict_PackageCode[packageName]
  415. else:
  416. packageCode = ""
  417. entity.packageCode = packageCode
  418. role_name = dict_role_id.get(str(entity.label))
  419. entity.roleName = role_name
  420. entity.packageName = packageName
  421. if entity.packageName in dict_role_combination.keys():
  422. if str(entity.label) in dict_role_combination[entity.packageName].keys():
  423. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  424. else:
  425. dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
  426. else:
  427. dict_role_combination[entity.packageName] = {}
  428. #初始化空值
  429. roleIds = [0,1,2,3,4]
  430. for _roleId in roleIds:
  431. dict_role_combination[entity.packageName][str(_roleId)] = set([""])
  432. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  433. list_real_comba = get_legal_comba(list_entity,dict_role_combination)
  434. #拿到最大期望值的组合
  435. max_index = 0
  436. max_expect = -100
  437. _index = 0
  438. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  439. for item_combination in list_real_comba:
  440. expect = getSumExpectation(dict_pack_entity_prob, item_combination)
  441. if expect>max_expect:
  442. max_index = _index
  443. max_expect = expect
  444. _index += 1
  445. RoleList = []
  446. RoleSet = set()
  447. if len(list_real_comba)>0:
  448. for _key in list_real_comba[max_index].keys():
  449. packageName = _key.split("$$")[0]
  450. label = _key.split("$$")[1]
  451. role_name = dict_role_id.get(str(label))
  452. entity_text = list_real_comba[max_index][_key]
  453. if packageName in dict_PackageCode.keys():
  454. packagecode = dict_PackageCode.get(packageName)
  455. else:
  456. packagecode = ""
  457. RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
  458. RoleSet.add(entity_text)
  459. #根据最优树来修正list_entity中角色对包的连接
  460. for _entity in list_entity:
  461. if _entity.pointer_pack is not None:
  462. _pack_name = _entity.pointer_pack.entity_text
  463. _find_flag = False
  464. for _prem in RoleList:
  465. if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
  466. _find_flag = True
  467. if not _find_flag:
  468. _entity.pointer_pack = None
  469. return RoleList,RoleSet,PackageList,PackageSet
  470. def getPackageScopePattern():
  471. '''
  472. @summary: 获取包的作用域关键词
  473. '''
  474. df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
  475. pattern = "("
  476. for item in df["list_word"]:
  477. item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
  478. pattern += item+"|"
  479. pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
  480. return pattern
  481. pattern_packageScope = getPackageScopePattern()
  482. def getPackagesFromArticle(list_sentence,list_entity):
  483. '''
  484. @param:
  485. list_sentence:文章的句子list
  486. @summary: 将包的信息插入list_entity中
  487. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  488. '''
  489. if len(list_sentence)==0:
  490. return None
  491. PackageList = []
  492. PackageList_scope = []
  493. PackageSet = set()
  494. dict_packageCode = dict()
  495. package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
  496. package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
  497. package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
  498. # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
  499. other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
  500. win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整
  501. model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整
  502. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
  503. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
  504. def changeIndexFromWordToWords(tokens,word_index):
  505. '''
  506. @summary:转换某个字的字偏移为词偏移
  507. '''
  508. before_index = 0
  509. after_index = 0
  510. for i in range(len(tokens)):
  511. after_index = after_index+len(tokens[i])
  512. if before_index<=word_index and after_index>=word_index:
  513. return i
  514. before_index = after_index
  515. package_names = []
  516. def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
  517. '''
  518. @summary:抽取包附近的标段号
  519. @param:
  520. tokens:包所在句子的分词
  521. word_index:包所在字偏移
  522. size:左右各取多少个词
  523. pattern:提取标段号的正则
  524. @return: type:string,meaning:标段号
  525. '''
  526. index = changeIndexFromWordToWords(tokens,word_index)
  527. if index<size:
  528. begin = index
  529. else:
  530. begin = index-size
  531. if index+size>len(tokens):
  532. end = len(tokens)
  533. else:
  534. end = index+size
  535. #拿到左右两边的词语组成短语
  536. text = "".join(tokens[begin:end])
  537. #在短语中的字偏移
  538. new_word_index = word_index-len("".join(tokens[:begin]))
  539. min_distance = len(text)
  540. packageCode = None
  541. for the_iter in re.finditer(pattern,text):
  542. #算出最小距离
  543. distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
  544. if distance<min_distance:
  545. min_distance = distance
  546. packageCode = the_iter.group(1)
  547. return packageCode
  548. #从标段介绍表格中提取包名和包号
  549. for i in range(len(list_sentence)):
  550. content = list_sentence[i].sentence_text
  551. names = re.findall(package_name_pattern,content)
  552. if names == []:
  553. names = re.findall(other_package_pattern, content)
  554. N_names = re.findall(package_N_name_pattern,content)
  555. if len(names)==1 and len(N_names)==1:
  556. package_names.append([names[0][-1],N_names[0][-1]])
  557. for i in range(len(list_sentence)):
  558. PackageList_item = []
  559. PackageList_item_scope = []
  560. content = list_sentence[i].sentence_text
  561. tokens = list_sentence[i].tokens
  562. for name in package_names[:20]:
  563. for index in findAllIndex(name[0],content):
  564. temp_package_number = re.findall(number_pattern,name[1])[0]
  565. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
  566. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
  567. code = extractPackageCode(tokens, index)
  568. if code is not None:
  569. dict_packageCode[temp_package_number] = code
  570. PackageSet.add(temp_package_number)
  571. for iter in re.finditer(package_number_pattern,content):
  572. temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
  573. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  574. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  575. code = extractPackageCode(tokens, iter.span()[0])
  576. if code is not None:
  577. dict_packageCode[temp_package_number] = code
  578. PackageSet.add(temp_package_number)
  579. #识别packageScope
  580. for iter in re.finditer(pattern_packageScope,content):
  581. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  582. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  583. PackageList_item_scope = PackageList_item +PackageList_item_scope
  584. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  585. PackageList_scope = PackageList_scope+PackageList_item_scope
  586. PackageList_item.sort(key=lambda x:x["sentence_index"])
  587. #PackageList = PackageList+PackageList_item
  588. #不作为包
  589. # if len(PackageSet)==0:
  590. # for i in range(len(list_sentence)):
  591. # PackageList_item = []
  592. # PackageList_item_scope = []
  593. # content = list_sentence[i].sentence_text
  594. # tokens = list_sentence[i].tokens
  595. # for iter in re.finditer(other_package_pattern,content):
  596. # temp_package_number = iter.group(2)
  597. # PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  598. # # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  599. # code = extractPackageCode(tokens, iter.span()[0])
  600. # if code is not None:
  601. # dict_packageCode[temp_package_number] = code
  602. # PackageSet.add(temp_package_number)
  603. # #识别packageScope
  604. # for iter in re.finditer(pattern_packageScope,content):
  605. # PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  606. # # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  607. # PackageList_item_scope = PackageList_item +PackageList_item_scope
  608. # PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  609. # PackageList_scope = PackageList_scope+PackageList_item_scope
  610. # PackageList_item.sort(key=lambda x:x["sentence_index"])
  611. # 2020/11/23 大网站规则 调整
  612. if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
  613. for i in range(len(list_sentence)):
  614. PackageList_item = []
  615. PackageList_item_scope = []
  616. content = list_sentence[i].sentence_text
  617. tokens = list_sentence[i].tokens
  618. names = re.findall(other_package_pattern, content)
  619. N_names = re.findall(win_tenderer_pattern, content)
  620. if len(names) != 1 or len(N_names) != 1:
  621. continue
  622. for iter in re.finditer(other_package_pattern,content):
  623. temp_package_number = iter.group(4)
  624. xinghao = re.search(model_pattern, content)
  625. if xinghao:
  626. temp_package_number = temp_package_number + '+' + xinghao.group(2)
  627. # print('新正则采购包名补充',temp_package_number)
  628. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  629. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  630. code = extractPackageCode(tokens, iter.span()[0])
  631. if code is not None:
  632. dict_packageCode[temp_package_number] = code
  633. PackageSet.add(temp_package_number)
  634. #识别packageScope
  635. for iter in re.finditer(pattern_packageScope,content):
  636. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  637. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  638. PackageList_item_scope = PackageList_item +PackageList_item_scope
  639. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  640. PackageList_scope = PackageList_scope+PackageList_item_scope
  641. PackageList_item.sort(key=lambda x:x["sentence_index"])
  642. pattern_punctuation = "[::()\(\),,。;;]"
  643. for i in range(len(list_sentence)):
  644. for j in range(len(PackageList_scope)):
  645. if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
  646. _flag = False
  647. left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
  648. right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
  649. _left_find = re.findall(pattern_punctuation,left_str)
  650. _right_find = re.findall(pattern_punctuation,right_str)
  651. #print(left_str)
  652. if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
  653. continue
  654. if re.search("划分",right_str[:10]) is not None:
  655. continue
  656. if len(_left_find)>0 and _left_find[-1] in [":",":"]:
  657. _flag = True
  658. if len(_right_find)>0 and _right_find[0] in [":",":"]:
  659. _flag = True
  660. if _flag:
  661. scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
  662. else:
  663. if j==0:
  664. scope_begin = [0,0]
  665. else:
  666. scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
  667. if j==len(PackageList_scope)-1:
  668. scope_end = [PackageList_scope[j]["sentence_index"],changeIndexFromWordToWords(list_sentence[i].tokens, len(list_sentence[i].sentence_text))]
  669. else:
  670. scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
  671. if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
  672. continue
  673. #add package to entity
  674. _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"])
  675. list_entity.append(_pack_entity)
  676. copy_pack = copy.copy(PackageList_scope[j])
  677. copy_pack["scope"] = [scope_begin,scope_end]
  678. copy_pack["hit"] = set()
  679. copy_pack["pointer"] = _pack_entity
  680. PackageList.append(copy_pack)
  681. return PackageList,PackageSet,dict_packageCode
def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4):
    '''
    Link money amounts and contact persons to the roles stored in PackDict.

    @param:
        PackDict: per-article package dict; each value holds "code", "tendereeMoney" and "roleList"
        roleSet: company names of all roles in the article
        PackageList: package records of the article (dicts read here via "name" and, when present, "hit")
        PackageSet: names of all packages in the article
        list_entity: all model-processed entities of the article
        on_value: probability threshold for the money model
        on_value_person: probability threshold for the contact-person model
        sentence_len: maximum sentence distance allowed between a company and its attribute
    @return: PackDict, updated in place; every role in each "roleList" is replaced by the
             result of its getString() and packs collected in list_pop are removed
    '''
    # NOTE(review): the reviewed copy of this file had lost its indentation; the block
    # nesting below was reconstructed from the logic - confirm against the canonical source.

    # Add money to the role list by role id.
    def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
        for i in range(len(packDict[packageName]["roleList"])):
            if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
                # Only overwrite when the new amount is more probable than the stored one.
                if money_prob>packDict[packageName]["roleList"][i].money_prob:
                    packDict[packageName]["roleList"][i].money = money
                    packDict[packageName]["roleList"][i].money_prob = money_prob
        return packDict
    # Add money to the role list by entity name; `money` is a money entity, not a number.
    def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
        for i in range(len(packDict[packageName]["roleList"])):
            if packDict[packageName]["roleList"][i].entity_text==entity:
                # if money_prob>packDict[packageName]["roleList"][i].money_prob:
                # packDict[packageName]["roleList"][i].money = money
                # packDict[packageName]["roleList"][i].money_prob = money_prob
                if packDict[packageName]["roleList"][i].money_prob==0 : # 2021/7/20: first time an amount is set for this role
                    packDict[packageName]["roleList"][i].money = money.entity_text
                    packDict[packageName]["roleList"][i].money_prob = money_prob
                elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20: prefer amounts whose notes mark them as '大写'
                    packDict[packageName]["roleList"][i].money = money.entity_text
                    packDict[packageName]["roleList"][i].money_prob = money_prob
        return packDict
    # Get the role name of the first role whose entity text matches (implicitly None when absent).
    def getRoleWithText(packDict,entity_text):
        for pack in packDict.keys():
            for i in range(len(packDict[pack]["roleList"])):
                if packDict[pack]["roleList"][i].entity_text==entity_text:
                    return packDict[pack]["roleList"][i].role_name
    # True when the entity itself or any of its linked entities is named in RoleSet
    # (implicitly None otherwise).
    def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
        _list_entitys = [entity]+entity.linked_entitys
        for _entity in _list_entitys:
            if _entity.entity_text in RoleSet:
                return True
    p_entity = 0
    # 2021/7/19: compare amounts in order; when an earlier amount is exactly 10000 times a
    # later one, divide the earlier amount by 10000.
    # NOTE(review): j starts at 1 rather than i+1, so pairs with j <= i are compared too,
    # which does not match the "earlier vs later" wording above - confirm intent.
    # NOTE(review): the Decimal division raises if money_list[j].entity_text is zero -
    # confirm that upstream never produces a zero amount here.
    money_list = [it for it in list_entity if it.entity_type=="money"]
    for i in range(len(money_list)-1):
        for j in range(1, len(money_list)):
            if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
                    Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
                money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
    # Walk through all entities.
    while(p_entity<len(list_entity)):
        entity = list_entity[p_entity]
        # The two string literals below are disabled code kept as inert expressions.
        '''
        #招标金额从后往前找
        if entity.entity_type=="money":
            if entity.values[entity.label]>=on_value:
                if str(entity.label)=="0":
                    packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
                    if packagePointer is None:
                        packageName = "Project"
                    else:
                        packageName = packagePointer.entity_text
                    addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
        '''
        ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
        if entity.entity_type=="person":
            if entity.values[entity.label]>=on_value_person:
                if str(entity.label)=="1":
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="tenderee":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                            # add pointer_person
                            for _entity in list_entity:
                                if dict_role_id.get(str(_entity.label))=="tenderee":
                                    for i in range(len(PackDict["Project"]["roleList"])):
                                        if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
                                            _entity.pointer_person = entity
                elif str(entity.label)=="2":
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="agency":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                            # add pointer_person
                            for _entity in list_entity:
                                if dict_role_id.get(str(_entity.label))=="agency":
                                    for i in range(len(PackDict["Project"]["roleList"])):
                                        if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
                                            _entity.pointer_person = entity
        '''
        # # Disabled: look backwards from a money entity for the entity it belongs to.
        # if entity.entity_type=="money":
        # if entity.values[entity.label]>=on_value:
        # p_entity_money= p_entity
        # entity_money = list_entity[p_entity_money]
        # if len(PackageSet)>0:
        # packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
        # if packagePointer is None:
        # packageName_entity = "Project"
        # else:
        # packageName_entity = packagePointer.entity_text
        # else:
        # packageName_entity = "Project"
        # while(p_entity_money>0):
        # entity_before = list_entity[p_entity_money]
        # if entity_before.entity_type in ['org','company']:
        # if str(entity_before.label)=="1":
        # addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
        # #add pointer_money
        # entity_before.pointer_money = entity_money
        # break
        # p_entity_money -= 1
        # If the entity belongs to the role set, scan forward for its attributes.
        if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
            p_entity += 1
            # Keep scanning until a stop condition is met.
            while(p_entity<len(list_entity)):
                entity_after = list_entity[p_entity]
                # Too many sentences away: step back so the outer loop revisits this entity.
                if entity_after.sentence_index-entity.sentence_index>=sentence_len:
                    p_entity -= 1
                    break
                # Stop at the next company entity; step back so the outer loop handles it.
                if entity_after.entity_type in ['org','company']:
                    p_entity -= 1
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="money":
                        if entity_after.values[entity_after.label]>=on_value:
                            # Disabled code kept as an inert string literal.
                            '''
                            #招标金额从后往前找
                            if str(entity_after.label)=="0":
                                packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
                                if packagePointer is None:
                                    packageName = "Project"
                                else:
                                    packageName = packagePointer.entity_text
                                addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
                            '''
                            if str(entity_after.label)=="1":
                                #print(entity_after.entity_text,entity.entity_text)
                                _list_entitys = [entity]+entity.linked_entitys
                                # Resolve which package the amount belongs to; fall back to "Project".
                                if len(PackageSet)>0:
                                    packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
                                    if packagePointer is None:
                                        packageName_entity = "Project"
                                    else:
                                        packageName_entity = packagePointer.entity_text
                                else:
                                    packageName_entity = "Project"
                                if str(entity.label) in ["2","3","4"]:
                                    # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
                                    if entity_after.notes == '单价':
                                        # Amounts noted as '单价' are linked with a fixed probability of 0.5
                                        # and the scan continues.
                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
                                                         0.5)
                                        entity.pointer_money = entity_after
                                    else:
                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
                                                         entity_after.values[entity_after.label])
                                        entity.pointer_money = entity_after
                                        break # 2021/7/16 added: stop once an amount that is not '单价' is found; do not look further
                                    #add pointer_money
                                    # entity.pointer_money = entity_after
                                    # if entity_after.notes!='单价':
                                    # break
                    # Disabled person-linking code kept as an inert string literal.
                    '''
                    if entity_after.entity_type=="person":
                        if entity_after.values[entity_after.label]>=on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="tenderee":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="2":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="agency":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="3":
                                _list_entitys = [entity]+entity.linked_entitys
                                for _entity in _list_entitys:
                                    for i in range(len(roleList)):
                                        if roleList[i].entity_text==_entity.entity_text:
                                            if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                                break
                                            roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                    '''
                p_entity += 1
        p_entity += 1
    ''''''
    # 1) If the first company entity is the tenderee, check whether the next entity is the agency;
    #    if so, link the contacts with a one-position swap.
    # 2) Look for contacts later in the same sentence.
    # 3) If still unlinked, look for contacts across the whole article.
    temp_ent_list = [] # temporary list of (text, label-or-phone, entity) for label 0/1 roles and label 3 contacts
    other_person = [] # names of label-3 contacts above the threshold
    link_person = [] # names of persons for which a link was made below
    other_ent = []
    link_ent = []
    found_person = False
    ent_list = []
    for entity in list_entity:
        if entity.entity_type in ['org','company','person']:
            ent_list.append(entity)
    #for list_index in range(len(ent_list)):
    #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
    #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
    #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
    # 2020/11/25: collect persons whose role is already certain (label 1 or 2).
    sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
    for index in range(len(ent_list)):
        entity = ent_list[index]
        if entity.entity_type=="person":
            if str(entity.label) == "0": # 2020/11/25: not a contact person, skip
                continue
            if entity.values[entity.label]>on_value_person:
                if str(entity.label)=="1":
                    # Label 1: attach the person to every "tenderee" role of the "Project" pack.
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="tenderee":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                            link_person.append(entity.entity_text)
                            link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
                            # add pointer_person
                            for _entity in list_entity:
                                if dict_role_id.get(str(_entity.label))=="tenderee":
                                    for i in range(len(PackDict["Project"]["roleList"])):
                                        if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
                                            _entity.pointer_person = entity
                elif str(entity.label)=="2":
                    # Label 2: same as above for "agency" roles.
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="agency":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                            link_person.append(entity.entity_text)
                            link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
                            # add pointer_person
                            for _entity in list_entity:
                                if dict_role_id.get(str(_entity.label))=="agency":
                                    for i in range(len(PackDict["Project"]["roleList"])):
                                        if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
                                            _entity.pointer_person = entity
                elif str(entity.label)=="3":
                    if entity.entity_text in sure_person_set: # 2020/11/25: skip contacts whose role is already certain
                        continue
                    #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
                    other_person.append(entity.entity_text)
                    temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
        #if entity.entity_text in roleSet:
        if entity.entity_text in roleSet:
            if entity.label in [0,1]:
                other_ent.append(entity.entity_text)
                temp_ent_list.append((entity.entity_text, entity.label,entity))
            for behind_index in range(index+1, len(ent_list)):
                entity_after = ent_list[behind_index]
                if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # only look for contacts within the same sentence
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="person":
                        if str(entity_after.label) == "0": # 2020/11/25: a non-contact person follows the role, stop scanning
                            break
                        if entity_after.values[entity_after.label]>on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(PackDict["Project"]["roleList"])):
                                    if PackDict["Project"]["roleList"][i].role_name=="tenderee":
                                        PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
                                        link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
                            elif str(entity_after.label)=="2":
                                for i in range(len(PackDict["Project"]["roleList"])):
                                    if PackDict["Project"]["roleList"][i].role_name=="agency":
                                        PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
                                        link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
                            elif str(entity_after.label)=="3":
                                if entity_after.entity_text in sure_person_set: # 2020/11/25: name already belongs to a certain-role contact, stop scanning
                                    break
                                elif entity_after.begin_index - entity.end_index > 30:# 2020/10/25: role and contact are further apart than the threshold, stop
                                    break
                                for pack in PackDict.keys():
                                    for i in range(len(PackDict[pack]["roleList"])):
                                        if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
                                            #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                            #break
                                            PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                            link_person.append(entity_after.entity_text)
                                            #add pointer_person
                                            entity.pointer_person = entity_after
    # Fallback pass for contacts and role entities that are still unlinked.
    not_link_person = [person for person in other_person if person not in link_person]
    not_link_ent = [ent for ent in other_ent if ent not in link_ent]
    if len(not_link_person) > 0 and len(not_link_ent) > 0 :
        item = temp_ent_list
        # Pattern (label-0 role, label-1 role, person, person): swap the second role with the
        # first person so each role is directly followed by a person.
        for i in range(len(item)):
            if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
                if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
                    item[i+1], item[i+2] = item[i+2], item[i+1]
        for i in range(len(item)-1, -1, -1):
            if item[i][0] in not_link_ent:
                for pack in PackDict.keys():
                    for role in PackDict[pack]["roleList"]:
                        if role.entity_text == item[i][0] and len(role.linklist) < 1:
                            # NOTE(review): as reconstructed, both branches break, so only the
                            # item immediately after the role is ever examined - confirm nesting.
                            for j in range(i+1, len(item)):
                                if item[j][0] in not_link_person:
                                    role.linklist.append(item[j][:2])
                                    #add pointer_person
                                    item[i][2].pointer_person = item[j][2]
                                    break
                                else:
                                    break
    # Find the tendering amount of each package (multi-package case), scanning backwards.
    p_entity = len(list_entity)-1
    set_tenderer_money = set()
    list_tenderer_money = [] # 2021/7/16 added: all label-1 amounts, stored in reverse document order
    # Walk through all entities from last to first.
    while(p_entity>=0):
        entity = list_entity[p_entity]
        if entity.entity_type=="money":
            if entity.values[entity.label]>=on_value:
                if str(entity.label)=="1":
                    set_tenderer_money.add(float(entity.entity_text))
                    list_tenderer_money.append(float(entity.entity_text)) # 2021/7/16 added: see list_tenderer_money above
                # if str(entity.label)=="0":
                if str(entity.label)=="0" and entity.notes!='总投资':
                    # Disabled duplicate-skipping code kept as an inert string literal.
                    '''
                    if p_entity>0:
                        p_before = list_entity[p_entity-1]
                        if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
                            p_entity -= 1
                            continue
                    '''
                    packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
                    if packagePointer is None:
                        packageName = "Project"
                    else:
                        packageName = packagePointer.entity_text
                    if packageName=="Project":
                        # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
                        # PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                        if entity.values[entity.label]>on_value:
                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                    else:
                        PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
                        #add pointer_tendereeMoney
                        packagePointer.pointer_tendereeMoney = entity
        p_entity -= 1
    # Original notes: remove data where one organisation has several roles;
    # remove duplicated persons; probabilities are not returned.
    final_roleList = []
    list_pop = []
    set_tenderer_role = set()
    dict_pack_tenderer_money = dict()
    for pack in PackDict.keys():
        # Mark packs with no code, no tendering amount and no roles for removal.
        if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
            list_pop.append(pack)
        for i in range(len(PackDict[pack]["roleList"])):
            if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
                # Winning bidders that still have no amount are candidates for the fallbacks below.
                if PackDict[pack]["roleList"][i].money==0:
                    set_tenderer_role.add(PackDict[pack]["roleList"][i])
                    dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
    # Collect, per package, the amounts recorded in its "hit" set ("money-<amount>-..." strings).
    for _index in range(len(PackageList)):
        if "hit" in PackageList[_index]:
            for _hit in list(PackageList[_index]["hit"]):
                _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
                if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
                    dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
    # Exactly one winning bidder and exactly one distinct label-1 amount.
    if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
        list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
    # One winning bidder and several distinct label-1 amounts.
    if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
        _maxMoney = 0
        _sumMoney = 0
        for _m in list(set_tenderer_money):
            _sumMoney += _m
            if _m>_maxMoney:
                _maxMoney = _m
        # Sum equals twice the maximum: the maximum is treated as the total of the other items.
        if _sumMoney/_maxMoney==2:
            list(set_tenderer_role)[0].money = _maxMoney
        else:
            # list(set_tenderer_role)[0].money = _maxMoney
            # list_tenderer_money[-1] is the first label-1 amount in document order (list is reversed).
            if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
                list(set_tenderer_role)[0].money = min(list_tenderer_money)
            else:
                list(set_tenderer_role)[0].money = list_tenderer_money[-1] # 2021/7/16 changed: otherwise take the first amount in document order
    # Every package found exactly one amount.
    _flag_pack_money = True
    for k,v in dict_pack_tenderer_money.items():
        if len(v[1])!=1:
            _flag_pack_money = False
    if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
        for k,v in dict_pack_tenderer_money.items():
            v[0].money = list(v[1])[0]
    # 2021/7/16: scale down role amounts that are far larger than the package's tendering amount.
    for pack in PackDict.keys():
        for i in range(len(PackDict[pack]["roleList"])):
            if PackDict[pack]["tendereeMoney"] > 0:
                if float(PackDict[pack]["roleList"][i].money) >10000000 and \
                        float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
    # 2021/7/19: scale down role amounts that are far larger than the other role amounts in the pack.
    for pack in PackDict.keys():
        tmp_moneys = []
        for i in range(len(PackDict[pack]["roleList"])):
            if float(PackDict[pack]["roleList"][i].money) >100000:
                tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
        if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
            for i in range(len(PackDict[pack]["roleList"])):
                if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
    # Replace every role object by its getString() result, then drop the packs marked above.
    for pack in PackDict.keys():
        for i in range(len(PackDict[pack]["roleList"])):
            PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
    for item in list_pop:
        PackDict.pop(item)
    return PackDict
  1108. def initPackageAttr(RoleList,PackageSet):
  1109. '''
  1110. @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
  1111. '''
  1112. packDict = dict()
  1113. packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]}
  1114. for item in list(PackageSet):
  1115. packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]}
  1116. for item in RoleList:
  1117. if packDict[item.packageName]["code"] =="":
  1118. packDict[item.packageName]["code"] = item.packageCode
  1119. packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
  1120. return packDict
  1121. def getPackageRoleMoney(list_sentence,list_entity):
  1122. '''
  1123. @param:
  1124. list_sentence:文章的句子list
  1125. list_entity:文章的实体list
  1126. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  1127. '''
  1128. # print("=1")
  1129. theRole = getRoleList(list_sentence,list_entity)
  1130. if not theRole:
  1131. return []
  1132. RoleList,RoleSet,PackageList,PackageSet = theRole
  1133. '''
  1134. for item in PackageList:
  1135. # print(item)
  1136. '''
  1137. # print("=2")
  1138. PackDict = initPackageAttr(RoleList, PackageSet)
  1139. # print("=3")
  1140. PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity)
  1141. # print("=4")
  1142. return PackDict
  1143. def turnBidWay(bidway):
  1144. if bidway in ("邀请招标","采购方式:邀请"):
  1145. return "邀请招标"
  1146. elif bidway in ("询价","询单","询比","采购方式:询价"):
  1147. return "询价"
  1148. elif bidway in ("竞谈","竞争性谈判","公开竞谈"):
  1149. return "竞争性谈判"
  1150. elif bidway in ("竞争性磋商","磋商"):
  1151. return "竞争性磋商"
  1152. elif bidway in ("竞价","竞标","电子竞价","以电子竞价","电子书面竞投"):
  1153. return "竞价"
  1154. elif bidway in ("公开招标","网上电子投标","网上招标","采购方式:公开","招标为其他"):
  1155. return "公开招标"
  1156. elif bidway in ("单一来源"):
  1157. return "单一来源"
  1158. elif bidway in ("比选"):
  1159. return "比选"
  1160. else:
  1161. return "其他"
  1162. def getOtherAttributes(list_entity):
  1163. dict_other = {"bidway":"",
  1164. "moneysource":"",
  1165. "person_review":[],
  1166. "time_release":"",
  1167. "time_bidopen":"",
  1168. "time_bidclose":"",
  1169. "serviceTime":"",
  1170. "product":[],
  1171. "total_tendereeMoney":0}
  1172. for entity in list_entity:
  1173. if entity.entity_type == 'bidway':
  1174. dict_other["bidway"] = turnBidWay(entity.entity_text)
  1175. elif entity.entity_type=='moneysource':
  1176. dict_other["moneysource"] = entity.entity_text
  1177. elif entity.entity_type=='serviceTime':
  1178. dict_other["serviceTime"] = entity.entity_text
  1179. elif entity.entity_type == 'time' and entity.label==1:
  1180. dict_other["time_release"] = timeFormat(entity.entity_text)
  1181. elif entity.entity_type == 'time' and entity.label==2:
  1182. dict_other["time_bidopen"] = timeFormat(entity.entity_text)
  1183. elif entity.entity_type == 'time' and entity.label == 3:
  1184. dict_other["time_bidclose"] = timeFormat(entity.entity_text)
  1185. elif entity.entity_type=="person" and entity.label ==4:
  1186. dict_other["person_review"].append(entity.entity_text)
  1187. elif entity.entity_type=='product':
  1188. dict_other["product"].append(entity.entity_text)
  1189. elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
  1190. dict_other["total_tendereeMoney"] = float(entity.entity_text)
  1191. dict_other["product"] = list(set(dict_other["product"]))
  1192. return dict_other
def getMoneyRange(RoleList):
    '''Placeholder that is not implemented: ignores RoleList and returns None.'''
    pass
  1195. def getPREMs(list_sentences,list_entitys,list_articles):
  1196. '''
  1197. @param:
  1198. list_sentence:所有文章的句子list
  1199. list_entity:所有文章的实体list
  1200. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  1201. '''
  1202. result = []
  1203. for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
  1204. RoleList = getPackageRoleMoney(list_sentence,list_entity)
  1205. result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),
  1206. **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
  1207. "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
  1208. "attachmentTypes":list_article.attachmentTypes}))
  1209. return result
if __name__=="__main__":
    # The body consists only of two bare triple-quoted string literals, so nothing
    # runs when the module is executed as a script. The text inside them is retained,
    # disabled code: a database validation query and an HTML report writer.
    '''
    conn = getConnection()
    cursor = conn.cursor()
    #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
    sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
    result = []
    cursor.execute(sql)
    rows = cursor.fetchall()
    count = 0
    for row in rows:
        count += 1
        # print(count)
        doc_id = row[0]
        roleList = getPackageRoleMoney(doc_id)
        result.append([doc_id,str(roleList),row[1]])
    ''''''
    with codecs.open("getAttribute.html","w",encoding="utf8") as f:
        f.write('<html><head>\
        <meta http-equiv="Content-Type"\
        content="text/html; charset=UTF-8">\
        </head>\
        <body bgcolor="#FFFFFF">\
        <table border="1">\
        <tr>\
        <td>doc_id</td>\
        <td>角色</td>\
        </tr>')
        for item in result:
            f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
        f.write("</table></body>")
    '''