getAttributes.py 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241
  1. from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat
  2. from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
  3. import re
  4. import copy
  5. import math
  6. import pandas as pd
  7. import os
  8. def getTheRole(entity,role_list):
  9. '''
  10. @summary:根据实体名称拿到index
  11. @param:
  12. entity:实体名称
  13. role_list:角色list
  14. @return:该实体所在下标
  15. '''
  16. for role_index in range(len(role_list)):
  17. if entity in role_list[role_index]:
  18. return role_index
  19. return None
  20. dict_role_id = {"0":"tenderee",
  21. "1":"agency",
  22. "2":"win_tenderer",
  23. "3":"second_tenderer",
  24. "4":"third_tenderer"}
  25. def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
  26. '''
  27. @param:
  28. packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
  29. sentence_index:实体所在的句子
  30. begin_index:实体所在句子的起始位置
  31. @return:公司实体所属的包
  32. @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
  33. '''
  34. '''
  35. if len(packageList)==0:
  36. return None
  37. before_index = None
  38. after_index = None
  39. equal_index = None
  40. equal_count = 0
  41. for pack_index in range(len(packageList)):
  42. if packageList[pack_index][1]>sentence_index and after_index is None:
  43. after_index = pack_index
  44. if packageList[pack_index][1]<sentence_index:
  45. before_index = pack_index
  46. if packageList[pack_index][1]==sentence_index and equal_index is None:
  47. equal_index = pack_index
  48. #当前句子和之前句子未找到包
  49. if before_index is None and equal_index is None:
  50. return None
  51. else:
  52. if after_index is None:
  53. end_index = len(packageList)
  54. else:
  55. end_index = after_index
  56. #只在当前句子找到一个包号
  57. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  58. return packageList[end_index-1][0]
  59. else:
  60. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  61. if packageList[i][2]>int(begin_index):
  62. if packageList[i-1][4]:
  63. return packageList[i-1][0]
  64. else:
  65. if packageList[i][4]:
  66. return packageList[i-1][0]
  67. else:
  68. return packageList[i][0]
  69. return packageList[end_index-1][0]
  70. '''
  71. if len(packageList)==0:
  72. return None,False
  73. list_legalPack = []
  74. for pack_index in range(len(packageList)):
  75. if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
  76. continue
  77. if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
  78. continue
  79. if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index)) and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
  80. if MAX_DIS is not None:
  81. if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
  82. list_legalPack.append(pack_index)
  83. else:
  84. list_legalPack.append(pack_index)
  85. _flag = True
  86. for _index in list_legalPack:
  87. if roleid in packageList[_index]["hit"]:
  88. continue
  89. else:
  90. _flag = False
  91. packageList[_index]["hit"].add(roleid)
  92. return packageList[_index]["pointer"],_flag
  93. if len(list_legalPack)>0:
  94. return packageList[0]["pointer"],_flag
  95. return None,False
  96. #生成合法的组合
  97. def get_legal_comba(list_entity,dict_role_combination):
  98. #拿到一个包中所有合法的组合
  99. def circle_package(_dict_legal_combination):
  100. list_dict_role_first = []
  101. for _role in _dict_legal_combination:
  102. if len(list_dict_role_first)==0:
  103. for _entity in _dict_legal_combination[_role]:
  104. if _entity !="":
  105. list_dict_role_first.append({_role:_entity})
  106. else:
  107. list_dict_role_after = []
  108. _find_count = 0
  109. for _entity in _dict_legal_combination[_role]:
  110. if _entity !="":
  111. for _dict in list_dict_role_first:
  112. _flag = True
  113. for _key1 in _dict:
  114. if _entity==_dict[_key1]:
  115. #修改为招标人和代理人可以为同一个
  116. if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
  117. _flag = True
  118. else:
  119. _flag = False
  120. if _flag:
  121. _find_count += 1
  122. _new_dict = copy.copy(_dict)
  123. _new_dict[_role] = _entity
  124. if len(list_dict_role_after)>100000:
  125. break
  126. list_dict_role_after.append(_new_dict)
  127. if len(list_dict_role_after)==0:
  128. pass
  129. else:
  130. list_dict_role_first.extend(list_dict_role_after)
  131. return list_dict_role_first
  132. def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
  133. last_layer = False
  134. #若是空组合则放回空
  135. if len(_dict_legal_combination.keys())==0:
  136. return []
  137. #递归到最后一层则修改状态
  138. if len(_dict_legal_combination.keys())==1:
  139. last_layer = True
  140. #取一个角色开始进行遍历
  141. _key_role = list(_dict_legal_combination.keys())[0]
  142. for item in _dict_legal_combination[_key_role]:
  143. copy_dict_one_selution = copy.copy(dict_one_selution)
  144. copy_dict_legal_combination = {}
  145. copy_set_legal_entity = copy.copy(set_legal_entity)
  146. #复制余下的所有角色,进行下一轮递归
  147. for _key in _dict_legal_combination.keys():
  148. if _key!=_key_role:
  149. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  150. #修改为招标人和代理人可以为同一个
  151. if item !="":
  152. _flag = True
  153. if str(_key_role) in ["0","1"]:
  154. for _key_flag in copy_dict_one_selution:
  155. if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
  156. _flag = False
  157. else:
  158. for _key_flag in copy_dict_one_selution:
  159. if copy_dict_one_selution[_key_flag]==item:
  160. _flag = False
  161. if _flag:
  162. copy_dict_one_selution[_key_role] = item
  163. '''
  164. if item not in copy_set_legal_entity:
  165. if item !="":
  166. copy_dict_one_selution[_key_role] = item
  167. '''
  168. copy_set_legal_entity.add(item)
  169. if last_layer:
  170. list_all_selution.append(copy_dict_one_selution)
  171. else:
  172. recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
  173. #递归匹配各个包的结果
  174. def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
  175. last_layer = False
  176. if len(_dict_legal_combination.keys())==0:
  177. return []
  178. if len(_dict_legal_combination.keys())==1:
  179. last_layer = True
  180. _key_pack = list(_dict_legal_combination.keys())[0]
  181. for item in _dict_legal_combination[_key_pack]:
  182. copy_dict_one_selution = copy.copy(dict_one_selution)
  183. copy_dict_legal_combination = {}
  184. for _key in _dict_legal_combination.keys():
  185. if _key!=_key_pack:
  186. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  187. for _key_role in item.keys():
  188. copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
  189. if last_layer:
  190. list_all_selution.append(copy_dict_one_selution)
  191. else:
  192. recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
  193. return list_all_selution
  194. #循环获取所有包组合
  195. def circle_pageages(_dict_legal_combination):
  196. list_all_selution = []
  197. for _key_pack in _dict_legal_combination.keys():
  198. list_key_selution = []
  199. for item in _dict_legal_combination[_key_pack]:
  200. _dict = dict()
  201. for _key_role in item.keys():
  202. _dict[_key_pack+"$$"+_key_role] = item[_key_role]
  203. list_key_selution.append(_dict)
  204. if len(list_all_selution)==0:
  205. list_all_selution = list_key_selution
  206. else:
  207. _list_all_selution = []
  208. for item_1 in list_all_selution:
  209. for item_2 in list_key_selution:
  210. _list_all_selution.append(dict(item_1,**item_2))
  211. list_all_selution = _list_all_selution
  212. return list_all_selution
  213. #拿到各个包解析之后的结果
  214. _dict_legal_combination = {}
  215. for packageName in dict_role_combination.keys():
  216. _list_all_selution = []
  217. # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
  218. _list_all_selution = circle_package(dict_role_combination[packageName])
  219. '''
  220. print("===1")
  221. print(packageName)
  222. for item in _list_all_selution:
  223. print(item)
  224. print("===2")
  225. '''
  226. #去除包含子集
  227. list_all_selution_simple = []
  228. _list_set_all_selution = []
  229. for item_selution in _list_all_selution:
  230. item_set_selution = set()
  231. for _key in item_selution.keys():
  232. item_set_selution.add((_key,item_selution[_key]))
  233. _list_set_all_selution.append(item_set_selution)
  234. if len(_list_set_all_selution)>1000:
  235. _dict_legal_combination[packageName] = _list_all_selution
  236. continue
  237. for i in range(len(_list_set_all_selution)):
  238. be_included = False
  239. for j in range(len(_list_set_all_selution)):
  240. if i!=j:
  241. if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
  242. be_included = True
  243. if not be_included:
  244. list_all_selution_simple.append(_list_all_selution[i])
  245. _dict_legal_combination[packageName] = list_all_selution_simple
  246. _list_final_comba = []
  247. #对各个包的结果进行排列组合
  248. _comba_count = 1
  249. for _key in _dict_legal_combination.keys():
  250. _comba_count *= len(_dict_legal_combination[_key])
  251. #如果过大,则每个包只取概率最大的那个
  252. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  253. if _comba_count>250:
  254. new_dict_legal_combination = dict()
  255. for _key_pack in _dict_legal_combination.keys():
  256. MAX_PROB = -1000
  257. _MAX_PROB_COMBA = None
  258. for item in _dict_legal_combination[_key_pack]:
  259. # print(_key_pack,item)
  260. _dict = dict()
  261. for _key in item.keys():
  262. _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
  263. _prob = getSumExpectation(dict_pack_entity_prob, _dict)
  264. if _prob>MAX_PROB:
  265. MAX_PROB = _prob
  266. _MAX_PROB_COMBA = [item]
  267. if _MAX_PROB_COMBA is not None:
  268. new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
  269. _dict_legal_combination = new_dict_legal_combination
  270. #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
  271. _list_final_comba = circle_pageages(_dict_legal_combination)
  272. #除了Project包(招标人和代理人),其他包是不会有冲突的
  273. #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪
  274. _list_real_comba = []
  275. for dict_item in _list_final_comba:
  276. set_project = set()
  277. set_other = set()
  278. for _key in list(dict_item.keys()):
  279. if _key.split("$$")[0]=="Project":
  280. set_project.add(dict_item[_key])
  281. else:
  282. set_other.add(dict_item[_key])
  283. set_common = set_project&set_other
  284. if len(set_common)>0:
  285. dict_project = {}
  286. dict_not_project = {}
  287. for _key in list(dict_item.keys()):
  288. if dict_item[_key] in set_common:
  289. if str(_key.split("$$")[0])=="Project":
  290. dict_project[_key] = dict_item[_key]
  291. else:
  292. dict_not_project[_key] = dict_item[_key]
  293. else:
  294. dict_project[_key] = dict_item[_key]
  295. dict_not_project[_key] = dict_item[_key]
  296. _list_real_comba.append(dict_project)
  297. _list_real_comba.append(dict_not_project)
  298. else:
  299. _list_real_comba.append(dict_item)
  300. return _list_real_comba
  301. def get_dict_entity_prob(list_entity,on_value=0.5):
  302. dict_pack_entity_prob = {}
  303. for entity in list_entity:
  304. if entity.entity_type in ['org','company']:
  305. values = entity.values
  306. role_prob = float(values[int(entity.label)])
  307. _key = entity.packageName+"$$"+str(entity.label)
  308. if role_prob>=on_value and str(entity.label)!="5":
  309. _key_prob = _key+"$text$"+entity.entity_text
  310. if _key_prob in dict_pack_entity_prob:
  311. if role_prob>dict_pack_entity_prob[_key_prob][1]:
  312. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  313. else:
  314. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  315. return dict_pack_entity_prob
  316. #计算合计期望
  317. def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
  318. '''
  319. expect = 0
  320. for entity in list_entity:
  321. if entity.entity_type in ['org','company']:
  322. values = entity.values
  323. role_prob = float(values[int(entity.label)])
  324. _key = entity.packageName+"$$"+str(entity.label)
  325. if role_prob>on_value and str(entity.label)!="5":
  326. if _key in combination.keys() and combination[_key]==entity.entity_text:
  327. expect += math.pow(role_prob,4)
  328. else:
  329. expect -= math.pow(role_prob,4)
  330. '''
  331. #修改为同一个实体只取对应包-角色的最大的概率值
  332. expect = 0
  333. dict_entity_prob = {}
  334. for _key_pack_entity in dict_pack_entity_prob:
  335. _key_pack = _key_pack_entity.split("$text$")[0]
  336. role_prob = dict_pack_entity_prob[_key_pack_entity][1]
  337. if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
  338. if _key_pack_entity in dict_entity_prob.keys():
  339. if dict_entity_prob[_key_pack_entity]<role_prob:
  340. dict_entity_prob[_key_pack_entity] = role_prob
  341. else:
  342. dict_entity_prob[_key_pack_entity] = role_prob
  343. else:
  344. if _key_pack_entity in dict_entity_prob.keys():
  345. if dict_entity_prob[_key_pack_entity]>-role_prob:
  346. dict_entity_prob[_key_pack_entity] = -role_prob
  347. else:
  348. dict_entity_prob[_key_pack_entity] = -role_prob
  349. # for entity in list_entity:
  350. # if entity.entity_type in ['org','company']:
  351. # values = entity.values
  352. # role_prob = float(values[int(entity.label)])
  353. # _key = entity.packageName+"$$"+str(entity.label)
  354. # if role_prob>=on_value and str(entity.label)!="5":
  355. # if _key in combination.keys() and combination[_key]==entity.entity_text:
  356. # _key_prob = _key+entity.entity_text
  357. # if _key_prob in dict_entity_prob.keys():
  358. # if dict_entity_prob[_key_prob]<role_prob:
  359. # dict_entity_prob[_key_prob] = role_prob
  360. # else:
  361. # dict_entity_prob[_key_prob] = role_prob
  362. # else:
  363. # _key_prob = _key+entity.entity_text
  364. # if _key_prob in dict_entity_prob.keys():
  365. # if dict_entity_prob[_key_prob]>-role_prob:
  366. # dict_entity_prob[_key_prob] = -role_prob
  367. # else:
  368. # dict_entity_prob[_key_prob] = -role_prob
  369. for _key in dict_entity_prob.keys():
  370. symbol = 1 if dict_entity_prob[_key]>0 else -1
  371. expect += symbol*math.pow(dict_entity_prob[_key],2)
  372. return expect
def getRoleList(list_sentence,list_entity,on_value = 0.5):
    '''
    @summary: Search over all non-conflicting role combinations and return the
              one with the largest total expectation.
    @param:
        list_sentence: all sentences of the article
        list_entity: all entities of the article; matching entities get
                     packageCode / roleName / packageName (and possibly
                     pointer_pack) written in place
        on_value: probability threshold for an entity to take part
    @return: None when getPackagesFromArticle returns None, otherwise the
             tuple (RoleList, RoleSet, PackageList, PackageSet)
    '''
    pack = getPackagesFromArticle(list_sentence,list_entity)
    if pack is None:
        return None
    PackageList,PackageSet,dict_PackageCode = pack
    # Collect every possible role candidate:
    # {packageName: {role label: set of entity texts}}
    dict_role_combination = {}
    # Work out packageName / packageCode for each entity
    for entity in list_entity:
        if entity.entity_type in ['org','company']:
            # Skip entities whose text has 3 characters or fewer
            # (the original note said "fewer than 3"; the code uses <=3)
            if len(entity.entity_text)<=3:
                continue
            values = entity.values
            role_prob = float(values[int(entity.label)])
            if role_prob>=on_value and str(entity.label)!="5":
                # Labels "0" and "1" always go to the "Project" package
                if str(entity.label) in ["0","1"]:
                    packageName = "Project"
                else:
                    if len(PackageSet)>0:
                        # Position passed to getPackage is the entity's end_index;
                        # the role id is "role-<label>"
                        packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.end_index,"role-"+str(entity.label))
                        if packagePointer is None:
                            #continue
                            # No package found: fall back to "Project"
                            packageName = "Project"
                        else:
                            # Remember which package the entity points to
                            entity.pointer_pack = packagePointer
                            packageName = packagePointer.entity_text
                    else:
                        packageName = "Project"
                find_flag = False
                if packageName in dict_PackageCode.keys():
                    packageCode = dict_PackageCode[packageName]
                else:
                    packageCode = ""
                entity.packageCode = packageCode
                role_name = dict_role_id.get(str(entity.label))
                entity.roleName = role_name
                entity.packageName = packageName
                if entity.packageName in dict_role_combination.keys():
                    if str(entity.label) in dict_role_combination[entity.packageName].keys():
                        dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
                    else:
                        dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
                else:
                    dict_role_combination[entity.packageName] = {}
                    # Initialise every role 0..4 with the empty ("unassigned") value
                    roleIds = [0,1,2,3,4]
                    for _roleId in roleIds:
                        dict_role_combination[entity.packageName][str(_roleId)] = set([""])
                    dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
    list_real_comba = get_legal_comba(list_entity,dict_role_combination)
    # Pick the combination with the largest expectation
    # (index 0 is kept when no expectation exceeds the initial -100)
    max_index = 0
    max_expect = -100
    _index = 0
    # NOTE(review): called with its default threshold, not with on_value -- confirm intended
    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    for item_combination in list_real_comba:
        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
        if expect>max_expect:
            max_index = _index
            max_expect = expect
        _index += 1
    RoleList = []
    RoleSet = set()
    if len(list_real_comba)>0:
        # Keys of a combination have the form "<packageName>$$<label>"
        for _key in list_real_comba[max_index].keys():
            packageName = _key.split("$$")[0]
            label = _key.split("$$")[1]
            role_name = dict_role_id.get(str(label))
            entity_text = list_real_comba[max_index][_key]
            if packageName in dict_PackageCode.keys():
                packagecode = dict_PackageCode.get(packageName)
            else:
                packagecode = ""
            RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
            RoleSet.add(entity_text)
    # Use the chosen combination to correct the entity -> package links in
    # list_entity: a link is dropped when no PREM has the same package name
    # and entity text
    for _entity in list_entity:
        if _entity.pointer_pack is not None:
            _pack_name = _entity.pointer_pack.entity_text
            _find_flag = False
            for _prem in RoleList:
                if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
                    _find_flag = True
            if not _find_flag:
                _entity.pointer_pack = None
    return RoleList,RoleSet,PackageList,PackageSet
  469. def getPackageScopePattern():
  470. '''
  471. @summary: 获取包的作用域关键词
  472. '''
  473. df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
  474. pattern = "("
  475. for item in df["list_word"]:
  476. item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
  477. pattern += item+"|"
  478. pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
  479. return pattern
# Package-scope regex source, built once when the module is imported
# (getPackageScopePattern reads end.xls, so importing this module does file I/O).
pattern_packageScope = getPackageScopePattern()
  481. def getPackagesFromArticle(list_sentence,list_entity):
  482. '''
  483. @param:
  484. list_sentence:文章的句子list
  485. @summary: 将包的信息插入list_entity中
  486. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  487. '''
  488. if len(list_sentence)==0:
  489. return None
  490. PackageList = []
  491. PackageList_scope = []
  492. PackageSet = set()
  493. dict_packageCode = dict()
  494. package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
  495. package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
  496. package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
  497. # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
  498. other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{2,50}?)[,。]') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
  499. win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{2,25})[,。]') # 2020/11/23 大网站规则 调整
  500. model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]') # 2020/11/23 大网站规则 调整
  501. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
  502. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
  503. def changeIndexFromWordToWords(tokens,word_index):
  504. '''
  505. @summary:转换某个字的字偏移为词偏移
  506. '''
  507. before_index = 0
  508. after_index = 0
  509. for i in range(len(tokens)):
  510. after_index = after_index+len(tokens[i])
  511. if before_index<=word_index and after_index>=word_index:
  512. return i
  513. before_index = after_index
  514. package_names = []
  515. def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
  516. '''
  517. @summary:抽取包附近的标段号
  518. @param:
  519. tokens:包所在句子的分词
  520. word_index:包所在字偏移
  521. size:左右各取多少个词
  522. pattern:提取标段号的正则
  523. @return: type:string,meaning:标段号
  524. '''
  525. index = changeIndexFromWordToWords(tokens,word_index)
  526. if index<size:
  527. begin = index
  528. else:
  529. begin = index-size
  530. if index+size>len(tokens):
  531. end = len(tokens)
  532. else:
  533. end = index+size
  534. #拿到左右两边的词语组成短语
  535. text = "".join(tokens[begin:end])
  536. #在短语中的字偏移
  537. new_word_index = word_index-len("".join(tokens[:begin]))
  538. min_distance = len(text)
  539. packageCode = None
  540. for the_iter in re.finditer(pattern,text):
  541. #算出最小距离
  542. distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
  543. if distance<min_distance:
  544. min_distance = distance
  545. packageCode = the_iter.group(1)
  546. return packageCode
  547. #从标段介绍表格中提取包名和包号
  548. for i in range(len(list_sentence)):
  549. content = list_sentence[i].sentence_text
  550. names = re.findall(package_name_pattern,content)
  551. if names == []:
  552. names = re.findall(other_package_pattern, content)
  553. N_names = re.findall(package_N_name_pattern,content)
  554. if len(names)==1 and len(N_names)==1:
  555. package_names.append([names[0][-1],N_names[0][-1]])
  556. for i in range(len(list_sentence)):
  557. PackageList_item = []
  558. PackageList_item_scope = []
  559. content = list_sentence[i].sentence_text
  560. tokens = list_sentence[i].tokens
  561. for name in package_names[:20]:
  562. for index in findAllIndex(name[0],content):
  563. temp_package_number = re.findall(number_pattern,name[1])[0]
  564. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
  565. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
  566. code = extractPackageCode(tokens, index)
  567. if code is not None:
  568. dict_packageCode[temp_package_number] = code
  569. PackageSet.add(temp_package_number)
  570. for iter in re.finditer(package_number_pattern,content):
  571. temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
  572. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  573. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  574. code = extractPackageCode(tokens, iter.span()[0])
  575. if code is not None:
  576. dict_packageCode[temp_package_number] = code
  577. PackageSet.add(temp_package_number)
  578. #识别packageScope
  579. for iter in re.finditer(pattern_packageScope,content):
  580. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  581. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  582. PackageList_item_scope = PackageList_item +PackageList_item_scope
  583. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  584. PackageList_scope = PackageList_scope+PackageList_item_scope
  585. PackageList_item.sort(key=lambda x:x["sentence_index"])
  586. #PackageList = PackageList+PackageList_item
  587. #不作为包
  588. # if len(PackageSet)==0:
  589. # for i in range(len(list_sentence)):
  590. # PackageList_item = []
  591. # PackageList_item_scope = []
  592. # content = list_sentence[i].sentence_text
  593. # tokens = list_sentence[i].tokens
  594. # for iter in re.finditer(other_package_pattern,content):
  595. # temp_package_number = iter.group(2)
  596. # PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  597. # # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  598. # code = extractPackageCode(tokens, iter.span()[0])
  599. # if code is not None:
  600. # dict_packageCode[temp_package_number] = code
  601. # PackageSet.add(temp_package_number)
  602. # #识别packageScope
  603. # for iter in re.finditer(pattern_packageScope,content):
  604. # PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  605. # # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  606. # PackageList_item_scope = PackageList_item +PackageList_item_scope
  607. # PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  608. # PackageList_scope = PackageList_scope+PackageList_item_scope
  609. # PackageList_item.sort(key=lambda x:x["sentence_index"])
  610. # 2020/11/23 大网站规则 调整
  611. if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
  612. for i in range(len(list_sentence)):
  613. PackageList_item = []
  614. PackageList_item_scope = []
  615. content = list_sentence[i].sentence_text
  616. tokens = list_sentence[i].tokens
  617. names = re.findall(other_package_pattern, content)
  618. N_names = re.findall(win_tenderer_pattern, content)
  619. if len(names) != 1 or len(N_names) != 1:
  620. continue
  621. for iter in re.finditer(other_package_pattern,content):
  622. temp_package_number = iter.group(4)
  623. xinghao = re.search(model_pattern, content)
  624. if xinghao:
  625. temp_package_number = temp_package_number + '+' + xinghao.group(2)
  626. # print('新正则采购包名补充',temp_package_number)
  627. PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  628. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  629. code = extractPackageCode(tokens, iter.span()[0])
  630. if code is not None:
  631. dict_packageCode[temp_package_number] = code
  632. PackageSet.add(temp_package_number)
  633. #识别packageScope
  634. for iter in re.finditer(pattern_packageScope,content):
  635. PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  636. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  637. PackageList_item_scope = PackageList_item +PackageList_item_scope
  638. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  639. PackageList_scope = PackageList_scope+PackageList_item_scope
  640. PackageList_item.sort(key=lambda x:x["sentence_index"])
  641. pattern_punctuation = "[::()\(\),,。;;]"
  642. for i in range(len(list_sentence)):
  643. for j in range(len(PackageList_scope)):
  644. if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
  645. _flag = False
  646. left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
  647. right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
  648. _left_find = re.findall(pattern_punctuation,left_str)
  649. _right_find = re.findall(pattern_punctuation,right_str)
  650. #print(left_str)
  651. if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
  652. continue
  653. if re.search("划分",right_str[:10]) is not None:
  654. continue
  655. if len(_left_find)>0 and _left_find[-1] in [":",":"]:
  656. _flag = True
  657. if len(_right_find)>0 and _right_find[0] in [":",":"]:
  658. _flag = True
  659. if _flag:
  660. scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
  661. else:
  662. if j==0:
  663. scope_begin = [0,0]
  664. else:
  665. scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
  666. if j==len(PackageList_scope)-1:
  667. scope_end = [PackageList_scope[j]["offsetWords_begin"],changeIndexFromWordToWords(list_sentence[i].tokens, len(list_sentence[i].sentence_text))]
  668. else:
  669. scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
  670. if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
  671. continue
  672. #add package to entity
  673. _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"])
  674. list_entity.append(_pack_entity)
  675. copy_pack = copy.copy(PackageList_scope[j])
  676. copy_pack["scope"] = [scope_begin,scope_end]
  677. copy_pack["hit"] = set()
  678. copy_pack["pointer"] = _pack_entity
  679. PackageList.append(copy_pack)
  680. return PackageList,PackageSet,dict_packageCode
  681. def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4):
  682. '''
  683. @param:
  684. PackDict:文章包dict
  685. roleSet:文章所有角色的公司名称
  686. PackageList:文章的包信息
  687. PackageSet:文章所有包的名称
  688. list_entity:文章所有经过模型处理的实体
  689. on_value:金额模型的阈值
  690. on_value_person:联系人模型的阈值
  691. sentence_len:公司和属性间隔句子的最大长度
  692. @return:添加了属性信息的角色list
  693. '''
  694. #根据roleid添加金额到rolelist中
  695. def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
  696. for i in range(len(packDict[packageName]["roleList"])):
  697. if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
  698. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  699. packDict[packageName]["roleList"][i].money = money
  700. packDict[packageName]["roleList"][i].money_prob = money_prob
  701. return packDict
  702. #根据实体名称添加金额到rolelist中
  703. def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
  704. for i in range(len(packDict[packageName]["roleList"])):
  705. if packDict[packageName]["roleList"][i].entity_text==entity:
  706. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  707. packDict[packageName]["roleList"][i].money = money
  708. packDict[packageName]["roleList"][i].money_prob = money_prob
  709. return packDict
  710. #根据实体名称得到角色
  711. def getRoleWithText(packDict,entity_text):
  712. for pack in packDict.keys():
  713. for i in range(len(packDict[pack]["roleList"])):
  714. if packDict[pack]["roleList"][i].entity_text==entity_text:
  715. return packDict[pack]["roleList"][i].role_name
  716. def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
  717. _list_entitys = [entity]+entity.linked_entitys
  718. for _entity in _list_entitys:
  719. if _entity.entity_text in RoleSet:
  720. return True
  721. p_entity = 0
  722. #遍历所有实体
  723. while(p_entity<len(list_entity)):
  724. entity = list_entity[p_entity]
  725. '''
  726. #招标金额从后往前找
  727. if entity.entity_type=="money":
  728. if entity.values[entity.label]>=on_value:
  729. if str(entity.label)=="0":
  730. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  731. if packagePointer is None:
  732. packageName = "Project"
  733. else:
  734. packageName = packagePointer.entity_text
  735. addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
  736. '''
  737. ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
  738. if entity.entity_type=="person":
  739. if entity.values[entity.label]>=on_value_person:
  740. if str(entity.label)=="1":
  741. for i in range(len(PackDict["Project"]["roleList"])):
  742. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  743. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  744. # add pointer_person
  745. for _entity in list_entity:
  746. if dict_role_id.get(str(_entity.label))=="tenderee":
  747. for i in range(len(PackDict["Project"]["roleList"])):
  748. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  749. _entity.pointer_person = entity
  750. elif str(entity.label)=="2":
  751. for i in range(len(PackDict["Project"]["roleList"])):
  752. if PackDict["Project"]["roleList"][i].role_name=="agency":
  753. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  754. # add pointer_person
  755. for _entity in list_entity:
  756. if dict_role_id.get(str(_entity.label))=="agency":
  757. for i in range(len(PackDict["Project"]["roleList"])):
  758. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  759. _entity.pointer_person = entity
  760. '''
  761. #金额往前找实体
  762. if entity.entity_type=="money":
  763. if entity.values[entity.label]>=on_value:
  764. p_entity_money= p_entity
  765. entity_money = list_entity[p_entity_money]
  766. if len(PackageSet)>0:
  767. packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
  768. if packagePointer is None:
  769. packageName_entity = "Project"
  770. else:
  771. packageName_entity = packagePointer.entity_text
  772. else:
  773. packageName_entity = "Project"
  774. while(p_entity_money>0):
  775. entity_before = list_entity[p_entity_money]
  776. if entity_before.entity_type in ['org','company']:
  777. if str(entity_before.label)=="1":
  778. addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
  779. #add pointer_money
  780. entity_before.pointer_money = entity_money
  781. break
  782. p_entity_money -= 1
  783. #如果实体属于角色集合,则往后找属性
  784. if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
  785. p_entity += 1
  786. #循环查找符合的属性
  787. while(p_entity<len(list_entity)):
  788. entity_after = list_entity[p_entity]
  789. if entity_after.sentence_index-entity.sentence_index>=sentence_len:
  790. p_entity -= 1
  791. break
  792. #若是遇到公司实体,则跳出循环
  793. if entity_after.entity_type in ['org','company']:
  794. p_entity -= 1
  795. break
  796. if entity_after.values is not None:
  797. if entity_after.entity_type=="money":
  798. if entity_after.values[entity_after.label]>=on_value:
  799. '''
  800. #招标金额从后往前找
  801. if str(entity_after.label)=="0":
  802. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  803. if packagePointer is None:
  804. packageName = "Project"
  805. else:
  806. packageName = packagePointer.entity_text
  807. addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
  808. '''
  809. if str(entity_after.label)=="1":
  810. #print(entity_after.entity_text,entity.entity_text)
  811. _list_entitys = [entity]+entity.linked_entitys
  812. if len(PackageSet)>0:
  813. packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
  814. if packagePointer is None:
  815. packageName_entity = "Project"
  816. else:
  817. packageName_entity = packagePointer.entity_text
  818. else:
  819. packageName_entity = "Project"
  820. if str(entity.label) in ["2","3","4"]:
  821. addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
  822. #add pointer_money
  823. entity.pointer_money = entity_after
  824. '''
  825. if entity_after.entity_type=="person":
  826. if entity_after.values[entity_after.label]>=on_value_person:
  827. if str(entity_after.label)=="1":
  828. for i in range(len(roleList)):
  829. if roleList[i].role_name=="tenderee":
  830. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  831. elif str(entity_after.label)=="2":
  832. for i in range(len(roleList)):
  833. if roleList[i].role_name=="agency":
  834. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  835. elif str(entity_after.label)=="3":
  836. _list_entitys = [entity]+entity.linked_entitys
  837. for _entity in _list_entitys:
  838. for i in range(len(roleList)):
  839. if roleList[i].entity_text==_entity.entity_text:
  840. if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
  841. break
  842. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  843. '''
  844. p_entity += 1
  845. p_entity += 1
  846. ''''''
  847. # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
  848. temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
  849. other_person = [] # 阈值以上的联系人列表
  850. link_person = [] # 有电话没联系上角色的person列表
  851. other_ent = []
  852. link_ent = []
  853. found_person = False
  854. ent_list = []
  855. for entity in list_entity:
  856. if entity.entity_type in ['org','company','person']:
  857. ent_list.append(entity)
  858. #for list_index in range(len(ent_list)):
  859. #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
  860. #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
  861. #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
  862. # 2020/11/25增加确定角色联系人判断
  863. sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
  864. for index in range(len(ent_list)):
  865. entity = ent_list[index]
  866. if entity.entity_type=="person":
  867. if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
  868. continue
  869. if entity.values[entity.label]>on_value_person:
  870. if str(entity.label)=="1":
  871. for i in range(len(PackDict["Project"]["roleList"])):
  872. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  873. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  874. link_person.append(entity.entity_text)
  875. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  876. # add pointer_person
  877. for _entity in list_entity:
  878. if dict_role_id.get(str(_entity.label))=="tenderee":
  879. for i in range(len(PackDict["Project"]["roleList"])):
  880. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  881. _entity.pointer_person = entity
  882. elif str(entity.label)=="2":
  883. for i in range(len(PackDict["Project"]["roleList"])):
  884. if PackDict["Project"]["roleList"][i].role_name=="agency":
  885. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  886. link_person.append(entity.entity_text)
  887. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  888. # add pointer_person
  889. for _entity in list_entity:
  890. if dict_role_id.get(str(_entity.label))=="agency":
  891. for i in range(len(PackDict["Project"]["roleList"])):
  892. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  893. _entity.pointer_person = entity
  894. elif str(entity.label)=="3":
  895. if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
  896. continue
  897. #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
  898. other_person.append(entity.entity_text)
  899. temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
  900. #if entity.entity_text in roleSet:
  901. if entity.entity_text in roleSet:
  902. if entity.label in [0,1]:
  903. other_ent.append(entity.entity_text)
  904. temp_ent_list.append((entity.entity_text, entity.label,entity))
  905. for behind_index in range(index+1, len(ent_list)):
  906. entity_after = ent_list[behind_index]
  907. if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
  908. break
  909. if entity_after.values is not None:
  910. if entity_after.entity_type=="person":
  911. if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
  912. break
  913. if entity_after.values[entity_after.label]>on_value_person:
  914. if str(entity_after.label)=="1":
  915. for i in range(len(PackDict["Project"]["roleList"])):
  916. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  917. PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  918. link_person.append(entity_after.entity_text)
  919. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  920. elif str(entity_after.label)=="2":
  921. for i in range(len(PackDict["Project"]["roleList"])):
  922. if PackDict["Project"]["roleList"][i].role_name=="agency":
  923. PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  924. link_person.append(entity_after.entity_text)
  925. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  926. elif str(entity_after.label)=="3":
  927. if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
  928. break
  929. elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
  930. break
  931. for pack in PackDict.keys():
  932. for i in range(len(PackDict[pack]["roleList"])):
  933. if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
  934. #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
  935. #break
  936. PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  937. link_person.append(entity_after.entity_text)
  938. #add pointer_person
  939. entity.pointer_person = entity_after
  940. not_link_person = [person for person in other_person if person not in link_person]
  941. not_link_ent = [ent for ent in other_ent if ent not in link_ent]
  942. if len(not_link_person) > 0 and len(not_link_ent) > 0 :
  943. item = temp_ent_list
  944. for i in range(len(item)):
  945. if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
  946. if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
  947. item[i+1], item[i+2] = item[i+2], item[i+1]
  948. for i in range(len(item)-1, -1, -1):
  949. if item[i][0] in not_link_ent:
  950. for pack in PackDict.keys():
  951. for role in PackDict[pack]["roleList"]:
  952. if role.entity_text == item[i][0] and len(role.linklist) < 1:
  953. for j in range(i+1, len(item)):
  954. if item[j][0] in not_link_person:
  955. role.linklist.append(item[j][:2])
  956. #add pointer_person
  957. item[i][2].pointer_person = item[j][2]
  958. break
  959. else:
  960. break
  961. #寻找多标段招标金额
  962. p_entity = len(list_entity)-1
  963. set_tenderer_money = set()
  964. #遍历所有实体
  965. while(p_entity>=0):
  966. entity = list_entity[p_entity]
  967. if entity.entity_type=="money":
  968. if entity.values[entity.label]>=on_value:
  969. if str(entity.label)=="1":
  970. set_tenderer_money.add(float(entity.entity_text))
  971. if str(entity.label)=="0":
  972. '''
  973. if p_entity>0:
  974. p_before = list_entity[p_entity-1]
  975. if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
  976. p_entity -= 1
  977. continue
  978. '''
  979. packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
  980. if packagePointer is None:
  981. packageName = "Project"
  982. else:
  983. packageName = packagePointer.entity_text
  984. if packageName=="Project":
  985. if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  986. PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
  987. else:
  988. PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
  989. #add pointer_tendereeMoney
  990. packagePointer.pointer_tendereeMoney = entity
  991. p_entity -= 1
  992. #删除一个机构有多个角色的数据
  993. #删除重复人、概率不回传
  994. final_roleList = []
  995. list_pop = []
  996. set_tenderer_role = set()
  997. dict_pack_tenderer_money = dict()
  998. for pack in PackDict.keys():
  999. #删除无效包
  1000. if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
  1001. list_pop.append(pack)
  1002. for i in range(len(PackDict[pack]["roleList"])):
  1003. if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
  1004. if PackDict[pack]["roleList"][i].money==0:
  1005. set_tenderer_role.add(PackDict[pack]["roleList"][i])
  1006. dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
  1007. #找到包的中投标金额
  1008. for _index in range(len(PackageList)):
  1009. if "hit" in PackageList[_index]:
  1010. for _hit in list(PackageList[_index]["hit"]):
  1011. _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
  1012. if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
  1013. dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
  1014. #只找到一个中标人和中标金额
  1015. if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
  1016. list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
  1017. #找到一个中标人和多个招标金额
  1018. if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
  1019. _maxMoney = 0
  1020. _sumMoney = 0
  1021. for _m in list(set_tenderer_money):
  1022. _sumMoney += _m
  1023. if _m>_maxMoney:
  1024. _maxMoney = _m
  1025. if _sumMoney/_maxMoney==2:
  1026. list(set_tenderer_role)[0].money = _maxMoney
  1027. else:
  1028. list(set_tenderer_role)[0].money = _maxMoney
  1029. #每个包都只找到一个金额
  1030. _flag_pack_money = True
  1031. for k,v in dict_pack_tenderer_money.items():
  1032. if len(v[1])!=1:
  1033. _flag_pack_money = False
  1034. if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
  1035. for k,v in dict_pack_tenderer_money.items():
  1036. v[0].money = list(v[1])[0]
  1037. for pack in PackDict.keys():
  1038. for i in range(len(PackDict[pack]["roleList"])):
  1039. PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
  1040. for item in list_pop:
  1041. PackDict.pop(item)
  1042. return PackDict
  1043. def initPackageAttr(RoleList,PackageSet):
  1044. '''
  1045. @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
  1046. '''
  1047. packDict = dict()
  1048. packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]}
  1049. for item in list(PackageSet):
  1050. packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]}
  1051. for item in RoleList:
  1052. if packDict[item.packageName]["code"] =="":
  1053. packDict[item.packageName]["code"] = item.packageCode
  1054. packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
  1055. return packDict
  1056. def getPackageRoleMoney(list_sentence,list_entity):
  1057. '''
  1058. @param:
  1059. list_sentence:文章的句子list
  1060. list_entity:文章的实体list
  1061. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  1062. '''
  1063. # print("=1")
  1064. theRole = getRoleList(list_sentence,list_entity)
  1065. if not theRole:
  1066. return []
  1067. RoleList,RoleSet,PackageList,PackageSet = theRole
  1068. '''
  1069. for item in PackageList:
  1070. print(item)
  1071. '''
  1072. # print("=2")
  1073. PackDict = initPackageAttr(RoleList, PackageSet)
  1074. # print("=3")
  1075. PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity)
  1076. # print("=4")
  1077. return PackDict
  1078. def getOtherAttributes(list_entity):
  1079. dict_other = {"bidway":"",
  1080. "moneysource":"",
  1081. "person_review":[],
  1082. "time_release":"",
  1083. "time_bidopen":"",
  1084. "time_bidclose":"",
  1085. "serviceTime":"",
  1086. "product":[]}
  1087. for entity in list_entity:
  1088. if entity.entity_type == 'bidway':
  1089. dict_other["bidway"] = entity.entity_text
  1090. elif entity.entity_type=='moneysource':
  1091. dict_other["moneysource"] = entity.entity_text
  1092. elif entity.entity_type=='serviceTime':
  1093. dict_other["serviceTime"] = entity.entity_text
  1094. elif entity.entity_type == 'time' and entity.label==1:
  1095. dict_other["time_release"] = timeFormat(entity.entity_text)
  1096. elif entity.entity_type == 'time' and entity.label==2:
  1097. dict_other["time_bidopen"] = timeFormat(entity.entity_text)
  1098. elif entity.entity_type == 'time' and entity.label == 3:
  1099. dict_other["time_bidclose"] = timeFormat(entity.entity_text)
  1100. elif entity.entity_type=="person" and entity.label ==4:
  1101. dict_other["person_review"].append(entity.entity_text)
  1102. elif entity.entity_type=='product':
  1103. dict_other["product"].append(entity.entity_text)
  1104. dict_other["product"] = list(set(dict_other["product"]))
  1105. return dict_other
  1106. def getMoneyRange(RoleList):
  1107. pass
  1108. def getPREMs(list_sentences,list_entitys,list_articles):
  1109. '''
  1110. @param:
  1111. list_sentence:所有文章的句子list
  1112. list_entity:所有文章的实体list
  1113. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  1114. '''
  1115. result = []
  1116. for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
  1117. RoleList = getPackageRoleMoney(list_sentence,list_entity)
  1118. result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,"match_enterprise_type":list_article.match_enterprise_type}))
  1119. return result
  1120. if __name__=="__main__":
  1121. '''
  1122. conn = getConnection()
  1123. cursor = conn.cursor()
  1124. #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
  1125. sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
  1126. result = []
  1127. cursor.execute(sql)
  1128. rows = cursor.fetchall()
  1129. count = 0
  1130. for row in rows:
  1131. count += 1
  1132. print(count)
  1133. doc_id = row[0]
  1134. roleList = getPackageRoleMoney(doc_id)
  1135. result.append([doc_id,str(roleList),row[1]])
  1136. ''''''
  1137. with codecs.open("getAttribute.html","w",encoding="utf8") as f:
  1138. f.write('<html><head>\
  1139. <meta http-equiv="Content-Type"\
  1140. content="text/html; charset=UTF-8">\
  1141. </head>\
  1142. <body bgcolor="#FFFFFF">\
  1143. <table border="1">\
  1144. <tr>\
  1145. <td>doc_id</td>\
  1146. <td>角色</td>\
  1147. </tr>')
  1148. for item in result:
  1149. f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
  1150. f.write("</table></body>")
  1151. '''