# getAttributes.py (63 KB)
  1. from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat
  2. from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
  3. import re
  4. import copy
  5. import math
  6. import pandas as pd
  7. import os
  8. def getTheRole(entity,role_list):
  9. '''
  10. @summary:根据实体名称拿到index
  11. @param:
  12. entity:实体名称
  13. role_list:角色list
  14. @return:该实体所在下标
  15. '''
  16. for role_index in range(len(role_list)):
  17. if entity in role_list[role_index]:
  18. return role_index
  19. return None
# Maps a role label (as a string) to the role name; label "5" has no entry here.
dict_role_id = {"0":"tenderee",
                "1":"agency",
                "2":"win_tenderer",
                "3":"second_tenderer",
                "4":"third_tenderer"}
  25. def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
  26. '''
  27. @param:
  28. packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
  29. sentence_index:实体所在的句子
  30. begin_index:实体所在句子的起始位置
  31. @return:公司实体所属的包
  32. @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
  33. '''
  34. '''
  35. if len(packageList)==0:
  36. return None
  37. before_index = None
  38. after_index = None
  39. equal_index = None
  40. equal_count = 0
  41. for pack_index in range(len(packageList)):
  42. if packageList[pack_index][1]>sentence_index and after_index is None:
  43. after_index = pack_index
  44. if packageList[pack_index][1]<sentence_index:
  45. before_index = pack_index
  46. if packageList[pack_index][1]==sentence_index and equal_index is None:
  47. equal_index = pack_index
  48. #当前句子和之前句子未找到包
  49. if before_index is None and equal_index is None:
  50. return None
  51. else:
  52. if after_index is None:
  53. end_index = len(packageList)
  54. else:
  55. end_index = after_index
  56. #只在当前句子找到一个包号
  57. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  58. return packageList[end_index-1][0]
  59. else:
  60. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  61. if packageList[i][2]>int(begin_index):
  62. if packageList[i-1][4]:
  63. return packageList[i-1][0]
  64. else:
  65. if packageList[i][4]:
  66. return packageList[i-1][0]
  67. else:
  68. return packageList[i][0]
  69. return packageList[end_index-1][0]
  70. '''
  71. if len(packageList)==0:
  72. return None,False
  73. list_legalPack = []
  74. for pack_index in range(len(packageList)):
  75. if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
  76. continue
  77. if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
  78. continue
  79. if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index)) and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
  80. if MAX_DIS is not None:
  81. if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
  82. list_legalPack.append(pack_index)
  83. else:
  84. list_legalPack.append(pack_index)
  85. _flag = True
  86. for _index in list_legalPack:
  87. if roleid in packageList[_index]["hit"]:
  88. continue
  89. else:
  90. _flag = False
  91. packageList[_index]["hit"].add(roleid)
  92. return packageList[_index]["pointer"],_flag
  93. if len(list_legalPack)>0:
  94. return packageList[0]["pointer"],_flag
  95. return None,False
  96. #生成合法的组合
  97. def get_legal_comba(list_entity,dict_role_combination):
  98. #拿到一个包中所有合法的组合
  99. def circle_package(_dict_legal_combination):
  100. list_dict_role_first = []
  101. for _role in _dict_legal_combination:
  102. if len(list_dict_role_first)==0:
  103. for _entity in _dict_legal_combination[_role]:
  104. if _entity !="":
  105. list_dict_role_first.append({_role:_entity})
  106. else:
  107. list_dict_role_after = []
  108. _find_count = 0
  109. for _entity in _dict_legal_combination[_role]:
  110. if _entity !="":
  111. for _dict in list_dict_role_first:
  112. _flag = True
  113. for _key1 in _dict:
  114. if _entity==_dict[_key1]:
  115. #修改为招标人和代理人可以为同一个
  116. if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
  117. _flag = True
  118. else:
  119. _flag = False
  120. if _flag:
  121. _find_count += 1
  122. _new_dict = copy.copy(_dict)
  123. _new_dict[_role] = _entity
  124. if len(list_dict_role_after)>100000:
  125. break
  126. list_dict_role_after.append(_new_dict)
  127. if len(list_dict_role_after)==0:
  128. pass
  129. else:
  130. list_dict_role_first.extend(list_dict_role_after)
  131. return list_dict_role_first
  132. def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
  133. last_layer = False
  134. #若是空组合则放回空
  135. if len(_dict_legal_combination.keys())==0:
  136. return []
  137. #递归到最后一层则修改状态
  138. if len(_dict_legal_combination.keys())==1:
  139. last_layer = True
  140. #取一个角色开始进行遍历
  141. _key_role = list(_dict_legal_combination.keys())[0]
  142. for item in _dict_legal_combination[_key_role]:
  143. copy_dict_one_selution = copy.copy(dict_one_selution)
  144. copy_dict_legal_combination = {}
  145. copy_set_legal_entity = copy.copy(set_legal_entity)
  146. #复制余下的所有角色,进行下一轮递归
  147. for _key in _dict_legal_combination.keys():
  148. if _key!=_key_role:
  149. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  150. #修改为招标人和代理人可以为同一个
  151. if item !="":
  152. _flag = True
  153. if str(_key_role) in ["0","1"]:
  154. for _key_flag in copy_dict_one_selution:
  155. if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
  156. _flag = False
  157. else:
  158. for _key_flag in copy_dict_one_selution:
  159. if copy_dict_one_selution[_key_flag]==item:
  160. _flag = False
  161. if _flag:
  162. copy_dict_one_selution[_key_role] = item
  163. '''
  164. if item not in copy_set_legal_entity:
  165. if item !="":
  166. copy_dict_one_selution[_key_role] = item
  167. '''
  168. copy_set_legal_entity.add(item)
  169. if last_layer:
  170. list_all_selution.append(copy_dict_one_selution)
  171. else:
  172. recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
  173. #递归匹配各个包的结果
  174. def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
  175. last_layer = False
  176. if len(_dict_legal_combination.keys())==0:
  177. return []
  178. if len(_dict_legal_combination.keys())==1:
  179. last_layer = True
  180. _key_pack = list(_dict_legal_combination.keys())[0]
  181. for item in _dict_legal_combination[_key_pack]:
  182. copy_dict_one_selution = copy.copy(dict_one_selution)
  183. copy_dict_legal_combination = {}
  184. for _key in _dict_legal_combination.keys():
  185. if _key!=_key_pack:
  186. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  187. for _key_role in item.keys():
  188. copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
  189. if last_layer:
  190. list_all_selution.append(copy_dict_one_selution)
  191. else:
  192. recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
  193. return list_all_selution
  194. #循环获取所有包组合
  195. def circle_pageages(_dict_legal_combination):
  196. list_all_selution = []
  197. for _key_pack in _dict_legal_combination.keys():
  198. list_key_selution = []
  199. for item in _dict_legal_combination[_key_pack]:
  200. _dict = dict()
  201. for _key_role in item.keys():
  202. _dict[_key_pack+"$$"+_key_role] = item[_key_role]
  203. list_key_selution.append(_dict)
  204. if len(list_all_selution)==0:
  205. list_all_selution = list_key_selution
  206. else:
  207. _list_all_selution = []
  208. for item_1 in list_all_selution:
  209. for item_2 in list_key_selution:
  210. _list_all_selution.append(dict(item_1,**item_2))
  211. list_all_selution = _list_all_selution
  212. return list_all_selution
  213. #拿到各个包解析之后的结果
  214. _dict_legal_combination = {}
  215. for packageName in dict_role_combination.keys():
  216. _list_all_selution = []
  217. # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
  218. _list_all_selution = circle_package(dict_role_combination[packageName])
  219. '''
  220. print("===1")
  221. print(packageName)
  222. for item in _list_all_selution:
  223. print(item)
  224. print("===2")
  225. '''
  226. #去除包含子集
  227. list_all_selution_simple = []
  228. _list_set_all_selution = []
  229. for item_selution in _list_all_selution:
  230. item_set_selution = set()
  231. for _key in item_selution.keys():
  232. item_set_selution.add((_key,item_selution[_key]))
  233. _list_set_all_selution.append(item_set_selution)
  234. if len(_list_set_all_selution)>1000:
  235. _dict_legal_combination[packageName] = _list_all_selution
  236. continue
  237. for i in range(len(_list_set_all_selution)):
  238. be_included = False
  239. for j in range(len(_list_set_all_selution)):
  240. if i!=j:
  241. if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
  242. be_included = True
  243. if not be_included:
  244. list_all_selution_simple.append(_list_all_selution[i])
  245. _dict_legal_combination[packageName] = list_all_selution_simple
  246. _list_final_comba = []
  247. #对各个包的结果进行排列组合
  248. _comba_count = 1
  249. for _key in _dict_legal_combination.keys():
  250. _comba_count *= len(_dict_legal_combination[_key])
  251. #如果过大,则每个包只取概率最大的那个
  252. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  253. if _comba_count>250:
  254. new_dict_legal_combination = dict()
  255. for _key_pack in _dict_legal_combination.keys():
  256. MAX_PROB = -1000
  257. _MAX_PROB_COMBA = None
  258. for item in _dict_legal_combination[_key_pack]:
  259. # print(_key_pack,item)
  260. _dict = dict()
  261. for _key in item.keys():
  262. _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
  263. _prob = getSumExpectation(dict_pack_entity_prob, _dict)
  264. if _prob>MAX_PROB:
  265. MAX_PROB = _prob
  266. _MAX_PROB_COMBA = [item]
  267. new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
  268. _dict_legal_combination = new_dict_legal_combination
  269. #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
  270. _list_final_comba = circle_pageages(_dict_legal_combination)
  271. #除了Project包(招标人和代理人),其他包是不会有冲突的
  272. #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪
  273. _list_real_comba = []
  274. for dict_item in _list_final_comba:
  275. set_project = set()
  276. set_other = set()
  277. for _key in list(dict_item.keys()):
  278. if _key.split("$$")[0]=="Project":
  279. set_project.add(dict_item[_key])
  280. else:
  281. set_other.add(dict_item[_key])
  282. set_common = set_project&set_other
  283. if len(set_common)>0:
  284. dict_project = {}
  285. dict_not_project = {}
  286. for _key in list(dict_item.keys()):
  287. if dict_item[_key] in set_common:
  288. if str(_key.split("$$")[0])=="Project":
  289. dict_project[_key] = dict_item[_key]
  290. else:
  291. dict_not_project[_key] = dict_item[_key]
  292. else:
  293. dict_project[_key] = dict_item[_key]
  294. dict_not_project[_key] = dict_item[_key]
  295. _list_real_comba.append(dict_project)
  296. _list_real_comba.append(dict_not_project)
  297. else:
  298. _list_real_comba.append(dict_item)
  299. return _list_real_comba
  300. def get_dict_entity_prob(list_entity,on_value=0.5):
  301. dict_pack_entity_prob = {}
  302. for entity in list_entity:
  303. if entity.entity_type in ['org','company']:
  304. values = entity.values
  305. role_prob = float(values[int(entity.label)])
  306. _key = entity.packageName+"$$"+str(entity.label)
  307. if role_prob>=on_value and str(entity.label)!="5":
  308. _key_prob = _key+"$text$"+entity.entity_text
  309. if _key_prob in dict_pack_entity_prob:
  310. if role_prob>dict_pack_entity_prob[_key_prob][1]:
  311. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  312. else:
  313. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  314. return dict_pack_entity_prob
  315. #计算合计期望
  316. def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
  317. '''
  318. expect = 0
  319. for entity in list_entity:
  320. if entity.entity_type in ['org','company']:
  321. values = entity.values
  322. role_prob = float(values[int(entity.label)])
  323. _key = entity.packageName+"$$"+str(entity.label)
  324. if role_prob>on_value and str(entity.label)!="5":
  325. if _key in combination.keys() and combination[_key]==entity.entity_text:
  326. expect += math.pow(role_prob,4)
  327. else:
  328. expect -= math.pow(role_prob,4)
  329. '''
  330. #修改为同一个实体只取对应包-角色的最大的概率值
  331. expect = 0
  332. dict_entity_prob = {}
  333. for _key_pack_entity in dict_pack_entity_prob:
  334. _key_pack = _key_pack_entity.split("$text$")[0]
  335. role_prob = dict_pack_entity_prob[_key_pack_entity][1]
  336. if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
  337. if _key_pack_entity in dict_entity_prob.keys():
  338. if dict_entity_prob[_key_pack_entity]<role_prob:
  339. dict_entity_prob[_key_pack_entity] = role_prob
  340. else:
  341. dict_entity_prob[_key_pack_entity] = role_prob
  342. else:
  343. if _key_pack_entity in dict_entity_prob.keys():
  344. if dict_entity_prob[_key_pack_entity]>-role_prob:
  345. dict_entity_prob[_key_pack_entity] = -role_prob
  346. else:
  347. dict_entity_prob[_key_pack_entity] = -role_prob
  348. # for entity in list_entity:
  349. # if entity.entity_type in ['org','company']:
  350. # values = entity.values
  351. # role_prob = float(values[int(entity.label)])
  352. # _key = entity.packageName+"$$"+str(entity.label)
  353. # if role_prob>=on_value and str(entity.label)!="5":
  354. # if _key in combination.keys() and combination[_key]==entity.entity_text:
  355. # _key_prob = _key+entity.entity_text
  356. # if _key_prob in dict_entity_prob.keys():
  357. # if dict_entity_prob[_key_prob]<role_prob:
  358. # dict_entity_prob[_key_prob] = role_prob
  359. # else:
  360. # dict_entity_prob[_key_prob] = role_prob
  361. # else:
  362. # _key_prob = _key+entity.entity_text
  363. # if _key_prob in dict_entity_prob.keys():
  364. # if dict_entity_prob[_key_prob]>-role_prob:
  365. # dict_entity_prob[_key_prob] = -role_prob
  366. # else:
  367. # dict_entity_prob[_key_prob] = -role_prob
  368. for _key in dict_entity_prob.keys():
  369. symbol = 1 if dict_entity_prob[_key]>0 else -1
  370. expect += symbol*math.pow(dict_entity_prob[_key],2)
  371. return expect
  372. def getRoleList(list_sentence,list_entity,on_value = 0.5):
  373. '''
  374. @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回
  375. @param:
  376. list_sentence:文章所有的sentence
  377. list_entity:文章所有的实体
  378. on_value:概率阈值
  379. @return:文章的角色list
  380. '''
  381. pack = getPackagesFromArticle(list_sentence,list_entity)
  382. if pack is None:
  383. return None
  384. PackageList,PackageSet,dict_PackageCode = pack
  385. #拿到所有可能的情况
  386. dict_role_combination = {}
  387. #拿到各个实体的packageName,packageCode
  388. for entity in list_entity:
  389. if entity.entity_type in ['org','company']:
  390. values = entity.values
  391. role_prob = float(values[int(entity.label)])
  392. if role_prob>=on_value and str(entity.label)!="5":
  393. if str(entity.label) in ["0","1"]:
  394. packageName = "Project"
  395. else:
  396. if len(PackageSet)>0:
  397. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.end_index,"role-"+str(entity.label))
  398. if packagePointer is None:
  399. #continue
  400. packageName = "Project"
  401. else:
  402. #add pointer_pack
  403. entity.pointer_pack = packagePointer
  404. packageName = packagePointer.entity_text
  405. else:
  406. packageName = "Project"
  407. find_flag = False
  408. if packageName in dict_PackageCode.keys():
  409. packageCode = dict_PackageCode[packageName]
  410. else:
  411. packageCode = ""
  412. entity.packageCode = packageCode
  413. role_name = dict_role_id.get(str(entity.label))
  414. entity.roleName = role_name
  415. entity.packageName = packageName
  416. if entity.packageName in dict_role_combination.keys():
  417. if str(entity.label) in dict_role_combination[entity.packageName].keys():
  418. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  419. else:
  420. dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
  421. else:
  422. dict_role_combination[entity.packageName] = {}
  423. #初始化空值
  424. roleIds = [0,1,2,3,4]
  425. for _roleId in roleIds:
  426. dict_role_combination[entity.packageName][str(_roleId)] = set([""])
  427. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  428. list_real_comba = get_legal_comba(list_entity,dict_role_combination)
  429. #拿到最大期望值的组合
  430. max_index = 0
  431. max_expect = -100
  432. _index = 0
  433. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  434. for item_combination in list_real_comba:
  435. expect = getSumExpectation(dict_pack_entity_prob, item_combination)
  436. if expect>max_expect:
  437. max_index = _index
  438. max_expect = expect
  439. _index += 1
  440. RoleList = []
  441. RoleSet = set()
  442. if len(list_real_comba)>0:
  443. for _key in list_real_comba[max_index].keys():
  444. packageName = _key.split("$$")[0]
  445. label = _key.split("$$")[1]
  446. role_name = dict_role_id.get(str(label))
  447. entity_text = list_real_comba[max_index][_key]
  448. if packageName in dict_PackageCode.keys():
  449. packagecode = dict_PackageCode.get(packageName)
  450. else:
  451. packagecode = ""
  452. RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
  453. RoleSet.add(entity_text)
  454. #根据最优树来修正list_entity中角色对包的连接
  455. for _entity in list_entity:
  456. if _entity.pointer_pack is not None:
  457. _pack_name = _entity.pointer_pack.entity_text
  458. _find_flag = False
  459. for _prem in RoleList:
  460. if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
  461. _find_flag = True
  462. if not _find_flag:
  463. _entity.pointer_pack = None
  464. return RoleList,RoleSet,PackageList,PackageSet
  465. def getPackageScopePattern():
  466. '''
  467. @summary: 获取包的作用域关键词
  468. '''
  469. df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
  470. pattern = "("
  471. for item in df["list_word"]:
  472. item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
  473. pattern += item+"|"
  474. pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
  475. return pattern
# Built once at import time; this reads end.xls, so importing the module requires that file.
pattern_packageScope = getPackageScopePattern()
  477. def getPackagesFromArticle(list_sentence,list_entity):
  478. '''
  479. @param:
  480. list_sentence:文章的句子list
  481. @summary: 将包的信息插入list_entity中
  482. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  483. '''
  484. if len(list_sentence)==0:
  485. return None
  486. PackageList = []
  487. PackageList_scope = []
  488. PackageSet = set()
  489. dict_packageCode = dict()
  490. package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
  491. package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
  492. package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
  493. # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
  494. other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{,50}?)(,|。)') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
  495. win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{,25})(,|。)') # 2020/11/23 大网站规则 调整
  496. model_pattern = re.compile('(型号|序号)[::]([^,。]{,20})(,|。)') # 2020/11/23 大网站规则 调整
  497. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
  498. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
  499. def changeIndexFromWordToWords(tokens,word_index):
  500. '''
  501. @summary:转换某个字的字偏移为词偏移
  502. '''
  503. before_index = 0
  504. after_index = 0
  505. for i in range(len(tokens)):
  506. after_index = after_index+len(tokens[i])
  507. if before_index<=word_index and after_index>=word_index:
  508. return i
  509. before_index = after_index
  510. package_names = []
  511. def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
  512. '''
  513. @summary:抽取包附近的标段号
  514. @param:
  515. tokens:包所在句子的分词
  516. word_index:包所在字偏移
  517. size:左右各取多少个词
  518. pattern:提取标段号的正则
  519. @return: type:string,meaning:标段号
  520. '''
  521. index = changeIndexFromWordToWords(tokens,word_index)
  522. if index<size:
  523. begin = index
  524. else:
  525. begin = index-size
  526. if index+size>len(tokens):
  527. end = len(tokens)
  528. else:
  529. end = index+size
  530. #拿到左右两边的词语组成短语
  531. text = "".join(tokens[begin:end])
  532. #在短语中的字偏移
  533. new_word_index = word_index-len("".join(tokens[:begin]))
  534. min_distance = len(text)
  535. packageCode = None
  536. for the_iter in re.finditer(pattern,text):
  537. #算出最小距离
  538. distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
  539. if distance<min_distance:
  540. min_distance = distance
  541. packageCode = the_iter.group(1)
  542. return packageCode
  543. #从标段介绍表格中提取包名和包号
  544. for i in range(len(list_sentence)):
  545. content = list_sentence[i].sentence_text
  546. names = re.findall(package_name_pattern,content)
  547. if names == []:
  548. names = re.findall(other_package_pattern, content)
  549. N_names = re.findall(package_N_name_pattern,content)
  550. if len(names)==1 and len(N_names)==1:
  551. package_names.append([names[0][-1],N_names[0][-1]])
  552. for i in range(len(list_sentence)):
  553. PackageList_item = []
  554. PackageList_item_scope = []
  555. content = list_sentence[i].sentence_text
  556. tokens = list_sentence[i].tokens
  557. for name in package_names[:20]:
  558. for index in findAllIndex(name[0],content):
  559. temp_package_number = re.findall(number_pattern,name[1])[0]
  560. PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
  561. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
  562. code = extractPackageCode(tokens, index)
  563. if code is not None:
  564. dict_packageCode[temp_package_number] = code
  565. PackageSet.add(temp_package_number)
  566. for iter in re.finditer(package_number_pattern,content):
  567. temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
  568. PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  569. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  570. code = extractPackageCode(tokens, iter.span()[0])
  571. if code is not None:
  572. dict_packageCode[temp_package_number] = code
  573. PackageSet.add(temp_package_number)
  574. #识别packageScope
  575. for iter in re.finditer(pattern_packageScope,content):
  576. PackageList_item_scope.append({"name":"","sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  577. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  578. PackageList_item_scope = PackageList_item +PackageList_item_scope
  579. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  580. PackageList_scope = PackageList_scope+PackageList_item_scope
  581. PackageList_item.sort(key=lambda x:x["sentence_index"])
  582. #PackageList = PackageList+PackageList_item
  583. #不作为包
  584. # if len(PackageSet)==0:
  585. # for i in range(len(list_sentence)):
  586. # PackageList_item = []
  587. # PackageList_item_scope = []
  588. # content = list_sentence[i].sentence_text
  589. # tokens = list_sentence[i].tokens
  590. # for iter in re.finditer(other_package_pattern,content):
  591. # temp_package_number = iter.group(2)
  592. # PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  593. # # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  594. # code = extractPackageCode(tokens, iter.span()[0])
  595. # if code is not None:
  596. # dict_packageCode[temp_package_number] = code
  597. # PackageSet.add(temp_package_number)
  598. # #识别packageScope
  599. # for iter in re.finditer(pattern_packageScope,content):
  600. # PackageList_item_scope.append({"name":"","sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  601. # # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  602. # PackageList_item_scope = PackageList_item +PackageList_item_scope
  603. # PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  604. # PackageList_scope = PackageList_scope+PackageList_item_scope
  605. # PackageList_item.sort(key=lambda x:x["sentence_index"])
  606. # 2020/11/23 大网站规则 调整
  607. if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
  608. for i in range(len(list_sentence)):
  609. PackageList_item = []
  610. PackageList_item_scope = []
  611. content = list_sentence[i].sentence_text
  612. tokens = list_sentence[i].tokens
  613. names = re.findall(other_package_pattern, content)
  614. N_names = re.findall(win_tenderer_pattern, content)
  615. if len(names) != 1 or len(N_names) != 1:
  616. continue
  617. for iter in re.finditer(other_package_pattern,content):
  618. temp_package_number = iter.group(4)
  619. xinghao = re.search(model_pattern, content)
  620. if xinghao:
  621. temp_package_number = temp_package_number + '+' + xinghao.group(2)
  622. # print('新正则采购包名补充',temp_package_number)
  623. PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  624. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  625. code = extractPackageCode(tokens, iter.span()[0])
  626. if code is not None:
  627. dict_packageCode[temp_package_number] = code
  628. PackageSet.add(temp_package_number)
  629. #识别packageScope
  630. for iter in re.finditer(pattern_packageScope,content):
  631. PackageList_item_scope.append({"name":"","sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  632. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  633. PackageList_item_scope = PackageList_item +PackageList_item_scope
  634. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  635. PackageList_scope = PackageList_scope+PackageList_item_scope
  636. PackageList_item.sort(key=lambda x:x["sentence_index"])
  637. pattern_punctuation = "[::()\(\),,。;;]"
  638. for i in range(len(list_sentence)):
  639. for j in range(len(PackageList_scope)):
  640. if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
  641. _flag = False
  642. left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
  643. right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
  644. _left_find = re.findall(pattern_punctuation,left_str)
  645. _right_find = re.findall(pattern_punctuation,right_str)
  646. #print(left_str)
  647. if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
  648. continue
  649. if re.search("划分",right_str[:10]) is not None:
  650. continue
  651. if len(_left_find)>0 and _left_find[-1] in [":",":"]:
  652. _flag = True
  653. if len(_right_find)>0 and _right_find[0] in [":",":"]:
  654. _flag = True
  655. if _flag:
  656. scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
  657. else:
  658. if j==0:
  659. scope_begin = [0,0]
  660. else:
  661. scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
  662. if j==len(PackageList_scope)-1:
  663. scope_end = [PackageList_scope[j]["offsetWords_begin"],changeIndexFromWordToWords(list_sentence[i].tokens, len(list_sentence[i].sentence_text))]
  664. else:
  665. scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
  666. if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
  667. continue
  668. #add package to entity
  669. _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"])
  670. list_entity.append(_pack_entity)
  671. copy_pack = copy.copy(PackageList_scope[j])
  672. copy_pack["scope"] = [scope_begin,scope_end]
  673. copy_pack["hit"] = set()
  674. copy_pack["pointer"] = _pack_entity
  675. PackageList.append(copy_pack)
  676. return PackageList,PackageSet,dict_packageCode
  677. def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4):
  678. '''
  679. @param:
  680. PackDict:文章包dict
  681. roleSet:文章所有角色的公司名称
  682. PackageList:文章的包信息
  683. PackageSet:文章所有包的名称
  684. list_entity:文章所有经过模型处理的实体
  685. on_value:金额模型的阈值
  686. on_value_person:联系人模型的阈值
  687. sentence_len:公司和属性间隔句子的最大长度
  688. @return:添加了属性信息的角色list
  689. '''
  690. #根据roleid添加金额到rolelist中
  691. def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
  692. for i in range(len(packDict[packageName]["roleList"])):
  693. if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
  694. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  695. packDict[packageName]["roleList"][i].money = money
  696. packDict[packageName]["roleList"][i].money_prob = money_prob
  697. return packDict
  698. #根据实体名称添加金额到rolelist中
  699. def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
  700. for i in range(len(packDict[packageName]["roleList"])):
  701. if packDict[packageName]["roleList"][i].entity_text==entity:
  702. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  703. packDict[packageName]["roleList"][i].money = money
  704. packDict[packageName]["roleList"][i].money_prob = money_prob
  705. return packDict
  706. #根据实体名称得到角色
  707. def getRoleWithText(packDict,entity_text):
  708. for pack in packDict.keys():
  709. for i in range(len(packDict[pack]["roleList"])):
  710. if packDict[pack]["roleList"][i].entity_text==entity_text:
  711. return packDict[pack]["roleList"][i].role_name
  712. def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
  713. _list_entitys = [entity]+entity.linked_entitys
  714. for _entity in _list_entitys:
  715. if _entity.entity_text in RoleSet:
  716. return True
  717. p_entity = 0
  718. #遍历所有实体
  719. while(p_entity<len(list_entity)):
  720. entity = list_entity[p_entity]
  721. '''
  722. #招标金额从后往前找
  723. if entity.entity_type=="money":
  724. if entity.values[entity.label]>=on_value:
  725. if str(entity.label)=="0":
  726. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  727. if packagePointer is None:
  728. packageName = "Project"
  729. else:
  730. packageName = packagePointer.entity_text
  731. addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
  732. '''
  733. ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
  734. if entity.entity_type=="person":
  735. if entity.values[entity.label]>=on_value_person:
  736. if str(entity.label)=="1":
  737. for i in range(len(PackDict["Project"]["roleList"])):
  738. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  739. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  740. # add pointer_person
  741. for _entity in list_entity:
  742. if dict_role_id.get(str(_entity.label))=="tenderee":
  743. for i in range(len(PackDict["Project"]["roleList"])):
  744. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  745. _entity.pointer_person = entity
  746. elif str(entity.label)=="2":
  747. for i in range(len(PackDict["Project"]["roleList"])):
  748. if PackDict["Project"]["roleList"][i].role_name=="agency":
  749. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  750. # add pointer_person
  751. for _entity in list_entity:
  752. if dict_role_id.get(str(_entity.label))=="agency":
  753. for i in range(len(PackDict["Project"]["roleList"])):
  754. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  755. _entity.pointer_person = entity
  756. '''
  757. #金额往前找实体
  758. if entity.entity_type=="money":
  759. if entity.values[entity.label]>=on_value:
  760. p_entity_money= p_entity
  761. entity_money = list_entity[p_entity_money]
  762. if len(PackageSet)>0:
  763. packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
  764. if packagePointer is None:
  765. packageName_entity = "Project"
  766. else:
  767. packageName_entity = packagePointer.entity_text
  768. else:
  769. packageName_entity = "Project"
  770. while(p_entity_money>0):
  771. entity_before = list_entity[p_entity_money]
  772. if entity_before.entity_type in ['org','company']:
  773. if str(entity_before.label)=="1":
  774. addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
  775. #add pointer_money
  776. entity_before.pointer_money = entity_money
  777. break
  778. p_entity_money -= 1
  779. #如果实体属于角色集合,则往后找属性
  780. if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
  781. p_entity += 1
  782. #循环查找符合的属性
  783. while(p_entity<len(list_entity)):
  784. entity_after = list_entity[p_entity]
  785. if entity_after.sentence_index-entity.sentence_index>=sentence_len:
  786. p_entity -= 1
  787. break
  788. #若是遇到公司实体,则跳出循环
  789. if entity_after.entity_type in ['org','company']:
  790. p_entity -= 1
  791. break
  792. if entity_after.values is not None:
  793. if entity_after.entity_type=="money":
  794. if entity_after.values[entity_after.label]>=on_value:
  795. '''
  796. #招标金额从后往前找
  797. if str(entity_after.label)=="0":
  798. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  799. if packagePointer is None:
  800. packageName = "Project"
  801. else:
  802. packageName = packagePointer.entity_text
  803. addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
  804. '''
  805. if str(entity_after.label)=="1":
  806. #print(entity_after.entity_text,entity.entity_text)
  807. _list_entitys = [entity]+entity.linked_entitys
  808. if len(PackageSet)>0:
  809. packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
  810. if packagePointer is None:
  811. packageName_entity = "Project"
  812. else:
  813. packageName_entity = packagePointer.entity_text
  814. else:
  815. packageName_entity = "Project"
  816. if str(entity.label) in ["2","3","4"]:
  817. addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
  818. #add pointer_money
  819. entity.pointer_money = entity_after
  820. '''
  821. if entity_after.entity_type=="person":
  822. if entity_after.values[entity_after.label]>=on_value_person:
  823. if str(entity_after.label)=="1":
  824. for i in range(len(roleList)):
  825. if roleList[i].role_name=="tenderee":
  826. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  827. elif str(entity_after.label)=="2":
  828. for i in range(len(roleList)):
  829. if roleList[i].role_name=="agency":
  830. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  831. elif str(entity_after.label)=="3":
  832. _list_entitys = [entity]+entity.linked_entitys
  833. for _entity in _list_entitys:
  834. for i in range(len(roleList)):
  835. if roleList[i].entity_text==_entity.entity_text:
  836. if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
  837. break
  838. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  839. '''
  840. p_entity += 1
  841. p_entity += 1
  842. ''''''
  843. # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
  844. temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
  845. other_person = [] # 阈值以上的联系人列表
  846. link_person = [] # 有电话没联系上角色的person列表
  847. other_ent = []
  848. link_ent = []
  849. found_person = False
  850. ent_list = []
  851. for entity in list_entity:
  852. if entity.entity_type in ['org','company','person']:
  853. ent_list.append(entity)
  854. #for list_index in range(len(ent_list)):
  855. #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
  856. #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
  857. #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
  858. # 2020/11/25增加确定角色联系人判断
  859. sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
  860. for index in range(len(ent_list)):
  861. entity = ent_list[index]
  862. if entity.entity_type=="person":
  863. if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
  864. continue
  865. if entity.values[entity.label]>on_value_person:
  866. if str(entity.label)=="1":
  867. for i in range(len(PackDict["Project"]["roleList"])):
  868. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  869. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  870. link_person.append(entity.entity_text)
  871. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  872. # add pointer_person
  873. for _entity in list_entity:
  874. if dict_role_id.get(str(_entity.label))=="tenderee":
  875. for i in range(len(PackDict["Project"]["roleList"])):
  876. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  877. _entity.pointer_person = entity
  878. elif str(entity.label)=="2":
  879. for i in range(len(PackDict["Project"]["roleList"])):
  880. if PackDict["Project"]["roleList"][i].role_name=="agency":
  881. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  882. link_person.append(entity.entity_text)
  883. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  884. # add pointer_person
  885. for _entity in list_entity:
  886. if dict_role_id.get(str(_entity.label))=="agency":
  887. for i in range(len(PackDict["Project"]["roleList"])):
  888. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  889. _entity.pointer_person = entity
  890. elif str(entity.label)=="3":
  891. if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
  892. continue
  893. #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
  894. other_person.append(entity.entity_text)
  895. temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
  896. #if entity.entity_text in roleSet:
  897. if entity.entity_text in roleSet:
  898. if entity.label in [0,1]:
  899. other_ent.append(entity.entity_text)
  900. temp_ent_list.append((entity.entity_text, entity.label,entity))
  901. for behind_index in range(index+1, len(ent_list)):
  902. entity_after = ent_list[behind_index]
  903. if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
  904. break
  905. if entity_after.values is not None:
  906. if entity_after.entity_type=="person":
  907. if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
  908. break
  909. if entity_after.values[entity_after.label]>on_value_person:
  910. if str(entity_after.label)=="1":
  911. for i in range(len(PackDict["Project"]["roleList"])):
  912. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  913. PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  914. link_person.append(entity_after.entity_text)
  915. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  916. elif str(entity_after.label)=="2":
  917. for i in range(len(PackDict["Project"]["roleList"])):
  918. if PackDict["Project"]["roleList"][i].role_name=="agency":
  919. PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  920. link_person.append(entity_after.entity_text)
  921. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  922. elif str(entity_after.label)=="3":
  923. if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
  924. break
  925. elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
  926. break
  927. for pack in PackDict.keys():
  928. for i in range(len(PackDict[pack]["roleList"])):
  929. if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
  930. #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
  931. #break
  932. PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  933. link_person.append(entity_after.entity_text)
  934. #add pointer_person
  935. entity.pointer_person = entity_after
  936. not_link_person = [person for person in other_person if person not in link_person]
  937. not_link_ent = [ent for ent in other_ent if ent not in link_ent]
  938. if len(not_link_person) > 0 and len(not_link_ent) > 0 :
  939. item = temp_ent_list
  940. for i in range(len(item)):
  941. if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
  942. if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
  943. item[i+1], item[i+2] = item[i+2], item[i+1]
  944. for i in range(len(item)-1, -1, -1):
  945. if item[i][0] in not_link_ent:
  946. for pack in PackDict.keys():
  947. for role in PackDict[pack]["roleList"]:
  948. if role.entity_text == item[i][0] and len(role.linklist) < 1:
  949. for j in range(i+1, len(item)):
  950. if item[j][0] in not_link_person:
  951. role.linklist.append(item[j][:2])
  952. #add pointer_person
  953. item[i][2].pointer_person = item[j][2]
  954. break
  955. else:
  956. break
  957. #寻找多标段招标金额
  958. p_entity = len(list_entity)-1
  959. set_tenderer_money = set()
  960. #遍历所有实体
  961. while(p_entity>=0):
  962. entity = list_entity[p_entity]
  963. if entity.entity_type=="money":
  964. if entity.values[entity.label]>=on_value:
  965. if str(entity.label)=="1":
  966. set_tenderer_money.add(float(entity.entity_text))
  967. if str(entity.label)=="0":
  968. '''
  969. if p_entity>0:
  970. p_before = list_entity[p_entity-1]
  971. if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
  972. p_entity -= 1
  973. continue
  974. '''
  975. packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
  976. if packagePointer is None:
  977. packageName = "Project"
  978. else:
  979. packageName = packagePointer.entity_text
  980. if packageName=="Project":
  981. if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  982. PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
  983. else:
  984. PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
  985. #add pointer_tendereeMoney
  986. packagePointer.pointer_tendereeMoney = entity
  987. p_entity -= 1
  988. #删除一个机构有多个角色的数据
  989. #删除重复人、概率不回传
  990. final_roleList = []
  991. list_pop = []
  992. set_tenderer_role = set()
  993. dict_pack_tenderer_money = dict()
  994. for pack in PackDict.keys():
  995. #删除无效包
  996. if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
  997. list_pop.append(pack)
  998. for i in range(len(PackDict[pack]["roleList"])):
  999. if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
  1000. if PackDict[pack]["roleList"][i].money==0:
  1001. set_tenderer_role.add(PackDict[pack]["roleList"][i])
  1002. dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
  1003. #找到包的中投标金额
  1004. for _index in range(len(PackageList)):
  1005. if "hit" in PackageList[_index]:
  1006. for _hit in list(PackageList[_index]["hit"]):
  1007. _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
  1008. if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
  1009. dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
  1010. #只找到一个中标人和中标金额
  1011. if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
  1012. list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
  1013. #找到一个中标人和多个招标金额
  1014. if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
  1015. _maxMoney = 0
  1016. _sumMoney = 0
  1017. for _m in list(set_tenderer_money):
  1018. _sumMoney += _m
  1019. if _m>_maxMoney:
  1020. _maxMoney = _m
  1021. if _sumMoney/_maxMoney==2:
  1022. list(set_tenderer_role)[0].money = _maxMoney
  1023. else:
  1024. list(set_tenderer_role)[0].money = _maxMoney
  1025. #每个包都只找到一个金额
  1026. _flag_pack_money = True
  1027. for k,v in dict_pack_tenderer_money.items():
  1028. if len(v[1])!=1:
  1029. _flag_pack_money = False
  1030. if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
  1031. for k,v in dict_pack_tenderer_money.items():
  1032. v[0].money = list(v[1])[0]
  1033. for pack in PackDict.keys():
  1034. for i in range(len(PackDict[pack]["roleList"])):
  1035. PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
  1036. for item in list_pop:
  1037. PackDict.pop(item)
  1038. return PackDict
  1039. def initPackageAttr(RoleList,PackageSet):
  1040. '''
  1041. @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
  1042. '''
  1043. packDict = dict()
  1044. packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]}
  1045. for item in list(PackageSet):
  1046. packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]}
  1047. for item in RoleList:
  1048. if packDict[item.packageName]["code"] =="":
  1049. packDict[item.packageName]["code"] = item.packageCode
  1050. packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
  1051. return packDict
  1052. def getPackageRoleMoney(list_sentence,list_entity):
  1053. '''
  1054. @param:
  1055. list_sentence:文章的句子list
  1056. list_entity:文章的实体list
  1057. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  1058. '''
  1059. # print("=1")
  1060. theRole = getRoleList(list_sentence,list_entity)
  1061. if not theRole:
  1062. return []
  1063. RoleList,RoleSet,PackageList,PackageSet = theRole
  1064. '''
  1065. for item in PackageList:
  1066. print(item)
  1067. '''
  1068. # print("=2")
  1069. PackDict = initPackageAttr(RoleList, PackageSet)
  1070. # print("=3")
  1071. PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity)
  1072. # print("=4")
  1073. return PackDict
  1074. def getOtherAttributes(list_entity):
  1075. dict_other = {"bidway":"",
  1076. "moneysource":"",
  1077. "person_review":[],
  1078. "time_release":"",
  1079. "time_bidopen":"",
  1080. "time_bidclose":"",
  1081. "serviceTime":"",
  1082. "product":[]}
  1083. for entity in list_entity:
  1084. if entity.entity_type == 'bidway':
  1085. dict_other["bidway"] = entity.entity_text
  1086. elif entity.entity_type=='moneysource':
  1087. dict_other["moneysource"] = entity.entity_text
  1088. elif entity.entity_type=='serviceTime':
  1089. dict_other["serviceTime"] = entity.entity_text
  1090. elif entity.entity_type == 'time' and entity.label==1:
  1091. dict_other["time_release"] = timeFormat(entity.entity_text)
  1092. elif entity.entity_type == 'time' and entity.label==2:
  1093. dict_other["time_bidopen"] = timeFormat(entity.entity_text)
  1094. elif entity.entity_type == 'time' and entity.label == 3:
  1095. dict_other["time_bidclose"] = timeFormat(entity.entity_text)
  1096. elif entity.entity_type=="person" and entity.label ==4:
  1097. dict_other["person_review"].append(entity.entity_text)
  1098. elif entity.entity_type=='product':
  1099. dict_other["product"].append(entity.entity_text)
  1100. dict_other["product"] = list(set(dict_other["product"]))
  1101. return dict_other
  1102. def getPREMs(list_sentences,list_entitys,list_articles):
  1103. '''
  1104. @param:
  1105. list_sentence:所有文章的句子list
  1106. list_entity:所有文章的实体list
  1107. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  1108. '''
  1109. result = []
  1110. for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
  1111. RoleList = getPackageRoleMoney(list_sentence,list_entity)
  1112. result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity)))
  1113. return result
  1114. if __name__=="__main__":
  1115. '''
  1116. conn = getConnection()
  1117. cursor = conn.cursor()
  1118. #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
  1119. sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
  1120. result = []
  1121. cursor.execute(sql)
  1122. rows = cursor.fetchall()
  1123. count = 0
  1124. for row in rows:
  1125. count += 1
  1126. print(count)
  1127. doc_id = row[0]
  1128. roleList = getPackageRoleMoney(doc_id)
  1129. result.append([doc_id,str(roleList),row[1]])
  1130. ''''''
  1131. with codecs.open("getAttribute.html","w",encoding="utf8") as f:
  1132. f.write('<html><head>\
  1133. <meta http-equiv="Content-Type"\
  1134. content="text/html; charset=UTF-8">\
  1135. </head>\
  1136. <body bgcolor="#FFFFFF">\
  1137. <table border="1">\
  1138. <tr>\
  1139. <td>doc_id</td>\
  1140. <td>角色</td>\
  1141. </tr>')
  1142. for item in result:
  1143. f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
  1144. f.write("</table></body>")
  1145. '''