getAttributes.py 63 KB


  1. from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat
  2. from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
  3. import re
  4. import copy
  5. import math
  6. import pandas as pd
  7. import os
  8. def getTheRole(entity,role_list):
  9. '''
  10. @summary:根据实体名称拿到index
  11. @param:
  12. entity:实体名称
  13. role_list:角色list
  14. @return:该实体所在下标
  15. '''
  16. for role_index in range(len(role_list)):
  17. if entity in role_list[role_index]:
  18. return role_index
  19. return None
  20. dict_role_id = {"0":"tenderee",
  21. "1":"agency",
  22. "2":"win_tenderer",
  23. "3":"second_tenderer",
  24. "4":"third_tenderer"}
  25. def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
  26. '''
  27. @param:
  28. packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
  29. sentence_index:实体所在的句子
  30. begin_index:实体所在句子的起始位置
  31. @return:公司实体所属的包
  32. @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
  33. '''
  34. '''
  35. if len(packageList)==0:
  36. return None
  37. before_index = None
  38. after_index = None
  39. equal_index = None
  40. equal_count = 0
  41. for pack_index in range(len(packageList)):
  42. if packageList[pack_index][1]>sentence_index and after_index is None:
  43. after_index = pack_index
  44. if packageList[pack_index][1]<sentence_index:
  45. before_index = pack_index
  46. if packageList[pack_index][1]==sentence_index and equal_index is None:
  47. equal_index = pack_index
  48. #当前句子和之前句子未找到包
  49. if before_index is None and equal_index is None:
  50. return None
  51. else:
  52. if after_index is None:
  53. end_index = len(packageList)
  54. else:
  55. end_index = after_index
  56. #只在当前句子找到一个包号
  57. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  58. return packageList[end_index-1][0]
  59. else:
  60. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  61. if packageList[i][2]>int(begin_index):
  62. if packageList[i-1][4]:
  63. return packageList[i-1][0]
  64. else:
  65. if packageList[i][4]:
  66. return packageList[i-1][0]
  67. else:
  68. return packageList[i][0]
  69. return packageList[end_index-1][0]
  70. '''
  71. if len(packageList)==0:
  72. return None,False
  73. list_legalPack = []
  74. for pack_index in range(len(packageList)):
  75. if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
  76. continue
  77. if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
  78. continue
  79. if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index)) and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
  80. if MAX_DIS is not None:
  81. if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
  82. list_legalPack.append(pack_index)
  83. else:
  84. list_legalPack.append(pack_index)
  85. _flag = True
  86. for _index in list_legalPack:
  87. if roleid in packageList[_index]["hit"]:
  88. continue
  89. else:
  90. _flag = False
  91. packageList[_index]["hit"].add(roleid)
  92. return packageList[_index]["pointer"],_flag
  93. if len(list_legalPack)>0:
  94. return packageList[0]["pointer"],_flag
  95. return None,False
# Generate the legal (non-contradictory) role combinations
def get_legal_comba(list_entity,dict_role_combination):
    '''
    @summary: enumerate the role assignments that do not contradict each other
    @param:
        list_entity: all entities of the article; only forwarded to
            get_dict_entity_prob to score combinations
        dict_role_combination: {packageName: {role label: set of entity
            texts}}; the empty string in a set stands for "role not filled"
    @return: list of dicts keyed "<packageName>$$<role label>" whose values
        are entity texts
    '''
    # Collect every legal combination inside a single package
    def circle_package(_dict_legal_combination):
        list_dict_role_first = []
        for _role in _dict_legal_combination:
            if len(list_dict_role_first)==0:
                # Nothing collected yet: seed with one single-role dict per entity of this role
                for _entity in _dict_legal_combination[_role]:
                    if _entity !="":
                        list_dict_role_first.append({_role:_entity})
            else:
                list_dict_role_after = []
                _find_count = 0
                for _entity in _dict_legal_combination[_role]:
                    if _entity !="":
                        for _dict in list_dict_role_first:
                            _flag = True
                            for _key1 in _dict:
                                if _entity==_dict[_key1]:
                                    # Changed so that tenderee ("0") and agency ("1") may be the same entity
                                    if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
                                        _flag = True
                                    else:
                                        _flag = False
                            if _flag:
                                _find_count += 1
                                _new_dict = copy.copy(_dict)
                                _new_dict[_role] = _entity
                                # Size cap; this break only leaves the innermost `for _dict` loop
                                if len(list_dict_role_after)>100000:
                                    break
                                list_dict_role_after.append(_new_dict)
                if len(list_dict_role_after)==0:
                    pass
                else:
                    # Extended combinations are added next to the shorter ones, which are kept
                    list_dict_role_first.extend(list_dict_role_after)
        return list_dict_role_first
    # Recursive variant of the per-package enumeration; its only call below is commented out
    def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
        last_layer = False
        # An empty combination yields an empty result
        if len(_dict_legal_combination.keys())==0:
            return []
        # Mark the last recursion level
        if len(_dict_legal_combination.keys())==1:
            last_layer = True
        # Take one role and iterate over its candidates
        _key_role = list(_dict_legal_combination.keys())[0]
        for item in _dict_legal_combination[_key_role]:
            copy_dict_one_selution = copy.copy(dict_one_selution)
            copy_dict_legal_combination = {}
            copy_set_legal_entity = copy.copy(set_legal_entity)
            # Copy all remaining roles for the next recursion round
            for _key in _dict_legal_combination.keys():
                if _key!=_key_role:
                    copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
            # Changed so that tenderee and agency may be the same entity
            if item !="":
                _flag = True
                if str(_key_role) in ["0","1"]:
                    for _key_flag in copy_dict_one_selution:
                        if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
                            _flag = False
                else:
                    for _key_flag in copy_dict_one_selution:
                        if copy_dict_one_selution[_key_flag]==item:
                            _flag = False
                if _flag:
                    copy_dict_one_selution[_key_role] = item
            '''
            if item not in copy_set_legal_entity:
                if item !="":
                    copy_dict_one_selution[_key_role] = item
            '''
            copy_set_legal_entity.add(item)
            if last_layer:
                list_all_selution.append(copy_dict_one_selution)
            else:
                recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
    # Recursively combine the results of the individual packages; its only call below is commented out
    def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
        last_layer = False
        if len(_dict_legal_combination.keys())==0:
            return []
        if len(_dict_legal_combination.keys())==1:
            last_layer = True
        _key_pack = list(_dict_legal_combination.keys())[0]
        for item in _dict_legal_combination[_key_pack]:
            copy_dict_one_selution = copy.copy(dict_one_selution)
            copy_dict_legal_combination = {}
            for _key in _dict_legal_combination.keys():
                if _key!=_key_pack:
                    copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
            for _key_role in item.keys():
                copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
            if last_layer:
                list_all_selution.append(copy_dict_one_selution)
            else:
                recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
        return list_all_selution
    # Iteratively build the cross product of the per-package combinations
    def circle_pageages(_dict_legal_combination):
        list_all_selution = []
        for _key_pack in _dict_legal_combination.keys():
            list_key_selution = []
            for item in _dict_legal_combination[_key_pack]:
                _dict = dict()
                # Prefix every role key with its package name
                for _key_role in item.keys():
                    _dict[_key_pack+"$$"+_key_role] = item[_key_role]
                list_key_selution.append(_dict)
            if len(list_all_selution)==0:
                list_all_selution = list_key_selution
            else:
                _list_all_selution = []
                for item_1 in list_all_selution:
                    for item_2 in list_key_selution:
                        _list_all_selution.append(dict(item_1,**item_2))
                list_all_selution = _list_all_selution
        return list_all_selution
    # Resolve each package on its own
    _dict_legal_combination = {}
    for packageName in dict_role_combination.keys():
        _list_all_selution = []
        # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
        _list_all_selution = circle_package(dict_role_combination[packageName])
        '''
        print("===1")
        print(packageName)
        for item in _list_all_selution:
            print(item)
        print("===2")
        '''
        # Drop combinations that are strict subsets of another combination
        list_all_selution_simple = []
        _list_set_all_selution = []
        for item_selution in _list_all_selution:
            item_set_selution = set()
            for _key in item_selution.keys():
                item_set_selution.add((_key,item_selution[_key]))
            _list_set_all_selution.append(item_set_selution)
        # More than 1000 combinations: keep them all and skip the pruning
        # (presumably to avoid the pairwise comparison below - TODO confirm)
        if len(_list_set_all_selution)>1000:
            _dict_legal_combination[packageName] = _list_all_selution
            continue
        for i in range(len(_list_set_all_selution)):
            be_included = False
            for j in range(len(_list_set_all_selution)):
                if i!=j:
                    # i is fully contained in j and the two differ in size
                    if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
                        be_included = True
            if not be_included:
                list_all_selution_simple.append(_list_all_selution[i])
        _dict_legal_combination[packageName] = list_all_selution_simple
    _list_final_comba = []
    # Number of combinations the cross product over all packages would have
    _comba_count = 1
    for _key in _dict_legal_combination.keys():
        _comba_count *= len(_dict_legal_combination[_key])
    # If that is too large, keep only the highest-scoring combination per package
    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    if _comba_count>250:
        new_dict_legal_combination = dict()
        for _key_pack in _dict_legal_combination.keys():
            MAX_PROB = -1000
            _MAX_PROB_COMBA = None
            for item in _dict_legal_combination[_key_pack]:
                # print(_key_pack,item)
                _dict = dict()
                for _key in item.keys():
                    _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
                if _prob>MAX_PROB:
                    MAX_PROB = _prob
                    _MAX_PROB_COMBA = [item]
            # NOTE(review): stays None when the package has no combination scoring above -1000
            new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
        _dict_legal_combination = new_dict_legal_combination
    #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
    _list_final_comba = circle_pageages(_dict_legal_combination)
    # Apart from the "Project" package (tenderee and agency) packages do not conflict with each other.
    # Check whether an entity appears both in the Project package and in another package; if so, split the combination.
    _list_real_comba = []
    for dict_item in _list_final_comba:
        set_project = set()
        set_other = set()
        for _key in list(dict_item.keys()):
            if _key.split("$$")[0]=="Project":
                set_project.add(dict_item[_key])
            else:
                set_other.add(dict_item[_key])
        set_common = set_project&set_other
        if len(set_common)>0:
            # Emit two variants: one keeps the shared entity only under Project,
            # the other keeps it only under the non-Project package
            dict_project = {}
            dict_not_project = {}
            for _key in list(dict_item.keys()):
                if dict_item[_key] in set_common:
                    if str(_key.split("$$")[0])=="Project":
                        dict_project[_key] = dict_item[_key]
                    else:
                        dict_not_project[_key] = dict_item[_key]
                else:
                    dict_project[_key] = dict_item[_key]
                    dict_not_project[_key] = dict_item[_key]
            _list_real_comba.append(dict_project)
            _list_real_comba.append(dict_not_project)
        else:
            _list_real_comba.append(dict_item)
    return _list_real_comba
  300. def get_dict_entity_prob(list_entity,on_value=0.5):
  301. dict_pack_entity_prob = {}
  302. for entity in list_entity:
  303. if entity.entity_type in ['org','company']:
  304. values = entity.values
  305. role_prob = float(values[int(entity.label)])
  306. _key = entity.packageName+"$$"+str(entity.label)
  307. if role_prob>=on_value and str(entity.label)!="5":
  308. _key_prob = _key+"$text$"+entity.entity_text
  309. if _key_prob in dict_pack_entity_prob:
  310. if role_prob>dict_pack_entity_prob[_key_prob][1]:
  311. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  312. else:
  313. dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
  314. return dict_pack_entity_prob
  315. #计算合计期望
  316. def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
  317. '''
  318. expect = 0
  319. for entity in list_entity:
  320. if entity.entity_type in ['org','company']:
  321. values = entity.values
  322. role_prob = float(values[int(entity.label)])
  323. _key = entity.packageName+"$$"+str(entity.label)
  324. if role_prob>on_value and str(entity.label)!="5":
  325. if _key in combination.keys() and combination[_key]==entity.entity_text:
  326. expect += math.pow(role_prob,4)
  327. else:
  328. expect -= math.pow(role_prob,4)
  329. '''
  330. #修改为同一个实体只取对应包-角色的最大的概率值
  331. expect = 0
  332. dict_entity_prob = {}
  333. for _key_pack_entity in dict_pack_entity_prob:
  334. _key_pack = _key_pack_entity.split("$text$")[0]
  335. role_prob = dict_pack_entity_prob[_key_pack_entity][1]
  336. if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
  337. if _key_pack_entity in dict_entity_prob.keys():
  338. if dict_entity_prob[_key_pack_entity]<role_prob:
  339. dict_entity_prob[_key_pack_entity] = role_prob
  340. else:
  341. dict_entity_prob[_key_pack_entity] = role_prob
  342. else:
  343. if _key_pack_entity in dict_entity_prob.keys():
  344. if dict_entity_prob[_key_pack_entity]>-role_prob:
  345. dict_entity_prob[_key_pack_entity] = -role_prob
  346. else:
  347. dict_entity_prob[_key_pack_entity] = -role_prob
  348. # for entity in list_entity:
  349. # if entity.entity_type in ['org','company']:
  350. # values = entity.values
  351. # role_prob = float(values[int(entity.label)])
  352. # _key = entity.packageName+"$$"+str(entity.label)
  353. # if role_prob>=on_value and str(entity.label)!="5":
  354. # if _key in combination.keys() and combination[_key]==entity.entity_text:
  355. # _key_prob = _key+entity.entity_text
  356. # if _key_prob in dict_entity_prob.keys():
  357. # if dict_entity_prob[_key_prob]<role_prob:
  358. # dict_entity_prob[_key_prob] = role_prob
  359. # else:
  360. # dict_entity_prob[_key_prob] = role_prob
  361. # else:
  362. # _key_prob = _key+entity.entity_text
  363. # if _key_prob in dict_entity_prob.keys():
  364. # if dict_entity_prob[_key_prob]>-role_prob:
  365. # dict_entity_prob[_key_prob] = -role_prob
  366. # else:
  367. # dict_entity_prob[_key_prob] = -role_prob
  368. for _key in dict_entity_prob.keys():
  369. symbol = 1 if dict_entity_prob[_key]>0 else -1
  370. expect += symbol*math.pow(dict_entity_prob[_key],2)
  371. return expect
  372. def getRoleList(list_sentence,list_entity,on_value = 0.5):
  373. '''
  374. @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回
  375. @param:
  376. list_sentence:文章所有的sentence
  377. list_entity:文章所有的实体
  378. on_value:概率阈值
  379. @return:文章的角色list
  380. '''
  381. pack = getPackagesFromArticle(list_sentence,list_entity)
  382. if pack is None:
  383. return None
  384. PackageList,PackageSet,dict_PackageCode = pack
  385. #拿到所有可能的情况
  386. dict_role_combination = {}
  387. #拿到各个实体的packageName,packageCode
  388. for entity in list_entity:
  389. if entity.entity_type in ['org','company']:
  390. values = entity.values
  391. role_prob = float(values[int(entity.label)])
  392. if role_prob>=on_value and str(entity.label)!="5":
  393. if str(entity.label) in ["0","1"]:
  394. packageName = "Project"
  395. else:
  396. if len(PackageSet)>0:
  397. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.end_index,"role-"+str(entity.label))
  398. if packagePointer is None:
  399. #continue
  400. packageName = "Project"
  401. else:
  402. #add pointer_pack
  403. entity.pointer_pack = packagePointer
  404. packageName = packagePointer.entity_text
  405. else:
  406. packageName = "Project"
  407. find_flag = False
  408. if packageName in dict_PackageCode.keys():
  409. packageCode = dict_PackageCode[packageName]
  410. else:
  411. packageCode = ""
  412. entity.packageCode = packageCode
  413. role_name = dict_role_id.get(str(entity.label))
  414. entity.roleName = role_name
  415. entity.packageName = packageName
  416. if entity.packageName in dict_role_combination.keys():
  417. if str(entity.label) in dict_role_combination[entity.packageName].keys():
  418. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  419. else:
  420. dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
  421. else:
  422. dict_role_combination[entity.packageName] = {}
  423. #初始化空值
  424. roleIds = [0,1,2,3,4]
  425. for _roleId in roleIds:
  426. dict_role_combination[entity.packageName][str(_roleId)] = set([""])
  427. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  428. list_real_comba = get_legal_comba(list_entity,dict_role_combination)
  429. #拿到最大期望值的组合
  430. max_index = 0
  431. max_expect = -100
  432. _index = 0
  433. dict_pack_entity_prob = get_dict_entity_prob(list_entity)
  434. for item_combination in list_real_comba:
  435. expect = getSumExpectation(dict_pack_entity_prob, item_combination)
  436. if expect>max_expect:
  437. max_index = _index
  438. max_expect = expect
  439. _index += 1
  440. RoleList = []
  441. RoleSet = set()
  442. if len(list_real_comba)>0:
  443. for _key in list_real_comba[max_index].keys():
  444. packageName = _key.split("$$")[0]
  445. label = _key.split("$$")[1]
  446. role_name = dict_role_id.get(str(label))
  447. entity_text = list_real_comba[max_index][_key]
  448. if packageName in dict_PackageCode.keys():
  449. packagecode = dict_PackageCode.get(packageName)
  450. else:
  451. packagecode = ""
  452. RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
  453. RoleSet.add(entity_text)
  454. #根据最优树来修正list_entity中角色对包的连接
  455. for _entity in list_entity:
  456. if _entity.pointer_pack is not None:
  457. _pack_name = _entity.pointer_pack.entity_text
  458. _find_flag = False
  459. for _prem in RoleList:
  460. if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
  461. _find_flag = True
  462. if not _find_flag:
  463. _entity.pointer_pack = None
  464. return RoleList,RoleSet,PackageList,PackageSet
  465. def getPackageScopePattern():
  466. '''
  467. @summary: 获取包的作用域关键词
  468. '''
  469. df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
  470. pattern = "("
  471. for item in df["list_word"]:
  472. item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
  473. pattern += item+"|"
  474. pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
  475. return pattern
# Scope-keyword regex, built once when the module is imported
# (getPackageScopePattern reads end.xls at that point).
pattern_packageScope = getPackageScopePattern()
  477. def getPackagesFromArticle(list_sentence,list_entity):
  478. '''
  479. @param:
  480. list_sentence:文章的句子list
  481. @summary: 将包的信息插入list_entity中
  482. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  483. '''
  484. if len(list_sentence)==0:
  485. return None
  486. PackageList = []
  487. PackageList_scope = []
  488. PackageSet = set()
  489. dict_packageCode = dict()
  490. package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
  491. package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
  492. package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
  493. # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
  494. other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{,50}?)(,|。)') # # 2020/11/23 大网站规则 调整 package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
  495. win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{,25})(,|。)') # 2020/11/23 大网站规则 调整
  496. model_pattern = re.compile('(型号|序号)[::]([^,。]{,20})(,|。)') # 2020/11/23 大网站规则 调整
  497. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
  498. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
  499. def changeIndexFromWordToWords(tokens,word_index):
  500. '''
  501. @summary:转换某个字的字偏移为词偏移
  502. '''
  503. before_index = 0
  504. after_index = 0
  505. for i in range(len(tokens)):
  506. after_index = after_index+len(tokens[i])
  507. if before_index<=word_index and after_index>=word_index:
  508. return i
  509. before_index = after_index
  510. package_names = []
  511. def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
  512. '''
  513. @summary:抽取包附近的标段号
  514. @param:
  515. tokens:包所在句子的分词
  516. word_index:包所在字偏移
  517. size:左右各取多少个词
  518. pattern:提取标段号的正则
  519. @return: type:string,meaning:标段号
  520. '''
  521. index = changeIndexFromWordToWords(tokens,word_index)
  522. if index<size:
  523. begin = index
  524. else:
  525. begin = index-size
  526. if index+size>len(tokens):
  527. end = len(tokens)
  528. else:
  529. end = index+size
  530. #拿到左右两边的词语组成短语
  531. text = "".join(tokens[begin:end])
  532. #在短语中的字偏移
  533. new_word_index = word_index-len("".join(tokens[:begin]))
  534. min_distance = len(text)
  535. packageCode = None
  536. for the_iter in re.finditer(pattern,text):
  537. #算出最小距离
  538. distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
  539. if distance<min_distance:
  540. min_distance = distance
  541. packageCode = the_iter.group(1)
  542. return packageCode
  543. #从标段介绍表格中提取包名和包号
  544. for i in range(len(list_sentence)):
  545. content = list_sentence[i].sentence_text
  546. names = re.findall(package_name_pattern,content)
  547. if names == []:
  548. names = re.findall(other_package_pattern, content)
  549. N_names = re.findall(package_N_name_pattern,content)
  550. if len(names)==1 and len(N_names)==1:
  551. package_names.append([names[0][-1],N_names[0][-1]])
  552. for i in range(len(list_sentence)):
  553. PackageList_item = []
  554. PackageList_item_scope = []
  555. content = list_sentence[i].sentence_text
  556. tokens = list_sentence[i].tokens
  557. for name in package_names[:20]:
  558. for index in findAllIndex(name[0],content):
  559. temp_package_number = re.findall(number_pattern,name[1])[0]
  560. PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
  561. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
  562. code = extractPackageCode(tokens, index)
  563. if code is not None:
  564. dict_packageCode[temp_package_number] = code
  565. PackageSet.add(temp_package_number)
  566. for iter in re.finditer(package_number_pattern,content):
  567. temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
  568. PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  569. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  570. code = extractPackageCode(tokens, iter.span()[0])
  571. if code is not None:
  572. dict_packageCode[temp_package_number] = code
  573. PackageSet.add(temp_package_number)
  574. #识别packageScope
  575. for iter in re.finditer(pattern_packageScope,content):
  576. PackageList_item_scope.append({"name":"","sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  577. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  578. PackageList_item_scope = PackageList_item +PackageList_item_scope
  579. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  580. PackageList_scope = PackageList_scope+PackageList_item_scope
  581. PackageList_item.sort(key=lambda x:x["sentence_index"])
  582. #PackageList = PackageList+PackageList_item
  583. #不作为包
  584. # if len(PackageSet)==0:
  585. # for i in range(len(list_sentence)):
  586. # PackageList_item = []
  587. # PackageList_item_scope = []
  588. # content = list_sentence[i].sentence_text
  589. # tokens = list_sentence[i].tokens
  590. # for iter in re.finditer(other_package_pattern,content):
  591. # temp_package_number = iter.group(2)
  592. # PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  593. # # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  594. # code = extractPackageCode(tokens, iter.span()[0])
  595. # if code is not None:
  596. # dict_packageCode[temp_package_number] = code
  597. # PackageSet.add(temp_package_number)
  598. # #识别packageScope
  599. # for iter in re.finditer(pattern_packageScope,content):
  600. # PackageList_item_scope.append({"name":"","sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  601. # # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  602. # PackageList_item_scope = PackageList_item +PackageList_item_scope
  603. # PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  604. # PackageList_scope = PackageList_scope+PackageList_item_scope
  605. # PackageList_item.sort(key=lambda x:x["sentence_index"])
  606. # 2020/11/23 大网站规则 调整
  607. if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
  608. for i in range(len(list_sentence)):
  609. PackageList_item = []
  610. PackageList_item_scope = []
  611. content = list_sentence[i].sentence_text
  612. tokens = list_sentence[i].tokens
  613. names = re.findall(other_package_pattern, content)
  614. N_names = re.findall(win_tenderer_pattern, content)
  615. if len(names) != 1 or len(N_names) != 1:
  616. continue
  617. for iter in re.finditer(other_package_pattern,content):
  618. temp_package_number = iter.group(4)
  619. xinghao = re.search(model_pattern, content)
  620. if xinghao:
  621. temp_package_number = temp_package_number + '+' + xinghao.group(2)
  622. # print('新正则采购包名补充',temp_package_number)
  623. PackageList_item.append({"name":temp_package_number,"sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  624. # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  625. code = extractPackageCode(tokens, iter.span()[0])
  626. if code is not None:
  627. dict_packageCode[temp_package_number] = code
  628. PackageSet.add(temp_package_number)
  629. #识别packageScope
  630. for iter in re.finditer(pattern_packageScope,content):
  631. PackageList_item_scope.append({"name":"","sentence_index":i,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
  632. # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
  633. PackageList_item_scope = PackageList_item +PackageList_item_scope
  634. PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
  635. PackageList_scope = PackageList_scope+PackageList_item_scope
  636. PackageList_item.sort(key=lambda x:x["sentence_index"])
  637. pattern_punctuation = "[::()\(\),,。;;]"
  638. for i in range(len(list_sentence)):
  639. for j in range(len(PackageList_scope)):
  640. if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
  641. _flag = False
  642. left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
  643. right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
  644. _left_find = re.findall(pattern_punctuation,left_str)
  645. _right_find = re.findall(pattern_punctuation,right_str)
  646. #print(left_str)
  647. if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
  648. continue
  649. if re.search("划分",right_str[:10]) is not None:
  650. continue
  651. if len(_left_find)>0 and _left_find[-1] in [":",":"]:
  652. _flag = True
  653. if len(_right_find)>0 and _right_find[0] in [":",":"]:
  654. _flag = True
  655. if _flag:
  656. scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
  657. else:
  658. if j==0:
  659. scope_begin = [0,0]
  660. else:
  661. scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
  662. if j==len(PackageList_scope)-1:
  663. scope_end = [PackageList_scope[j]["offsetWords_begin"],changeIndexFromWordToWords(list_sentence[i].tokens, len(list_sentence[i].sentence_text))]
  664. else:
  665. scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
  666. if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
  667. continue
  668. #add package to entity
  669. _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"])
  670. list_entity.append(_pack_entity)
  671. copy_pack = copy.copy(PackageList_scope[j])
  672. copy_pack["scope"] = [scope_begin,scope_end]
  673. copy_pack["hit"] = set()
  674. copy_pack["pointer"] = _pack_entity
  675. PackageList.append(copy_pack)
  676. return PackageList,PackageSet,dict_packageCode
  677. def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4):
  678. '''
  679. @param:
  680. PackDict:文章包dict
  681. roleSet:文章所有角色的公司名称
  682. PackageList:文章的包信息
  683. PackageSet:文章所有包的名称
  684. list_entity:文章所有经过模型处理的实体
  685. on_value:金额模型的阈值
  686. on_value_person:联系人模型的阈值
  687. sentence_len:公司和属性间隔句子的最大长度
  688. @return:添加了属性信息的角色list
  689. '''
  690. #根据roleid添加金额到rolelist中
  691. def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
  692. for i in range(len(packDict[packageName]["roleList"])):
  693. if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
  694. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  695. packDict[packageName]["roleList"][i].money = money
  696. packDict[packageName]["roleList"][i].money_prob = money_prob
  697. return packDict
  698. #根据实体名称添加金额到rolelist中
  699. def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
  700. for i in range(len(packDict[packageName]["roleList"])):
  701. if packDict[packageName]["roleList"][i].entity_text==entity:
  702. if money_prob>packDict[packageName]["roleList"][i].money_prob:
  703. packDict[packageName]["roleList"][i].money = money
  704. packDict[packageName]["roleList"][i].money_prob = money_prob
  705. return packDict
  706. #根据实体名称得到角色
  707. def getRoleWithText(packDict,entity_text):
  708. for pack in packDict.keys():
  709. for i in range(len(packDict[pack]["roleList"])):
  710. if packDict[pack]["roleList"][i].entity_text==entity_text:
  711. return packDict[pack]["roleList"][i].role_name
  712. def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
  713. _list_entitys = [entity]+entity.linked_entitys
  714. for _entity in _list_entitys:
  715. if _entity.entity_text in RoleSet:
  716. return True
  717. p_entity = 0
  718. #遍历所有实体
  719. while(p_entity<len(list_entity)):
  720. entity = list_entity[p_entity]
  721. '''
  722. #招标金额从后往前找
  723. if entity.entity_type=="money":
  724. if entity.values[entity.label]>=on_value:
  725. if str(entity.label)=="0":
  726. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  727. if packagePointer is None:
  728. packageName = "Project"
  729. else:
  730. packageName = packagePointer.entity_text
  731. addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
  732. '''
  733. ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
  734. if entity.entity_type=="person":
  735. if entity.values[entity.label]>=on_value_person:
  736. if str(entity.label)=="1":
  737. for i in range(len(PackDict["Project"]["roleList"])):
  738. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  739. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  740. # add pointer_person
  741. for _entity in list_entity:
  742. if dict_role_id.get(str(_entity.label))=="tenderee":
  743. for i in range(len(PackDict["Project"]["roleList"])):
  744. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  745. _entity.pointer_person = entity
  746. elif str(entity.label)=="2":
  747. for i in range(len(PackDict["Project"]["roleList"])):
  748. if PackDict["Project"]["roleList"][i].role_name=="agency":
  749. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  750. # add pointer_person
  751. for _entity in list_entity:
  752. if dict_role_id.get(str(_entity.label))=="agency":
  753. for i in range(len(PackDict["Project"]["roleList"])):
  754. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  755. _entity.pointer_person = entity
  756. '''
  757. #金额往前找实体
  758. if entity.entity_type=="money":
  759. if entity.values[entity.label]>=on_value:
  760. p_entity_money= p_entity
  761. entity_money = list_entity[p_entity_money]
  762. if len(PackageSet)>0:
  763. packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
  764. if packagePointer is None:
  765. packageName_entity = "Project"
  766. else:
  767. packageName_entity = packagePointer.entity_text
  768. else:
  769. packageName_entity = "Project"
  770. while(p_entity_money>0):
  771. entity_before = list_entity[p_entity_money]
  772. if entity_before.entity_type in ['org','company']:
  773. if str(entity_before.label)=="1":
  774. addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
  775. #add pointer_money
  776. entity_before.pointer_money = entity_money
  777. break
  778. p_entity_money -= 1
  779. #如果实体属于角色集合,则往后找属性
  780. if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
  781. p_entity += 1
  782. #循环查找符合的属性
  783. while(p_entity<len(list_entity)):
  784. entity_after = list_entity[p_entity]
  785. if entity_after.sentence_index-entity.sentence_index>=sentence_len:
  786. p_entity -= 1
  787. break
  788. #若是遇到公司实体,则跳出循环
  789. if entity_after.entity_type in ['org','company']:
  790. p_entity -= 1
  791. break
  792. if entity_after.values is not None:
  793. if entity_after.entity_type=="money":
  794. if entity_after.values[entity_after.label]>=on_value:
  795. '''
  796. #招标金额从后往前找
  797. if str(entity_after.label)=="0":
  798. packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
  799. if packagePointer is None:
  800. packageName = "Project"
  801. else:
  802. packageName = packagePointer.entity_text
  803. addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
  804. '''
  805. if str(entity_after.label)=="1":
  806. #print(entity_after.entity_text,entity.entity_text)
  807. _list_entitys = [entity]+entity.linked_entitys
  808. if len(PackageSet)>0:
  809. packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
  810. if packagePointer is None:
  811. packageName_entity = "Project"
  812. else:
  813. packageName_entity = packagePointer.entity_text
  814. else:
  815. packageName_entity = "Project"
  816. if str(entity.label) in ["2","3","4"]:
  817. addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
  818. #add pointer_money
  819. entity.pointer_money = entity_after
  820. '''
  821. if entity_after.entity_type=="person":
  822. if entity_after.values[entity_after.label]>=on_value_person:
  823. if str(entity_after.label)=="1":
  824. for i in range(len(roleList)):
  825. if roleList[i].role_name=="tenderee":
  826. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  827. elif str(entity_after.label)=="2":
  828. for i in range(len(roleList)):
  829. if roleList[i].role_name=="agency":
  830. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  831. elif str(entity_after.label)=="3":
  832. _list_entitys = [entity]+entity.linked_entitys
  833. for _entity in _list_entitys:
  834. for i in range(len(roleList)):
  835. if roleList[i].entity_text==_entity.entity_text:
  836. if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
  837. break
  838. roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  839. '''
  840. p_entity += 1
  841. p_entity += 1
  842. ''''''
  843. # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
  844. temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
  845. other_person = [] # 阈值以上的联系人列表
  846. link_person = [] # 有电话没联系上角色的person列表
  847. other_ent = []
  848. link_ent = []
  849. found_person = False
  850. ent_list = []
  851. for entity in list_entity:
  852. if entity.entity_type in ['org','company','person']:
  853. ent_list.append(entity)
  854. #for list_index in range(len(ent_list)):
  855. #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
  856. #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
  857. #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
  858. # 2020/11/25增加确定角色联系人判断
  859. sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
  860. for index in range(len(ent_list)):
  861. entity = ent_list[index]
  862. if entity.entity_type=="person":
  863. if str(entity.label) == "0": # 2020/11/25 非联系人直接跳过
  864. continue
  865. if entity.values[entity.label]>on_value_person:
  866. if str(entity.label)=="1":
  867. for i in range(len(PackDict["Project"]["roleList"])):
  868. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  869. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  870. link_person.append(entity.entity_text)
  871. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  872. # add pointer_person
  873. for _entity in list_entity:
  874. if dict_role_id.get(str(_entity.label))=="tenderee":
  875. for i in range(len(PackDict["Project"]["roleList"])):
  876. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
  877. _entity.pointer_person = entity
  878. elif str(entity.label)=="2":
  879. for i in range(len(PackDict["Project"]["roleList"])):
  880. if PackDict["Project"]["roleList"][i].role_name=="agency":
  881. PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
  882. link_person.append(entity.entity_text)
  883. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  884. # add pointer_person
  885. for _entity in list_entity:
  886. if dict_role_id.get(str(_entity.label))=="agency":
  887. for i in range(len(PackDict["Project"]["roleList"])):
  888. if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
  889. _entity.pointer_person = entity
  890. elif str(entity.label)=="3":
  891. if entity.entity_text in sure_person_set: # 2020/11/25 排除已经确定角色的联系人
  892. continue
  893. #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
  894. other_person.append(entity.entity_text)
  895. temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
  896. #if entity.entity_text in roleSet:
  897. if entity.entity_text in roleSet:
  898. if entity.label in [0,1]:
  899. other_ent.append(entity.entity_text)
  900. temp_ent_list.append((entity.entity_text, entity.label,entity))
  901. for behind_index in range(index+1, len(ent_list)):
  902. entity_after = ent_list[behind_index]
  903. if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # 只在本句中找联系人
  904. break
  905. if entity_after.values is not None:
  906. if entity_after.entity_type=="person":
  907. if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
  908. break
  909. if entity_after.values[entity_after.label]>on_value_person:
  910. if str(entity_after.label)=="1":
  911. for i in range(len(PackDict["Project"]["roleList"])):
  912. if PackDict["Project"]["roleList"][i].role_name=="tenderee":
  913. PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  914. link_person.append(entity_after.entity_text)
  915. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  916. elif str(entity_after.label)=="2":
  917. for i in range(len(PackDict["Project"]["roleList"])):
  918. if PackDict["Project"]["roleList"][i].role_name=="agency":
  919. PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  920. link_person.append(entity_after.entity_text)
  921. link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
  922. elif str(entity_after.label)=="3":
  923. if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
  924. break
  925. elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
  926. break
  927. for pack in PackDict.keys():
  928. for i in range(len(PackDict[pack]["roleList"])):
  929. if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
  930. #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
  931. #break
  932. PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
  933. link_person.append(entity_after.entity_text)
  934. #add pointer_person
  935. entity.pointer_person = entity_after
  936. not_link_person = [person for person in other_person if person not in link_person]
  937. not_link_ent = [ent for ent in other_ent if ent not in link_ent]
  938. if len(not_link_person) > 0 and len(not_link_ent) > 0 :
  939. item = temp_ent_list
  940. for i in range(len(item)):
  941. if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
  942. if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
  943. item[i+1], item[i+2] = item[i+2], item[i+1]
  944. for i in range(len(item)-1, -1, -1):
  945. if item[i][0] in not_link_ent:
  946. for pack in PackDict.keys():
  947. for role in PackDict[pack]["roleList"]:
  948. if role.entity_text == item[i][0] and len(role.linklist) < 1:
  949. for j in range(i+1, len(item)):
  950. if item[j][0] in not_link_person:
  951. role.linklist.append(item[j][:2])
  952. #add pointer_person
  953. item[i][2].pointer_person = item[j][2]
  954. break
  955. else:
  956. break
  957. #寻找多标段招标金额
  958. p_entity = len(list_entity)-1
  959. set_tenderer_money = set()
  960. #遍历所有实体
  961. while(p_entity>=0):
  962. entity = list_entity[p_entity]
  963. if entity.entity_type=="money":
  964. if entity.values[entity.label]>=on_value:
  965. if str(entity.label)=="1":
  966. set_tenderer_money.add(float(entity.entity_text))
  967. if str(entity.label)=="0":
  968. '''
  969. if p_entity>0:
  970. p_before = list_entity[p_entity-1]
  971. if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
  972. p_entity -= 1
  973. continue
  974. '''
  975. packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
  976. if packagePointer is None:
  977. packageName = "Project"
  978. else:
  979. packageName = packagePointer.entity_text
  980. if packageName=="Project":
  981. if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
  982. PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
  983. else:
  984. PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
  985. #add pointer_tendereeMoney
  986. packagePointer.pointer_tendereeMoney = entity
  987. p_entity -= 1
  988. #删除一个机构有多个角色的数据
  989. #删除重复人、概率不回传
  990. final_roleList = []
  991. list_pop = []
  992. set_tenderer_role = set()
  993. dict_pack_tenderer_money = dict()
  994. for pack in PackDict.keys():
  995. #删除无效包
  996. if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
  997. list_pop.append(pack)
  998. for i in range(len(PackDict[pack]["roleList"])):
  999. if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
  1000. if PackDict[pack]["roleList"][i].money==0:
  1001. set_tenderer_role.add(PackDict[pack]["roleList"][i])
  1002. dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
  1003. #找到包的中投标金额
  1004. for _index in range(len(PackageList)):
  1005. if "hit" in PackageList[_index]:
  1006. for _hit in list(PackageList[_index]["hit"]):
  1007. _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
  1008. if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
  1009. dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
  1010. #只找到一个中标人和中标金额
  1011. if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
  1012. list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
  1013. #找到一个中标人和多个招标金额
  1014. if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
  1015. _maxMoney = 0
  1016. _sumMoney = 0
  1017. for _m in list(set_tenderer_money):
  1018. _sumMoney += _m
  1019. if _m>_maxMoney:
  1020. _maxMoney = _m
  1021. if _sumMoney/_maxMoney==2:
  1022. list(set_tenderer_role)[0].money = _maxMoney
  1023. else:
  1024. list(set_tenderer_role)[0].money = _maxMoney
  1025. #每个包都只找到一个金额
  1026. _flag_pack_money = True
  1027. for k,v in dict_pack_tenderer_money.items():
  1028. if len(v[1])!=1:
  1029. _flag_pack_money = False
  1030. if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
  1031. for k,v in dict_pack_tenderer_money.items():
  1032. v[0].money = list(v[1])[0]
  1033. for pack in PackDict.keys():
  1034. for i in range(len(PackDict[pack]["roleList"])):
  1035. PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
  1036. for item in list_pop:
  1037. PackDict.pop(item)
  1038. return PackDict
  1039. def initPackageAttr(RoleList,PackageSet):
  1040. '''
  1041. @summary: 根据拿到的roleList和packageSet初始化接口返回的数据
  1042. '''
  1043. packDict = dict()
  1044. packDict["Project"] = {"code":"","tendereeMoney":0,"roleList":[]}
  1045. for item in list(PackageSet):
  1046. packDict[item] = {"code":"","tendereeMoney":0,"roleList":[]}
  1047. for item in RoleList:
  1048. if packDict[item.packageName]["code"] =="":
  1049. packDict[item.packageName]["code"] = item.packageCode
  1050. packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[]))
  1051. return packDict
  1052. def getPackageRoleMoney(list_sentence,list_entity):
  1053. '''
  1054. @param:
  1055. list_sentence:文章的句子list
  1056. list_entity:文章的实体list
  1057. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  1058. '''
  1059. # print("=1")
  1060. theRole = getRoleList(list_sentence,list_entity)
  1061. if not theRole:
  1062. return []
  1063. RoleList,RoleSet,PackageList,PackageSet = theRole
  1064. '''
  1065. for item in PackageList:
  1066. print(item)
  1067. '''
  1068. # print("=2")
  1069. PackDict = initPackageAttr(RoleList, PackageSet)
  1070. # print("=3")
  1071. PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity)
  1072. # print("=4")
  1073. return PackDict
  1074. def getOtherAttributes(list_entity):
  1075. dict_other = {"bidway":"",
  1076. "moneysource":"",
  1077. "person_review":[],
  1078. "time_release":"",
  1079. "time_bidopen":"",
  1080. "time_bidclose":"",
  1081. "serviceTime":"",
  1082. "product":[]}
  1083. for entity in list_entity:
  1084. if entity.entity_type == 'bidway':
  1085. dict_other["bidway"] = entity.entity_text
  1086. elif entity.entity_type=='moneysource':
  1087. dict_other["moneysource"] = entity.entity_text
  1088. elif entity.entity_type=='serviceTime':
  1089. dict_other["serviceTime"] = entity.entity_text
  1090. elif entity.entity_type == 'time' and entity.label==1:
  1091. dict_other["time_release"] = timeFormat(entity.entity_text)
  1092. elif entity.entity_type == 'time' and entity.label==2:
  1093. dict_other["time_bidopen"] = timeFormat(entity.entity_text)
  1094. elif entity.entity_type == 'time' and entity.label == 3:
  1095. dict_other["time_bidclose"] = timeFormat(entity.entity_text)
  1096. elif entity.entity_type=="person" and entity.label ==4:
  1097. dict_other["person_review"].append(entity.entity_text)
  1098. elif entity.entity_type=='product':
  1099. dict_other["product"].append(entity.entity_text)
  1100. dict_other["product"] = list(set(dict_other["product"]))
  1101. return dict_other
  1102. def getPREMs(list_sentences,list_entitys,list_articles):
  1103. '''
  1104. @param:
  1105. list_sentence:所有文章的句子list
  1106. list_entity:所有文章的实体list
  1107. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  1108. '''
  1109. result = []
  1110. for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
  1111. RoleList = getPackageRoleMoney(list_sentence,list_entity)
  1112. result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity)))
  1113. return result
  1114. if __name__=="__main__":
  1115. '''
  1116. conn = getConnection()
  1117. cursor = conn.cursor()
  1118. #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
  1119. sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
  1120. result = []
  1121. cursor.execute(sql)
  1122. rows = cursor.fetchall()
  1123. count = 0
  1124. for row in rows:
  1125. count += 1
  1126. print(count)
  1127. doc_id = row[0]
  1128. roleList = getPackageRoleMoney(doc_id)
  1129. result.append([doc_id,str(roleList),row[1]])
  1130. ''''''
  1131. with codecs.open("getAttribute.html","w",encoding="utf8") as f:
  1132. f.write('<html><head>\
  1133. <meta http-equiv="Content-Type"\
  1134. content="text/html; charset=UTF-8">\
  1135. </head>\
  1136. <body bgcolor="#FFFFFF">\
  1137. <table border="1">\
  1138. <tr>\
  1139. <td>doc_id</td>\
  1140. <td>角色</td>\
  1141. </tr>')
  1142. for item in result:
  1143. f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
  1144. f.write("</table></body>")
  1145. '''