123.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808
  1. from dl.common.Utils import findAllIndex
  2. from dl.interface.Entitys import PREM
  3. import re
  4. import copy
  5. import math
  6. def getTheRole(entity,role_list):
  7. '''
  8. @summary:根据实体名称拿到index
  9. @param:
  10. entity:实体名称
  11. role_list:角色list
  12. @return:该实体所在下标
  13. '''
  14. for role_index in range(len(role_list)):
  15. if entity in role_list[role_index]:
  16. return role_index
  17. return None
  18. dict_role_id = {"0":"tenderee",
  19. "1":"agency",
  20. "2":"win_tenderer",
  21. "3":"second_tenderer",
  22. "4":"third_tenderer"}
  23. def getPackage(packageList,sentence_index,begin_index):
  24. '''
  25. @param:
  26. packageList:文章的包的信息
  27. sentence_index:实体所在的句子
  28. begin_index:实体所在句子的起始位置
  29. @return:公司实体所属的包
  30. '''
  31. if len(packageList)==0:
  32. return None
  33. before_index = None
  34. after_index = None
  35. equal_index = None
  36. equal_count = 0
  37. for pack_index in range(len(packageList)):
  38. if packageList[pack_index][1]>sentence_index and after_index is None:
  39. after_index = pack_index
  40. if packageList[pack_index][1]<sentence_index:
  41. before_index = pack_index
  42. if packageList[pack_index][1]==sentence_index and equal_index is None:
  43. equal_index = pack_index
  44. #当前句子和之前句子未找到包
  45. if before_index is None and equal_index is None:
  46. return None
  47. else:
  48. if after_index is None:
  49. end_index = len(packageList)
  50. else:
  51. end_index = after_index
  52. #只在当前句子找到一个包号
  53. if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
  54. return packageList[end_index-1][0]
  55. else:
  56. for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
  57. if packageList[i][2]>int(begin_index):
  58. if packageList[i-1][4]:
  59. return packageList[i-1][0]
  60. else:
  61. if packageList[i][4]:
  62. return packageList[i-1][0]
  63. else:
  64. return packageList[i][0]
  65. return packageList[end_index-1][0]
  66. return None
  67. #生成合法的组合
  68. def get_legal_comba(list_entity,dict_role_combination):
  69. #拿到一个包中所有合法的组合
  70. def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
  71. last_layer = False
  72. #若是空组合则放回空
  73. if len(_dict_legal_combination.keys())==0:
  74. return []
  75. #递归到最后一层则修改状态
  76. if len(_dict_legal_combination.keys())==1:
  77. last_layer = True
  78. #取一个角色开始进行遍历
  79. _key_role = list(_dict_legal_combination.keys())[0]
  80. for item in _dict_legal_combination[_key_role]:
  81. copy_dict_one_selution = copy.copy(dict_one_selution)
  82. copy_dict_legal_combination = {}
  83. copy_set_legal_entity = copy.copy(set_legal_entity)
  84. #复制余下的所有角色,进行下一轮递归
  85. for _key in _dict_legal_combination.keys():
  86. if _key!=_key_role:
  87. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  88. #修改为招标人和代理人可以为同一个
  89. if item !="":
  90. _flag = True
  91. if str(_key_role) in ["0","1"]:
  92. for _key_flag in copy_dict_one_selution:
  93. if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
  94. _flag = False
  95. else:
  96. for _key_flag in copy_dict_one_selution:
  97. if copy_dict_one_selution[_key_flag]==item:
  98. _flag = False
  99. if _flag:
  100. copy_dict_one_selution[_key_role] = item
  101. '''
  102. if item not in copy_set_legal_entity:
  103. if item !="":
  104. copy_dict_one_selution[_key_role] = item
  105. '''
  106. copy_set_legal_entity.add(item)
  107. if last_layer:
  108. list_all_selution.append(copy_dict_one_selution)
  109. else:
  110. recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
  111. #递归匹配各个包的结果
  112. def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
  113. last_layer = False
  114. if len(_dict_legal_combination.keys())==0:
  115. return []
  116. if len(_dict_legal_combination.keys())==1:
  117. last_layer = True
  118. _key_pack = list(_dict_legal_combination.keys())[0]
  119. for item in _dict_legal_combination[_key_pack]:
  120. copy_dict_one_selution = copy.copy(dict_one_selution)
  121. copy_dict_legal_combination = {}
  122. for _key in _dict_legal_combination.keys():
  123. if _key!=_key_pack:
  124. copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
  125. for _key_role in item.keys():
  126. copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
  127. if last_layer:
  128. list_all_selution.append(copy_dict_one_selution)
  129. else:
  130. recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
  131. return list_all_selution
  132. #循环获取所有包组合
  133. def circle_pageages(_dict_legal_combination):
  134. list_all_selution = []
  135. for _key_pack in _dict_legal_combination.keys():
  136. list_key_selution = []
  137. for item in _dict_legal_combination[_key_pack]:
  138. _dict = dict()
  139. for _key_role in item.keys():
  140. _dict[_key_pack+"$$"+_key_role] = item[_key_role]
  141. list_key_selution.append(_dict)
  142. if len(list_all_selution)==0:
  143. list_all_selution = list_key_selution
  144. else:
  145. _list_all_selution = []
  146. for item_1 in list_all_selution:
  147. for item_2 in list_key_selution:
  148. _list_all_selution.append(dict(item_1,**item_2))
  149. list_all_selution = _list_all_selution
  150. return list_all_selution
  151. #拿到各个包解析之后的结果
  152. _dict_legal_combination = {}
  153. for packageName in dict_role_combination.keys():
  154. _list_all_selution = []
  155. recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
  156. '''
  157. print("===1")
  158. print(packageName)
  159. for item in _list_all_selution:
  160. print(item)
  161. print("===2")
  162. '''
  163. #去除包含子集
  164. list_all_selution_simple = []
  165. _list_set_all_selution = []
  166. for item_selution in _list_all_selution:
  167. item_set_selution = set()
  168. for _key in item_selution.keys():
  169. item_set_selution.add((_key,item_selution[_key]))
  170. _list_set_all_selution.append(item_set_selution)
  171. if len(_list_set_all_selution)>1000:
  172. _dict_legal_combination[packageName] = _list_all_selution
  173. continue
  174. for i in range(len(_list_set_all_selution)):
  175. be_included = False
  176. for j in range(len(_list_set_all_selution)):
  177. if i!=j:
  178. if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
  179. be_included = True
  180. if not be_included:
  181. list_all_selution_simple.append(_list_all_selution[i])
  182. _dict_legal_combination[packageName] = list_all_selution_simple
  183. _list_final_comba = []
  184. #对各个包的结果进行排列组合
  185. _comba_count = 1
  186. for _key in _dict_legal_combination.keys():
  187. _comba_count *= len(_dict_legal_combination[_key])
  188. #如果过大,则每个包只取概率最大的那个
  189. if _comba_count>250:
  190. new_dict_legal_combination = dict()
  191. for _key_pack in _dict_legal_combination.keys():
  192. MAX_PROB = -1000
  193. _MAX_PROB_COMBA = None
  194. for item in _dict_legal_combination[_key_pack]:
  195. _dict = dict()
  196. for _key in item.keys():
  197. _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
  198. _prob = getSumExpectation(list_entity, _dict)
  199. if _prob>MAX_PROB:
  200. MAX_PROB = _prob
  201. _MAX_PROB_COMBA = [item]
  202. new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
  203. _dict_legal_combination = new_dict_legal_combination
  204. #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
  205. _list_final_comba = circle_pageages(_dict_legal_combination)
  206. #除了Project包(招标人和代理人),其他包是不会有冲突的
  207. #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪
  208. _list_real_comba = []
  209. for dict_item in _list_final_comba:
  210. set_project = set()
  211. set_other = set()
  212. for _key in list(dict_item.keys()):
  213. if _key.split("$$")[0]=="Project":
  214. set_project.add(dict_item[_key])
  215. else:
  216. set_other.add(dict_item[_key])
  217. set_common = set_project&set_other
  218. if len(set_common)>0:
  219. dict_project = {}
  220. dict_not_project = {}
  221. for _key in list(dict_item.keys()):
  222. if dict_item[_key] in set_common:
  223. if str(_key.split("$$")[0])=="Project":
  224. dict_project[_key] = dict_item[_key]
  225. else:
  226. dict_not_project[_key] = dict_item[_key]
  227. else:
  228. dict_project[_key] = dict_item[_key]
  229. dict_not_project[_key] = dict_item[_key]
  230. _list_real_comba.append(dict_project)
  231. _list_real_comba.append(dict_not_project)
  232. else:
  233. _list_real_comba.append(dict_item)
  234. return _list_real_comba
  235. #计算合计期望
  236. def getSumExpectation(list_entity,combination,on_value=0.5):
  237. '''
  238. expect = 0
  239. for entity in list_entity:
  240. if entity.entity_type in ['org','company']:
  241. values = entity.values
  242. role_prob = float(values[int(entity.label)])
  243. _key = entity.packageName+"$$"+str(entity.label)
  244. if role_prob>on_value and str(entity.label)!="5":
  245. if _key in combination.keys() and combination[_key]==entity.entity_text:
  246. expect += math.pow(role_prob,4)
  247. else:
  248. expect -= math.pow(role_prob,4)
  249. '''
  250. #修改为同一个实体只取对应包-角色的最大的概率值
  251. expect = 0
  252. dict_entity_prob = {}
  253. for entity in list_entity:
  254. if entity.entity_type in ['org','company']:
  255. values = entity.values
  256. role_prob = float(values[int(entity.label)])
  257. _key = entity.packageName+"$$"+str(entity.label)
  258. if role_prob>=on_value and str(entity.label)!="5":
  259. if _key in combination.keys() and combination[_key]==entity.entity_text:
  260. _key_prob = _key+entity.entity_text
  261. if _key_prob in dict_entity_prob.keys():
  262. if dict_entity_prob[_key_prob]<role_prob:
  263. dict_entity_prob[_key_prob] = role_prob
  264. else:
  265. dict_entity_prob[_key_prob] = role_prob
  266. else:
  267. _key_prob = _key+entity.entity_text
  268. if _key_prob in dict_entity_prob.keys():
  269. if dict_entity_prob[_key_prob]>-role_prob:
  270. dict_entity_prob[_key_prob] = -role_prob
  271. else:
  272. dict_entity_prob[_key_prob] = -role_prob
  273. for _key in dict_entity_prob.keys():
  274. symbol = 1 if dict_entity_prob[_key]>0 else -1
  275. expect += symbol*math.pow(dict_entity_prob[_key],2)
  276. return expect
  277. def getRoleList(list_sentence,list_entity,on_value = 0.5):
  278. '''
  279. @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回
  280. @param:
  281. list_sentence:文章所有的sentence
  282. list_entity:文章所有的实体
  283. on_value:概率阈值
  284. @return:文章的角色list
  285. '''
  286. pack = getPackagesFromArticle(list_sentence)
  287. if pack is None:
  288. return None
  289. PackageList,PackageSet,dict_PackageCode = pack
  290. #拿到所有可能的情况
  291. dict_role_combination = {}
  292. #拿到各个实体的packageName,packageCode
  293. for entity in list_entity:
  294. if entity.entity_type in ['org','company']:
  295. values = entity.values
  296. role_prob = float(values[int(entity.label)])
  297. if role_prob>=on_value and str(entity.label)!="5":
  298. if str(entity.label) in ["0","1"]:
  299. packageName = "Project"
  300. else:
  301. if len(PackageSet)>1:
  302. packageName = getPackage(PackageList,entity.sentence_index,entity.end_index)
  303. if packageName is None:
  304. #continue
  305. packageName = "Project"
  306. else:
  307. packageName = "Project"
  308. find_flag = False
  309. role_name = dict_role_id.get(str(entity.label))
  310. if packageName in dict_PackageCode.keys():
  311. packageCode = dict_PackageCode[packageName]
  312. else:
  313. packageCode = ""
  314. entity.packageName = packageName
  315. entity.packageCode = packageCode
  316. entity.roleName = role_name
  317. if entity.packageName in dict_role_combination.keys():
  318. if str(entity.label) in dict_role_combination[entity.packageName].keys():
  319. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  320. else:
  321. dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
  322. else:
  323. dict_role_combination[entity.packageName] = {}
  324. #初始化空值
  325. roleIds = [0,1,2,3,4]
  326. for _roleId in roleIds:
  327. dict_role_combination[entity.packageName][str(_roleId)] = set([""])
  328. dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
  329. list_real_comba = get_legal_comba(list_entity,dict_role_combination)
  330. #拿到最大期望值的组合
  331. max_index = 0
  332. max_expect = -100
  333. _index = 0
  334. for item_combination in list_real_comba:
  335. expect = getSumExpectation(list_entity, item_combination)
  336. if expect>max_expect:
  337. max_index = _index
  338. max_expect = expect
  339. _index += 1
  340. RoleList = []
  341. RoleSet = set()
  342. if len(list_real_comba)>0:
  343. for _key in list_real_comba[max_index].keys():
  344. packageName = _key.split("$$")[0]
  345. label = _key.split("$$")[1]
  346. role_name = dict_role_id.get(str(label))
  347. entity_text = list_real_comba[max_index][_key]
  348. if packageName in dict_PackageCode.keys():
  349. packagecode = dict_PackageCode.get(packageName)
  350. else:
  351. packagecode = ""
  352. RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
  353. RoleSet.add(entity_text)
  354. return RoleList,RoleSet,PackageList,PackageSet
  355. def getPackagesFromArticle(list_sentence):
  356. '''
  357. @param:
  358. list_sentence:文章的句子list
  359. @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
  360. '''
  361. if len(list_sentence)==0:
  362. return None
  363. PackageList = []
  364. PackageSet = set()
  365. dict_packageCode = dict()
  366. package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
  367. package_N_name_pattern = re.compile("(分?包|标段|标|包|包组|项目)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十]){1,2},{1}")
  368. package_number_pattern = re.compile("((包|标[段号的包]|分?包|包组|项目)编?号?[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
  369. other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)') # 新正则识别标段
  370. number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十]{1,4}")
  371. package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z]+)")
  372. def changeIndexFromWordToWords(tokens,word_index):
  373. '''
  374. @summary:转换某个字的字偏移为词偏移
  375. '''
  376. before_index = 0
  377. after_index = 0
  378. for i in range(len(tokens)):
  379. after_index = after_index+len(tokens[i])
  380. if before_index<=word_index and after_index>=word_index:
  381. return i
  382. before_index = after_index
  383. package_names = []
  384. def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
  385. '''
  386. @summary:抽取包附近的标段号
  387. @param:
  388. tokens:包所在句子的分词
  389. word_index:包所在字偏移
  390. size:左右各取多少个词
  391. pattern:提取标段号的正则
  392. @return: type:string,meaning:标段号
  393. '''
  394. index = changeIndexFromWordToWords(tokens,word_index)
  395. if index<size:
  396. begin = index
  397. else:
  398. begin = index-size
  399. if index+size>len(tokens):
  400. end = len(tokens)
  401. else:
  402. end = index+size
  403. #拿到左右两边的词语组成短语
  404. text = "".join(tokens[begin:end])
  405. #在短语中的字偏移
  406. new_word_index = word_index-len("".join(tokens[:begin]))
  407. min_distance = len(text)
  408. packageCode = None
  409. for the_iter in re.finditer(pattern,text):
  410. #算出最小距离
  411. distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
  412. if distance<min_distance:
  413. min_distance = distance
  414. packageCode = the_iter.group(1)
  415. return packageCode
  416. #从标段介绍表格中提取包名和包号
  417. for i in range(len(list_sentence)):
  418. content = list_sentence[i].sentence_text
  419. names = re.findall(package_name_pattern,content)
  420. if names == []:
  421. names = re.findall(other_package_pattern, content)
  422. N_names = re.findall(package_N_name_pattern,content)
  423. if len(names)==1 and len(N_names)==1:
  424. package_names.append([names[0][-1],N_names[0][-1]])
  425. for i in range(len(list_sentence)):
  426. PackageList_item = []
  427. content = list_sentence[i].sentence_text
  428. tokens = list_sentence[i].tokens
  429. for name in package_names:
  430. for index in findAllIndex(name[0],content):
  431. temp_package_number = re.findall(number_pattern,name[1])[0]
  432. PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index])
  433. code = extractPackageCode(tokens, index)
  434. if code is not None:
  435. dict_packageCode[temp_package_number] = code
  436. PackageSet.add(temp_package_number)
  437. for iter in re.finditer(package_number_pattern,content):
  438. temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
  439. PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0]])
  440. code = extractPackageCode(tokens, iter.span()[0])
  441. if code is not None:
  442. dict_packageCode[temp_package_number] = code
  443. PackageSet.add(temp_package_number)
  444. if PackageList_item == []: # 原有正则没有识别到标段和包号时增加以下正则识别
  445. for iter in re.finditer(other_package_pattern,content):
  446. temp_package_number = iter.group(2)
  447. PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0]])
  448. code = extractPackageCode(tokens, iter.span()[0])
  449. if code is not None:
  450. dict_packageCode[temp_package_number] = code
  451. PackageSet.add(temp_package_number)
  452. PackageList_item.sort(key=lambda x:x[2])
  453. PackageList = PackageList+PackageList_item
  454. pattern_punctuation = "[::()\(\),,。;;]"
  455. for i in range(len(list_sentence)):
  456. for j in range(len(PackageList)):
  457. if i==PackageList[j][1]:
  458. _flag = False
  459. left_str = list_sentence[i].sentence_text[PackageList[j][3]-30:PackageList[j][3]]
  460. right_str = list_sentence[i].sentence_text[PackageList[j][3]:PackageList[j][3]+30]
  461. _left_find = re.findall(pattern_punctuation,left_str)
  462. _right_find = re.findall(pattern_punctuation,right_str)
  463. if len(_left_find)>0 and _left_find[-1] in [":",":"]:
  464. _flag = True
  465. if len(_right_find)>0 and _right_find[0] in [":",":"]:
  466. _flag = True
  467. PackageList[j].append(_flag)
  468. return PackageList,PackageSet,dict_packageCode
def findAttributeAfterEntity(roleList,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4):
    '''
    Attach money amounts and contact persons to the roles in roleList.

    The function works in two passes over the entities: the first one walks
    list_entity with a cursor and attaches money (and labelled contact
    persons), the second one walks the org/company/person entities and links
    contact persons to roles.

    @param:
        roleList:role list of the document; the items are read through packageName,
                 role_name, entity_text, money, money_prob, linklist and getString
        roleSet:entity names of all roles of the document
        PackageList:package information of the document
        PackageSet:names of all packages of the document
        list_entity:all entities of the document after model processing
        on_value:threshold of the money model
        on_value_person:threshold of the contact-person model
        sentence_len:maximum sentence distance between a company and its attributes
    @return:list of the truthy values returned by getString(roleList) of every role
    '''
    # Set the money of the role `roleid` of a package when the new probability is higher.
    def addMoneyByRoleid(RoleList,packageName,roleid,money,money_prob):
        for i in range(len(RoleList)):
            if RoleList[i].packageName==packageName and RoleList[i].role_name==dict_role_id.get(str(roleid)):
                if money_prob>RoleList[i].money_prob:
                    RoleList[i].money = money
                    RoleList[i].money_prob = money_prob
        return RoleList
    # Set the money of the role held by `entity` in a package when the new probability is higher.
    def addMoneyByEntity(RoleList,packageName,entity,money,money_prob):
        for i in range(len(RoleList)):
            if RoleList[i].packageName==packageName and RoleList[i].entity_text==entity:
                if money_prob>RoleList[i].money_prob:
                    RoleList[i].money = money
                    RoleList[i].money_prob = money_prob
        return RoleList
    # Role name of the first role whose entity text matches; None when there is none.
    def getRoleWithText(roleList,entity_text):
        for i in range(len(roleList)):
            if roleList[i].entity_text==entity_text:
                return roleList[i].role_name
    # True when the entity or one of its linked entities is a role entity; None otherwise.
    def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
        _list_entitys = [entity]+entity.linked_entitys
        for _entity in _list_entitys:
            if _entity.entity_text in RoleSet:
                return True
    p_entity = 0
    set_tenderer_role = set()
    set_tenderer_money = set()
    # First pass: walk over all entities with the cursor p_entity.
    while(p_entity<len(list_entity)):
        entity = list_entity[p_entity]
        if entity.entity_type=="money":
            if entity.values[entity.label]>=on_value:
                # Label 1 amounts are collected; they are used at the end when
                # exactly one amount and one win_tenderer exist.
                if str(entity.label)=="1":
                    set_tenderer_money.add(float(entity.entity_text))
                # Label 0 amounts go to role "0" (tenderee) of the Project package.
                if str(entity.label)=="0":
                    packageName = "Project"
                    addMoneyByRoleid(roleList, packageName, "0", entity.entity_text, entity.values[entity.label])
        if entity.entity_type=="person":
            if entity.values[entity.label]>=on_value_person:
                # Person label 1 -> contact of the tenderee, label 2 -> contact of the agency.
                if str(entity.label)=="1":
                    for i in range(len(roleList)):
                        if roleList[i].role_name=="tenderee":
                            roleList[i].linklist.append((entity.entity_text,entity.person_phone))
                elif str(entity.label)=="2":
                    for i in range(len(roleList)):
                        if roleList[i].role_name=="agency":
                            roleList[i].linklist.append((entity.entity_text,entity.person_phone))
        # If the entity belongs to the role set, look for its attributes in the entities that follow.
        if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
            p_entity += 1
            # Scan forward for matching attributes.
            while(p_entity<len(list_entity)):
                entity_after = list_entity[p_entity]
                # Too far away: step back so that the outer loop handles this entity.
                if entity_after.sentence_index-entity.sentence_index>=sentence_len:
                    p_entity -= 1
                    break
                # Stop at the next company entity; it is handled by the outer loop.
                if entity_after.entity_type in ['org','company']:
                    p_entity -= 1
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="money":
                        if entity_after.values[entity_after.label]>=on_value:
                            if str(entity_after.label)=="0":
                                packageName = "Project"
                                addMoneyByRoleid(roleList, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
                            elif str(entity_after.label)=="1":
                                # The amount is offered to the entity and to all its linked entities.
                                _list_entitys = [entity]+entity.linked_entitys
                                for _entity in _list_entitys:
                                    if getRoleWithText(roleList, _entity.entity_text) in ['tenderee','agency']:
                                        packageName_entity = "Project"
                                    else:
                                        if len(PackageSet)>1:
                                            packageName_entity = getPackage(PackageList,_entity.sentence_index,_entity.begin_index)
                                            if packageName_entity is None:
                                                continue
                                        else:
                                            packageName_entity = "Project"
                                    # Only entities labelled 2, 3 or 4 receive the amount.
                                    if str(_entity.label) in ["2","3","4"]:
                                        addMoneyByEntity(roleList, packageName_entity, _entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
                    '''
                    if entity_after.entity_type=="person":
                        if entity_after.values[entity_after.label]>=on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="tenderee":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="2":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="agency":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="3":
                                _list_entitys = [entity]+entity.linked_entitys
                                for _entity in _list_entitys:
                                    for i in range(len(roleList)):
                                        if roleList[i].entity_text==_entity.entity_text:
                                            if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                                break
                                            roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                    '''
                p_entity += 1
        p_entity += 1
    ''''''
    # Second pass: link contact persons to roles.
    # 1) If the first company entity is the tenderee, check whether the next entity is the agency;
    #    if so the contact persons are linked in staggered order.
    # 2) Look for contact persons further on in the same sentence.
    # 3) If nothing could be linked, look for contact persons in the whole document.
    temp_ent_list = [] # temporary list: label 0/1 role entities and label 3 persons, in document order
    other_person = [] # label 3 persons above the threshold
    link_person = [] # persons that were linked to a role
    other_ent = [] # label 0/1 entities that hold a role
    link_ent = [] # role entities that received a label 1/2 contact person
    found_person = False
    ent_list = []
    for entity in list_entity:
        if entity.entity_type in ['org','company','person']:
            ent_list.append(entity)
    #for list_index in range(len(ent_list)):
        #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
            #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
            #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
    for index in range(len(ent_list)):
        entity = ent_list[index]
        if entity.entity_type=="person":
            # Note the strict comparison here; the first pass uses >=.
            if entity.values[entity.label]>on_value_person:
                if str(entity.label)=="1":
                    for i in range(len(roleList)):
                        if roleList[i].role_name=="tenderee":
                            roleList[i].linklist.append((entity.entity_text,entity.person_phone))
                            link_person.append(entity.entity_text)
                            link_ent.append(roleList[i].entity_text)
                elif str(entity.label)=="2":
                    for i in range(len(roleList)):
                        if roleList[i].role_name=="agency":
                            roleList[i].linklist.append((entity.entity_text,entity.person_phone))
                            link_person.append(entity.entity_text)
                            link_ent.append(roleList[i].entity_text)
                elif str(entity.label)=="3":
                    #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
                    other_person.append(entity.entity_text)
                    temp_ent_list.append((entity.entity_text,entity.person_phone))
        #if entity.entity_text in roleSet:
        if entity.entity_text in set([ent.entity_text for ent in roleList]):
            if entity.label in [0,1]:
                other_ent.append(entity.entity_text)
                temp_ent_list.append((entity.entity_text, entity.label))
            # NOTE(review): the nesting of this loop was reconstructed from a flattened
            # listing; it is placed at role-entity level (not only labels 0/1) - confirm.
            for behind_index in range(index+1, len(ent_list)):
                entity_after = ent_list[behind_index]
                if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # only look for contact persons in the same sentence
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="person":
                        if entity_after.values[entity_after.label]>on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="tenderee":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
                                        link_ent.append(roleList[i].entity_text)
                            elif str(entity_after.label)=="2":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="agency":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
                                        link_ent.append(roleList[i].entity_text)
                            elif str(entity_after.label)=="3":
                                # A label 3 person is linked to the role entity it follows.
                                for i in range(len(roleList)):
                                    if roleList[i].entity_text==entity.entity_text:
                                        #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                            #break
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
    # Persons and label 0/1 role entities that are still unlinked.
    not_link_person = [person for person in other_person if person not in link_person]
    not_link_ent = [ent for ent in other_ent if ent not in link_ent]
    if len(not_link_person) > 0 and len(not_link_ent) > 0 :
        item = temp_ent_list
        # Pattern "label 0 entity, label 1 entity, person, person": swap the second and the
        # third item so that every entity is directly followed by a person.
        for i in range(len(item)):
            if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
                if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
                    item[i+1], item[i+2] = item[i+2], item[i+1]
        # Walk backwards and give every still empty role the item that directly follows it,
        # provided that item is an unlinked person.
        for i in range(len(item)-1, -1, -1):
            if item[i][0] in not_link_ent:
                for role in roleList:
                    if role.entity_text == item[i][0] and len(role.linklist) < 1:
                        for j in range(i+1, len(item)):
                            if item[j][0] in not_link_person:
                                role.linklist.append(item[j])
                                break
                            # NOTE(review): reconstructed as the else of the `if` above, so only
                            # item i+1 is ever examined; it could also have been a for-else - confirm.
                            else:
                                break
    # When the document has exactly one label 1 amount and exactly one win_tenderer,
    # that amount is assigned to it.
    for i in range(len(roleList)):
        if roleList[i].role_name=="win_tenderer":
            set_tenderer_role.add(roleList[i])
    if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
        list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
    # Original note: remove entities holding several roles, remove duplicated persons and do
    # not return probabilities - presumably done inside getString; verify against PREM.
    final_roleList = []
    for i in range(len(roleList)):
        item = roleList[i].getString(roleList)
        if item:
            final_roleList.append(item)
    return final_roleList
  685. def getPackageRoleMoney(list_sentence,list_entity):
  686. '''
  687. @param:
  688. list_sentence:文章的句子list
  689. list_entity:文章的实体list
  690. @return: 拿到文章的包-标段号-角色-实体名称-金额-联系人-联系电话
  691. '''
  692. theRole = getRoleList(list_sentence,list_entity)
  693. if not theRole:
  694. return []
  695. RoleList,RoleSet,PackageList,PackageSet = theRole
  696. RoleList = findAttributeAfterEntity(RoleList, RoleSet, PackageList, PackageSet, list_entity)
  697. return RoleList
  698. def getPREMs(list_sentences,list_entitys,list_articles):
  699. '''
  700. @param:
  701. list_sentence:所有文章的句子list
  702. list_entity:所有文章的实体list
  703. @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话
  704. '''
  705. result = []
  706. for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
  707. RoleList = getPackageRoleMoney(list_sentence,list_entity)
  708. result.append([list_article.id,{"prem":RoleList}])
  709. return result
  710. if __name__=="__main__":
  711. '''
  712. conn = getConnection()
  713. cursor = conn.cursor()
  714. #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
  715. sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
  716. result = []
  717. cursor.execute(sql)
  718. rows = cursor.fetchall()
  719. count = 0
  720. for row in rows:
  721. count += 1
  722. print(count)
  723. doc_id = row[0]
  724. roleList = getPackageRoleMoney(doc_id)
  725. result.append([doc_id,str(roleList),row[1]])
  726. ''''''
  727. with codecs.open("getAttribute.html","w",encoding="utf8") as f:
  728. f.write('<html><head>\
  729. <meta http-equiv="Content-Type"\
  730. content="text/html; charset=UTF-8">\
  731. </head>\
  732. <body bgcolor="#FFFFFF">\
  733. <table border="1">\
  734. <tr>\
  735. <td>doc_id</td>\
  736. <td>角色</td>\
  737. </tr>')
  738. for item in result:
  739. f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
  740. f.write("</table></body>")
  741. '''