|
@@ -1,1239 +0,0 @@
|
|
|
-from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat
|
|
|
-from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
|
|
|
-import re
|
|
|
-import copy
|
|
|
-import math
|
|
|
-import pandas as pd
|
|
|
-import os
|
|
|
-
|
|
|
def getTheRole(entity,role_list):
    '''
    @summary: find the position of the role entry that contains an entity name
    @param:
        entity: entity name to look for
        role_list: list of role entries; each entry is tested with ``in``
    @return: index of the first entry containing the entity, or None if no entry does
    '''
    matching_positions = (
        position
        for position, role_entry in enumerate(role_list)
        if entity in role_entry
    )
    return next(matching_positions, None)
|
|
|
-
|
|
|
# Role label (the class id as a string) -> role name.
dict_role_id = {
    str(_label): _role_name
    for _label, _role_name in enumerate((
        "tenderee",
        "agency",
        "win_tenderer",
        "second_tenderer",
        "third_tenderer",
    ))
}
|
|
|
-
|
|
|
def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
    '''
    @summary: pick the package whose scope covers an entity. Among all packages
              whose scope contains the entity position, the first one (in list
              order) that has not been hit by ``roleid`` yet is returned and
              marked as hit. If every covering package already has that roleid,
              the first covering package is returned. If none covers the
              entity, nothing is returned.
    @param:
        packageList: list of package dicts; this function reads the keys
                     "sentence_index", "offsetWords_begin", "scope"
                     ([[begin sentence, begin offset], [end sentence, end offset]]),
                     "hit" (a set of role ids) and "pointer"
        sentence_index: index of the sentence containing the entity
        begin_index: offset of the entity inside that sentence
        roleid: role identifier recorded in the package's "hit" set
        MAX_DIS: optional maximum sentence distance between package and entity
        DIRECT: "L" keeps only packages located at or before the entity,
                "R" keeps only packages located at or after it, anything else
                applies no direction filter
    @return: (pointer, flag). pointer is the "pointer" value of the chosen
             package or None; flag is True only when a package was returned
             although all covering packages already contained roleid
    '''
    if len(packageList)==0:
        return None,False

    list_legalPack = []
    for pack_index, pack in enumerate(packageList):
        pack_sentence = pack["sentence_index"]
        # "L": skip packages that start after the entity.
        if DIRECT=="L" and (pack_sentence>sentence_index or (pack_sentence==sentence_index and pack["offsetWords_begin"]>begin_index)):
            continue
        # "R": skip packages that start before the entity. The key was
        # misspelled "offsetwords_begin" here, which raised KeyError.
        if DIRECT=="R" and (pack_sentence<sentence_index or (pack_sentence==sentence_index and pack["offsetWords_begin"]<begin_index)):
            continue

        scope_begin = pack["scope"][0]
        scope_end = pack["scope"][1]
        covers_entity = (
            (scope_begin[0]<sentence_index or (scope_begin[0]==sentence_index and scope_begin[1]<=begin_index))
            and (scope_end[0]>sentence_index or (scope_end[0]==sentence_index and scope_end[1]>=begin_index))
        )
        if not covers_entity:
            continue
        if MAX_DIS is not None and abs(sentence_index-pack_sentence)>MAX_DIS:
            continue
        list_legalPack.append(pack_index)

    # Prefer the first covering package that does not hold this role yet.
    for _index in list_legalPack:
        if roleid not in packageList[_index]["hit"]:
            packageList[_index]["hit"].add(roleid)
            return packageList[_index]["pointer"],False

    # All covering packages already hold the role: fall back to the first
    # covering one (previously packageList[0], which may not cover the entity).
    if len(list_legalPack)>0:
        return packageList[list_legalPack[0]]["pointer"],True

    return None,False
|
|
|
-
|
|
|
-#生成合法的组合
|
|
|
def get_legal_comba(list_entity,dict_role_combination):
    '''
    @summary: build the legal (conflict-free) role assignments of an article.
              Assignments are first enumerated per package, then the packages
              are combined, and finally an entity that appears both in the
              "Project" package and in another package is split into two
              alternative assignments.
    @param:
        list_entity: entities of the article; only used (through
                     get_dict_entity_prob) when the number of combinations is
                     too large and has to be pruned
        dict_role_combination: {packageName: {role label: candidate entity texts}};
                               the empty string "" stands for "role not filled"
    @return: list of dicts mapping "<packageName>$$<role label>" to an entity text
    '''
    # Tenderee ("0") and agency ("1") may be the same entity.
    project_roles = ["0","1"]

    def circle_package(_dict_legal_combination):
        '''Enumerate all legal role->entity assignments inside one package.'''
        list_dict_role_first = []
        for _role in _dict_legal_combination:
            if len(list_dict_role_first)==0:
                # Seed with the first role that has a real candidate.
                for _entity in _dict_legal_combination[_role]:
                    if _entity !="":
                        list_dict_role_first.append({_role:_entity})
                continue

            role_is_project = str(_role) in project_roles
            list_dict_role_after = []
            for _entity in _dict_legal_combination[_role]:
                if _entity=="":
                    continue
                for _dict in list_dict_role_first:
                    _flag = True
                    for _key1, _assigned in _dict.items():
                        if _entity==_assigned:
                            # An entity may only repeat as tenderee + agency.
                            _flag = role_is_project and str(_key1) in project_roles
                    if _flag:
                        _new_dict = copy.copy(_dict)
                        _new_dict[_role] = _entity
                        # Safety cap; only leaves the loop over existing assignments.
                        if len(list_dict_role_after)>100000:
                            break
                        list_dict_role_after.append(_new_dict)
            list_dict_role_first.extend(list_dict_role_after)
        return list_dict_role_first

    def circle_pageages(_dict_legal_combination):
        '''Cartesian product of the per-package assignments, with prefixed keys.'''
        list_all_selution = []
        for _key_pack in _dict_legal_combination.keys():
            list_key_selution = []
            for item in _dict_legal_combination[_key_pack]:
                _dict = dict()
                for _key_role in item.keys():
                    _dict[_key_pack+"$$"+_key_role] = item[_key_role]
                list_key_selution.append(_dict)
            if len(list_all_selution)==0:
                list_all_selution = list_key_selution
            else:
                list_all_selution = [
                    dict(item_1,**item_2)
                    for item_1 in list_all_selution
                    for item_2 in list_key_selution
                ]
        return list_all_selution

    # Legal assignments of every package.
    _dict_legal_combination = {}
    for packageName in dict_role_combination.keys():
        _list_all_selution = circle_package(dict_role_combination[packageName])
        # Too many assignments: skip the quadratic subset elimination.
        if len(_list_all_selution)>1000:
            _dict_legal_combination[packageName] = _list_all_selution
            continue
        # Drop every assignment that is a proper subset of another one.
        _list_set_all_selution = [set(item_selution.items()) for item_selution in _list_all_selution]
        list_all_selution_simple = []
        for item_selution, item_set_selution in zip(_list_all_selution,_list_set_all_selution):
            be_included = any(item_set_selution<other_set for other_set in _list_set_all_selution)
            if not be_included:
                list_all_selution_simple.append(item_selution)
        _dict_legal_combination[packageName] = list_all_selution_simple

    # Number of combinations across packages.
    _comba_count = 1
    for _key in _dict_legal_combination.keys():
        _comba_count *= len(_dict_legal_combination[_key])

    # If there are too many, keep only the highest scoring assignment of each package.
    if _comba_count>250:
        dict_pack_entity_prob = get_dict_entity_prob(list_entity)
        new_dict_legal_combination = dict()
        for _key_pack in _dict_legal_combination.keys():
            MAX_PROB = -1000
            _MAX_PROB_COMBA = None
            for item in _dict_legal_combination[_key_pack]:
                _dict = dict()
                for _key in item.keys():
                    _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
                if _prob>MAX_PROB:
                    MAX_PROB = _prob
                    _MAX_PROB_COMBA = [item]
            if _MAX_PROB_COMBA is not None:
                new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
        _dict_legal_combination = new_dict_legal_combination

    _list_final_comba = circle_pageages(_dict_legal_combination)

    # Only the "Project" package (tenderee/agency) can clash with other
    # packages. When an entity shows up on both sides, emit one assignment
    # that keeps it in "Project" and one that keeps it in the other package.
    _list_real_comba = []
    for dict_item in _list_final_comba:
        set_project = set()
        set_other = set()
        for _key in list(dict_item.keys()):
            if _key.split("$$")[0]=="Project":
                set_project.add(dict_item[_key])
            else:
                set_other.add(dict_item[_key])
        set_common = set_project&set_other
        if len(set_common)==0:
            _list_real_comba.append(dict_item)
            continue
        dict_project = {}
        dict_not_project = {}
        for _key in list(dict_item.keys()):
            if dict_item[_key] in set_common:
                if str(_key.split("$$")[0])=="Project":
                    dict_project[_key] = dict_item[_key]
                else:
                    dict_not_project[_key] = dict_item[_key]
            else:
                dict_project[_key] = dict_item[_key]
                dict_not_project[_key] = dict_item[_key]
        _list_real_comba.append(dict_project)
        _list_real_comba.append(dict_not_project)

    return _list_real_comba
|
|
|
-
|
|
|
def get_dict_entity_prob(list_entity,on_value=0.5):
    '''
    @summary: collect, for every package/role/entity-text triple, the highest
              role probability seen among the org/company entities
    @param:
        list_entity: entities providing entity_type, values, label,
                     packageName and entity_text
        on_value: minimum role probability for an entity to be kept
    @return: {"<packageName>$$<label>$text$<entity_text>": [entity_text, probability]};
             entities with label "5" or a probability below on_value are left out
    '''
    best_by_key = {}
    for ent in list_entity:
        if ent.entity_type not in ['org','company']:
            continue
        probability = float(ent.values[int(ent.label)])
        pack_role_key = ent.packageName+"$$"+str(ent.label)
        if not (probability>=on_value and str(ent.label)!="5"):
            continue
        full_key = pack_role_key+"$text$"+ent.entity_text
        # Keep the first occurrence, then only strictly better probabilities.
        if full_key not in best_by_key or probability>best_by_key[full_key][1]:
            best_by_key[full_key] = [ent.entity_text,probability]
    return best_by_key
|
|
|
-
|
|
|
-
|
|
|
-#计算合计期望
|
|
|
def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
    '''
    @summary: score a role assignment. Every known package/role/entity entry
              contributes +probability**2 when the assignment gives that
              package/role to that entity text, and -probability**2 otherwise.
    @param:
        dict_pack_entity_prob: {"<packageName>$$<label>$text$<entity_text>": [entity_text, probability]}
                               as produced by get_dict_entity_prob
        combination: {"<packageName>$$<label>": entity_text} assignment to score
        on_value: unused, kept for backward compatibility of the signature
    @return: the summed score (0 when dict_pack_entity_prob is empty)
    '''
    expect = 0
    for _key_pack_entity, entity_prob in dict_pack_entity_prob.items():
        # The part before "$text$" is the "<packageName>$$<label>" key.
        _key_pack = _key_pack_entity.split("$text$")[0]
        role_prob = entity_prob[1]
        if _key_pack in combination and combination[_key_pack]==entity_prob[0]:
            signed_prob = role_prob
        else:
            signed_prob = -role_prob
        # Squaring loses the sign, so it is re-applied explicitly.
        symbol = 1 if signed_prob>0 else -1
        expect += symbol*math.pow(signed_prob,2)
    return expect
|
|
|
-
|
|
|
-
|
|
|
def getRoleList(list_sentence,list_entity,on_value = 0.5):
    '''
    @summary: enumerate every non-contradictory role assignment and return the
              one with the highest summed expectation
    @param:
        list_sentence: all sentences of the article
        list_entity: all entities of the article
        on_value: probability threshold for a role to be considered
    @return: None when getPackagesFromArticle returns None, otherwise the tuple
             (RoleList, RoleSet, PackageList, PackageSet)
    '''
    pack = getPackagesFromArticle(list_sentence,list_entity)
    if pack is None:
        return None
    PackageList,PackageSet,dict_PackageCode = pack

    # Candidate entity texts per package and per role label.
    dict_role_combination = {}
    for entity in list_entity:
        if entity.entity_type not in ['org','company']:
            continue
        # Ignore entities whose text has three characters or fewer.
        if len(entity.entity_text)<=3:
            continue
        label = str(entity.label)
        role_prob = float(entity.values[int(entity.label)])
        if not (role_prob>=on_value and label!="5"):
            continue

        # Tenderee/agency always belong to "Project"; other roles fall back
        # to "Project" when no package covers them.
        packageName = "Project"
        if label not in ["0","1"] and len(PackageSet)>0:
            packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.end_index,"role-"+label)
            if packagePointer is not None:
                # Link the entity to its package entity.
                entity.pointer_pack = packagePointer
                packageName = packagePointer.entity_text

        entity.packageCode = dict_PackageCode.get(packageName,"")
        entity.roleName = dict_role_id.get(label)
        entity.packageName = packageName

        pack_key = entity.packageName
        if pack_key in dict_role_combination:
            role_candidates = dict_role_combination[pack_key]
            if label in role_candidates:
                role_candidates[label].add(entity.entity_text)
            else:
                role_candidates[label] = set([entity.entity_text])
        else:
            # Every role starts with the "not filled" candidate "".
            role_candidates = {}
            dict_role_combination[pack_key] = role_candidates
            for _roleId in [0,1,2,3,4]:
                role_candidates[str(_roleId)] = set([""])
            role_candidates[label].add(entity.entity_text)

    list_real_comba = get_legal_comba(list_entity,dict_role_combination)

    # Pick the assignment with the highest expectation (first one wins ties).
    max_index = 0
    max_expect = -100
    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    for _index, item_combination in enumerate(list_real_comba):
        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
        if expect>max_expect:
            max_index = _index
            max_expect = expect

    RoleList = []
    RoleSet = set()
    if len(list_real_comba)>0:
        best_combination = list_real_comba[max_index]
        for _key in best_combination.keys():
            key_parts = _key.split("$$")
            packageName = key_parts[0]
            role_name = dict_role_id.get(str(key_parts[1]))
            entity_text = best_combination[_key]
            packagecode = dict_PackageCode.get(packageName) if packageName in dict_PackageCode.keys() else ""
            RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
            RoleSet.add(entity_text)

    # Drop package links that the chosen assignment does not confirm.
    for _entity in list_entity:
        if _entity.pointer_pack is None:
            continue
        _pack_name = _entity.pointer_pack.entity_text
        confirmed = False
        for _prem in RoleList:
            if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
                confirmed = True
        if not confirmed:
            _entity.pointer_pack = None

    return RoleList,RoleSet,PackageList,PackageSet
|
|
|
-
|
|
|
def getPackageScopePattern():
    '''
    @summary: build the regular expression of package-scope keywords from the
              "list_word" column of end.xls, located next to this file
    @return: pattern string "(<word>|<word>|...)[::是为]|业绩..."; when the
             column is empty only the fixed "业绩..." alternative is returned
    '''
    # os.path.join avoids resolving to "/end.xls" when dirname(__file__) is "".
    df = pd.read_excel(os.path.join(os.path.dirname(__file__),"end.xls"))
    # Same characters as before are escaped: ( ) . [ ] -
    escapes = str.maketrans({char:"\\"+char for char in "().[]-"})
    words = [str(item).translate(escapes) for item in df["list_word"]]
    fixed_alternative = "业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
    if not words:
        # The old code produced an unbalanced ")" here, i.e. an invalid regex.
        return fixed_alternative
    return "("+"|".join(words)+")[::是为]|"+fixed_alternative

# Built once at import time; reads end.xls.
pattern_packageScope = getPackageScopePattern()
|
|
|
-
|
|
|
def getPackagesFromArticle(list_sentence,list_entity):
    '''
    @summary: find the package (bid section) mentions of an article, compute the
              scope of each one and append a "package" Entity per kept mention
              to list_entity
    @param:
        list_sentence: sentences of the article (sentence_text, tokens,
                       sentence_index and doc_id are read)
        list_entity: entities of the article; package entities are appended to it
    @return: None when list_sentence is empty, otherwise the tuple
             (PackageList, PackageSet, dict_packageCode):
             PackageList - package dicts with the keys name, sentence_index,
                           offsetWords_begin, offsetWord_begin, offsetWord_end,
                           scope, hit and pointer
             PackageSet - set of package names found
             dict_packageCode - {package name: code found near a mention}
    '''
    if len(list_sentence)==0:
        return None

    PackageList = []
    PackageList_scope = []
    PackageSet = set()
    dict_packageCode = dict()

    package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
    package_N_name_pattern = re.compile(r"[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
    package_number_pattern = re.compile(r"(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
    # Rules added on 2020/11/23 for large sites.
    other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{2,50}?)[,。]')
    win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{2,25})[,。]')
    model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]')
    number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
    package_code_pattern = re.compile(r"(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")

    def changeIndexFromWordToWords(tokens,word_index):
        '''
        @summary: convert a character offset into the index of the token that
                  covers it; returns None when no token covers the offset
        '''
        before_index = 0
        after_index = 0
        for token_position, token in enumerate(tokens):
            after_index = after_index+len(token)
            if before_index<=word_index and after_index>=word_index:
                return token_position
            before_index = after_index
        return None

    def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
        '''
        @summary: extract the code closest to a package mention
        @param:
            tokens: tokens of the sentence containing the mention
            word_index: character offset of the mention
            size: number of tokens taken on each side of the mention
            pattern: regular expression whose group 1 is the code
        @return: the code string, or None when nothing is found
        '''
        index = changeIndexFromWordToWords(tokens,word_index)
        if index is None:
            return None
        # Clamp the window to the sentence; the left side used to be dropped
        # entirely whenever fewer than `size` tokens preceded the mention.
        begin = max(index-size,0)
        end = min(index+size,len(tokens))
        text = "".join(tokens[begin:end])
        # Offset of the mention inside the window text.
        new_word_index = word_index-len("".join(tokens[:begin]))
        min_distance = len(text)
        packageCode = None
        for code_match in re.finditer(pattern,text):
            distance = min(abs(new_word_index-code_match.start()),abs(new_word_index-code_match.end()))
            if distance<min_distance:
                min_distance = distance
                packageCode = code_match.group(1)
        return packageCode

    def _mention(name,sentence,tokens,char_begin,char_end):
        '''Describe one mention (package, or scope keyword when name is "").'''
        return {"name":name,"sentence_index":sentence.sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,char_begin),"offsetWord_begin":char_begin,"offsetWord_end":char_end}

    def _register_package(package_items,name,sentence,tokens,char_begin,char_end):
        '''Record a package mention, its nearby code (if any) and its name.'''
        package_items.append(_mention(name,sentence,tokens,char_begin,char_end))
        code = extractPackageCode(tokens,char_begin)
        if code is not None:
            dict_packageCode[name] = code
        PackageSet.add(name)

    def _with_scope_keywords(package_items,sentence,content,tokens):
        '''Merge package mentions with the scope keywords of the sentence, ordered by offset.'''
        scope_items = [_mention("",sentence,tokens,scope_match.start(),scope_match.end()) for scope_match in re.finditer(pattern_packageScope,content)]
        merged = package_items+scope_items
        merged.sort(key=lambda x:x["offsetWord_begin"])
        return merged

    # Package name / number pairs taken from sentences that introduce exactly one of each.
    package_names = []
    for sentence in list_sentence:
        content = sentence.sentence_text
        names = re.findall(package_name_pattern,content)
        if names == []:
            names = re.findall(other_package_pattern, content)
        N_names = re.findall(package_N_name_pattern,content)
        if len(names)==1 and len(N_names)==1:
            package_names.append([names[0][-1],N_names[0][-1]])

    # Main pass: mentions by known package name and by package number.
    for sentence in list_sentence:
        content = sentence.sentence_text
        tokens = sentence.tokens
        PackageList_item = []
        for name in package_names[:20]:
            for index in findAllIndex(name[0],content):
                temp_package_number = re.findall(number_pattern,name[1])[0]
                _register_package(PackageList_item,temp_package_number,sentence,tokens,index,index+len(name[0]))
        for number_match in re.finditer(package_number_pattern,content):
            temp_package_number = re.findall(number_pattern,number_match.group())[0]
            _register_package(PackageList_item,temp_package_number,sentence,tokens,number_match.start(),number_match.end())
        PackageList_scope = PackageList_scope+_with_scope_keywords(PackageList_item,sentence,content,tokens)

    # Fallback (2020/11/23, large sites): no package found but more than one
    # distinct winner text -> use item names as package names.
    if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
        for sentence in list_sentence:
            content = sentence.sentence_text
            tokens = sentence.tokens
            names = re.findall(other_package_pattern, content)
            N_names = re.findall(win_tenderer_pattern, content)
            if len(names) != 1 or len(N_names) != 1:
                continue
            PackageList_item = []
            for name_match in re.finditer(other_package_pattern,content):
                temp_package_number = name_match.group(4)
                xinghao = re.search(model_pattern, content)
                if xinghao:
                    temp_package_number = temp_package_number + '+' + xinghao.group(2)
                _register_package(PackageList_item,temp_package_number,sentence,tokens,name_match.start(),name_match.end())
            PackageList_scope = PackageList_scope+_with_scope_keywords(PackageList_item,sentence,content,tokens)

    pattern_punctuation = r"[::()\(\),,。;;]"
    for i in range(len(list_sentence)):
        sentence_text = list_sentence[i].sentence_text
        sentence_tokens = list_sentence[i].tokens
        for j in range(len(PackageList_scope)):
            current = PackageList_scope[j]
            if i!=current["sentence_index"] or current["name"]=="":
                continue
            char_begin = current["offsetWord_begin"]
            # max(..., 0): a negative start would wrap around to the end of the
            # sentence and lose the left context of early mentions.
            left_str = sentence_text[max(char_begin-30,0):char_begin+1]
            right_str = sentence_text[char_begin:char_begin+30]
            _left_find = re.findall(pattern_punctuation,left_str)
            _right_find = re.findall(pattern_punctuation,right_str)
            # A mention starting with "同" and named "一" is skipped.
            if re.search("同",left_str[-1:]) is not None and current["name"]=="一":
                continue
            # A mention followed by "划分" within 10 characters is skipped.
            if re.search("划分",right_str[:10]) is not None:
                continue

            # A colon as the nearest punctuation on either side makes the scope
            # start at the mention itself; otherwise it starts at the previous item.
            _flag = False
            if len(_left_find)>0 and _left_find[-1] in [":",":"]:
                _flag = True
            if len(_right_find)>0 and _right_find[0] in [":",":"]:
                _flag = True
            if _flag:
                scope_begin = [current["sentence_index"],current["offsetWords_begin"]]
            elif j==0:
                scope_begin = [0,0]
            else:
                scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]

            if j==len(PackageList_scope)-1:
                # NOTE(review): the first element is a token offset although a
                # sentence index is expected here (compare the branch below).
                # Kept unchanged because the intended end of the last scope is
                # not clear from this file - confirm.
                scope_end = [current["offsetWords_begin"],changeIndexFromWordToWords(sentence_tokens, len(sentence_text))]
            else:
                scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]

            # Skip a mention whose span lies inside the previous item's span.
            # j>0: index -1 would wrap to the last item, and a lone mention
            # would be compared with itself and always be dropped.
            if j>0:
                previous = PackageList_scope[j-1]
                if previous["sentence_index"]==current["sentence_index"] and previous["offsetWord_begin"]<=current["offsetWord_begin"] and previous["offsetWord_end"]>=current["offsetWord_end"]:
                    continue

            # Add the package as an entity.
            # NOTE(review): entity_id repeats the begin offset twice; kept as is.
            _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,current["offsetWord_begin"],current["offsetWord_begin"]),entity_text=current["name"],entity_type="package",sentence_index=current["sentence_index"],begin_index=changeIndexFromWordToWords(sentence_tokens,current["offsetWord_begin"]),end_index=changeIndexFromWordToWords(sentence_tokens,current["offsetWord_end"]),wordOffset_begin=current["offsetWord_begin"],wordOffset_end=current["offsetWord_end"])
            list_entity.append(_pack_entity)

            copy_pack = copy.copy(current)
            copy_pack["scope"] = [scope_begin,scope_end]
            copy_pack["hit"] = set()
            copy_pack["pointer"] = _pack_entity
            PackageList.append(copy_pack)
    return PackageList,PackageSet,dict_packageCode
|
|
|
-
|
|
|
-
|
|
|
def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,on_value = 0.5,on_value_person=0.5,sentence_len=4):
    '''
    Attach money amounts and contact persons to the roles stored in PackDict.

    The function makes several passes over list_entity:
      1. links "money" entities to organisation/company entities (backwards
         from the money entity, and forwards from an entity found in roleSet);
      2. links "person" entities to roles as (name, phone) tuples in linklist;
      3. walks the entities from last to first to fill "tendereeMoney";
      4. fills in the money of win_tenderer roles whose money is still 0,
         converts every role with getString() and drops empty packages.

    @param:
        PackDict: per-document package dict; each value holds "code",
            "tendereeMoney" and "roleList"
        roleSet: names of all companies that hold a role in the document
        PackageList: package information of the document
        PackageSet: names of all packages in the document
        list_entity: all entities of the document after model processing
        on_value: probability threshold for the money model
        on_value_person: probability threshold for the contact-person model
        sentence_len: maximum sentence distance between a company and its attribute
    @return: PackDict (mutated in place); every roleList element is replaced by
        the result of its getString(), and packages with empty code, zero
        tendereeMoney and no roles are removed
    '''
    # NOTE(review): this block was recovered from a rendering that had lost its
    # indentation; the nesting below is reconstructed from the control flow.
    # Spots where more than one nesting was plausible are marked NOTE(review).

    # add money to the roleList, matching the role by role id
    # NOTE(review): no live call to this helper is visible in this function.
    def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
        for i in range(len(packDict[packageName]["roleList"])):
            if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
                # only overwrite when the new amount is more probable
                if money_prob>packDict[packageName]["roleList"][i].money_prob:
                    packDict[packageName]["roleList"][i].money = money
                    packDict[packageName]["roleList"][i].money_prob = money_prob
        return packDict

    # add money to the roleList, matching the role by entity text
    def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
        for i in range(len(packDict[packageName]["roleList"])):
            if packDict[packageName]["roleList"][i].entity_text==entity:
                # only overwrite when the new amount is more probable
                if money_prob>packDict[packageName]["roleList"][i].money_prob:
                    packDict[packageName]["roleList"][i].money = money
                    packDict[packageName]["roleList"][i].money_prob = money_prob
        return packDict

    # get the role name of the first role whose entity text matches
    # NOTE(review): no live call to this helper is visible in this function.
    def getRoleWithText(packDict,entity_text):
        for pack in packDict.keys():
            for i in range(len(packDict[pack]["roleList"])):
                if packDict[pack]["roleList"][i].entity_text==entity_text:
                    return packDict[pack]["roleList"][i].role_name

    # True when the entity itself or one of its linked entities is in RoleSet;
    # falls off the end (returns None) otherwise
    def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
        _list_entitys = [entity]+entity.linked_entitys
        for _entity in _list_entitys:
            if _entity.entity_text in RoleSet:
                return True

    p_entity = 0

    # iterate over all entities
    while(p_entity<len(list_entity)):
        entity = list_entity[p_entity]
        # the two string literals below are disabled code kept as inert expressions
        '''
        #招标金额从后往前找
        if entity.entity_type=="money":
            if entity.values[entity.label]>=on_value:
                if str(entity.label)=="0":
                    packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
                    if packagePointer is None:
                        packageName = "Project"
                    else:
                        packageName = packagePointer.entity_text
                    addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
        '''
        ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
        if entity.entity_type=="person":
            if entity.values[entity.label]>=on_value_person:
                if str(entity.label)=="1":
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="tenderee":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                    # add pointer_person
                    for _entity in list_entity:
                        if dict_role_id.get(str(_entity.label))=="tenderee":
                            for i in range(len(PackDict["Project"]["roleList"])):
                                if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
                                    _entity.pointer_person = entity
                elif str(entity.label)=="2":
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="agency":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                    # add pointer_person
                    for _entity in list_entity:
                        if dict_role_id.get(str(_entity.label))=="agency":
                            for i in range(len(PackDict["Project"]["roleList"])):
                                if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
                                    _entity.pointer_person = entity
        '''
        # a money entity looks backwards for an organisation/company entity
        if entity.entity_type=="money":
            if entity.values[entity.label]>=on_value:
                p_entity_money= p_entity
                entity_money = list_entity[p_entity_money]
                # resolve the package the money belongs to; "Project" is the fallback
                if len(PackageSet)>0:
                    packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
                    if packagePointer is None:
                        packageName_entity = "Project"
                    else:
                        packageName_entity = packagePointer.entity_text
                else:
                    packageName_entity = "Project"
                # NOTE(review): the condition is ">0", so list_entity[0] is never
                # examined by this backward scan - confirm this is intended.
                while(p_entity_money>0):
                    entity_before = list_entity[p_entity_money]
                    if entity_before.entity_type in ['org','company']:
                        if str(entity_before.label)=="1":
                            addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
                            #add pointer_money
                            entity_before.pointer_money = entity_money
                        # NOTE(review): break is placed at the org/company level
                        # (stop at the first organisation found) - confirm nesting.
                        break
                    p_entity_money -= 1


        # if the entity belongs to the role set, look forward for its attributes
        if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):

            p_entity += 1
            # loop over the following entities looking for matching attributes
            while(p_entity<len(list_entity)):

                entity_after = list_entity[p_entity]
                # too many sentences away: step back so the outer loop revisits this entity
                if entity_after.sentence_index-entity.sentence_index>=sentence_len:
                    p_entity -= 1
                    break
                # stop when another company entity is met (step back so it is revisited)
                if entity_after.entity_type in ['org','company']:
                    p_entity -= 1
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="money":
                        if entity_after.values[entity_after.label]>=on_value:
                            '''
                            #招标金额从后往前找
                            if str(entity_after.label)=="0":
                                packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
                                if packagePointer is None:
                                    packageName = "Project"
                                else:
                                    packageName = packagePointer.entity_text
                                addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
                            '''
                            if str(entity_after.label)=="1":
                                #print(entity_after.entity_text,entity.entity_text)
                                # NOTE(review): _list_entitys is assigned but not read afterwards
                                _list_entitys = [entity]+entity.linked_entitys
                                if len(PackageSet)>0:
                                    packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
                                    if packagePointer is None:
                                        packageName_entity = "Project"
                                    else:
                                        packageName_entity = packagePointer.entity_text
                                else:
                                    packageName_entity = "Project"
                                # labels "2"/"3"/"4" map to win/second/third tenderer in dict_role_id
                                if str(entity.label) in ["2","3","4"]:
                                    addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
                                    #add pointer_money
                                    entity.pointer_money = entity_after
                    '''
                    if entity_after.entity_type=="person":
                        if entity_after.values[entity_after.label]>=on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="tenderee":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="2":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="agency":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="3":
                                _list_entitys = [entity]+entity.linked_entitys
                                for _entity in _list_entitys:
                                    for i in range(len(roleList)):
                                        if roleList[i].entity_text==_entity.entity_text:
                                            if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                                break
                                            roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                    '''

                p_entity += 1

        p_entity += 1

    ''''''
    # Contact linking strategy (original author's note):
    # 1) if the first company entity is the tenderee, check whether the next entity
    #    is the agency; if so the contacts are linked with an offset.
    # 2) look forward for contacts within the same sentence.
    # 3) if that fails, look for contacts across the whole document.
    temp_ent_list = [] # temporary list: entities labelled 0/1 and label-3 contacts, in document order
    other_person = [] # names of label-3 contacts above the threshold
    link_person = [] # names of persons that have been linked to a role
    other_ent = [] # texts of role entities labelled 0 or 1
    link_ent = [] # texts of role entities that received a contact
    found_person = False # NOTE(review): assigned but never read below
    ent_list = []
    for entity in list_entity:
        if entity.entity_type in ['org','company','person']:
            ent_list.append(entity)
    #for list_index in range(len(ent_list)):
        #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
           #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
            #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
    # 2020/11/25: names of persons whose role is already certain (label 1 or 2)
    sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])

    for index in range(len(ent_list)):
        entity = ent_list[index]
        if entity.entity_type=="person":
            if str(entity.label) == "0": # 2020/11/25: not a contact person, skip
                continue
            if entity.values[entity.label]>on_value_person:
                if str(entity.label)=="1":
                    # label "1": attach the person to every tenderee role of "Project"
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="tenderee":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                            link_person.append(entity.entity_text)
                            link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
                    # add pointer_person
                    for _entity in list_entity:
                        if dict_role_id.get(str(_entity.label))=="tenderee":
                            for i in range(len(PackDict["Project"]["roleList"])):
                                if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
                                    _entity.pointer_person = entity
                elif str(entity.label)=="2":
                    # label "2": attach the person to every agency role of "Project"
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="agency":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                            link_person.append(entity.entity_text)
                            link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
                    # add pointer_person
                    for _entity in list_entity:
                        if dict_role_id.get(str(_entity.label))=="agency":
                            for i in range(len(PackDict["Project"]["roleList"])):
                                if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
                                    _entity.pointer_person = entity
                elif str(entity.label)=="3":
                    if entity.entity_text in sure_person_set: # 2020/11/25: skip contacts whose role is already certain
                        continue
                    #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
                    other_person.append(entity.entity_text)
                    temp_ent_list.append((entity.entity_text,entity.person_phone,entity))

        #if entity.entity_text in roleSet:
        if entity.entity_text in roleSet:
            if entity.label in [0,1]:
                other_ent.append(entity.entity_text)
                temp_ent_list.append((entity.entity_text, entity.label,entity))
            # NOTE(review): the forward scan is placed at the roleSet level, not
            # under the label check above - confirm nesting.
            for behind_index in range(index+1, len(ent_list)):
                entity_after = ent_list[behind_index]
                if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']: # only look for contacts within the same sentence
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="person":
                        if str(entity_after.label) == "0": # 2020/11/25: a non-contact person follows the role, stop looking further
                            break
                        if entity_after.values[entity_after.label]>on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(PackDict["Project"]["roleList"])):
                                    if PackDict["Project"]["roleList"][i].role_name=="tenderee":
                                        PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
                                        link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
                            elif str(entity_after.label)=="2":
                                for i in range(len(PackDict["Project"]["roleList"])):
                                    if PackDict["Project"]["roleList"][i].role_name=="agency":
                                        PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                        link_person.append(entity_after.entity_text)
                                        link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
                            elif str(entity_after.label)=="3":
                                if entity_after.entity_text in sure_person_set: # 2020/11/25: the name already belongs to a contact with a certain role, stop looking further
                                    break
                                elif entity_after.begin_index - entity.end_index > 30:#2020/10/25: stop when the role entity and the contact are further apart than the threshold
                                    break
                                # link the contact to every role, in any package, with the same entity text
                                for pack in PackDict.keys():
                                    for i in range(len(PackDict[pack]["roleList"])):
                                        if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
                                            #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                                #break
                                            PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                                            link_person.append(entity_after.entity_text)
                                            #add pointer_person
                                            entity.pointer_person = entity_after

    # fallback: pair still-unlinked role entities with still-unlinked contacts by document order
    not_link_person = [person for person in other_person if person not in link_person]
    not_link_ent = [ent for ent in other_ent if ent not in link_ent]
    if len(not_link_person) > 0 and len(not_link_ent) > 0 :
        item = temp_ent_list
        # pattern "label-0 entity, label-1 entity, person, person": swap the middle
        # two so that each entity is directly followed by one person
        for i in range(len(item)):
            if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
                if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
                    item[i+1], item[i+2] = item[i+2], item[i+1]
        # walk backwards and give each unlinked entity the item that directly follows it
        for i in range(len(item)-1, -1, -1):
            if item[i][0] in not_link_ent:
                for pack in PackDict.keys():
                    for role in PackDict[pack]["roleList"]:
                        if role.entity_text == item[i][0] and len(role.linklist) < 1:
                            for j in range(i+1, len(item)):
                                if item[j][0] in not_link_person:
                                    role.linklist.append(item[j][:2])
                                    #add pointer_person
                                    item[i][2].pointer_person = item[j][2]
                                    break
                                # NOTE(review): "else" is paired with the "if" (only the
                                # immediately following item is tried) - confirm nesting.
                                else:
                                    break


    # find the tenderee money, including documents with several lots
    p_entity = len(list_entity)-1

    set_tenderer_money = set()
    # iterate over all entities, last to first
    while(p_entity>=0):
        entity = list_entity[p_entity]
        if entity.entity_type=="money":
            if entity.values[entity.label]>=on_value:
                if str(entity.label)=="1":
                    set_tenderer_money.add(float(entity.entity_text))
                if str(entity.label)=="0":
                    '''
                    if p_entity>0:
                        p_before = list_entity[p_entity-1]
                        if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
                            p_entity -= 1
                            continue
                    '''
                    packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
                    if packagePointer is None:
                        packageName = "Project"
                    else:
                        packageName = packagePointer.entity_text

                    if packageName=="Project":
                        # at project level keep the largest amount seen
                        if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                    else:
                        PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
                        #add pointer_tendereeMoney
                        # NOTE(review): kept inside the else branch, where packagePointer
                        # is not None - confirm nesting.
                        packagePointer.pointer_tendereeMoney = entity
        p_entity -= 1


    # original notes: "remove data where one organisation has several roles" and
    # "remove duplicate persons; probabilities are not returned"
    # NOTE(review): the code below does not visibly de-duplicate roles or persons.
    final_roleList = [] # NOTE(review): assigned but never read below
    list_pop = []
    set_tenderer_role = set()
    dict_pack_tenderer_money = dict()

    for pack in PackDict.keys():
        # mark invalid (empty) packages for removal
        if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
            list_pop.append(pack)
        for i in range(len(PackDict[pack]["roleList"])):
            if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
                if PackDict[pack]["roleList"][i].money==0:
                    set_tenderer_role.add(PackDict[pack]["roleList"][i])
                    # NOTE(review): placed under the money==0 check - confirm nesting.
                    dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
    # collect the winning-bid amounts recorded in each package's "hit" set
    for _index in range(len(PackageList)):
        if "hit" in PackageList[_index]:
            for _hit in list(PackageList[_index]["hit"]):
                # presumably hit strings look like "money-<amount>-<label>" as passed to
                # getPackage above; how "hit" is filled is not visible here - TODO confirm
                _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
                if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
                    dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
    # exactly one winning tenderer and exactly one winning amount
    if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
        list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
    # one winning tenderer and several amounts
    if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
        _maxMoney = 0
        _sumMoney = 0
        for _m in list(set_tenderer_money):
            _sumMoney += _m
            if _m>_maxMoney:
                _maxMoney = _m
        # NOTE(review): both branches assign _maxMoney, so the ratio test has no effect
        if _sumMoney/_maxMoney==2:
            list(set_tenderer_role)[0].money = _maxMoney
        else:
            list(set_tenderer_role)[0].money = _maxMoney
    # every package has exactly one amount
    _flag_pack_money = True
    for k,v in dict_pack_tenderer_money.items():
        if len(v[1])!=1:
            _flag_pack_money = False
    if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
        for k,v in dict_pack_tenderer_money.items():
            v[0].money = list(v[1])[0]
    # replace every role object by the result of its getString()
    for pack in PackDict.keys():
        for i in range(len(PackDict[pack]["roleList"])):
            PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()

    # drop the packages marked as invalid above
    for item in list_pop:
        PackDict.pop(item)

    return PackDict
|
|
|
-
|
|
|
def initPackageAttr(RoleList,PackageSet):
    '''
    Build the initial package dict returned by the interface.

    A "Project" entry plus one entry per name in PackageSet is created, each
    with an empty code, a tendereeMoney of 0 and an empty roleList. Every item
    of RoleList is then appended, as a fresh Role, to the roleList of the
    package named by item.packageName; the package code is taken from the
    first item seen while the code is still empty.

    @param:
        RoleList: items exposing packageName, packageCode, role_name and entity_text
        PackageSet: names of the packages found in the document
    @return: dict mapping package name -> {"code", "tendereeMoney", "roleList"}
    '''
    def _blank_package():
        # a new dict (and a new list) per package so that entries never share state
        return {"code":"","tendereeMoney":0,"roleList":[]}

    packDict = {"Project": _blank_package()}
    for package_name in list(PackageSet):
        packDict[package_name] = _blank_package()

    for role_item in RoleList:
        package = packDict[role_item.packageName]
        if package["code"] == "":
            package["code"] = role_item.packageCode
        package["roleList"].append(Role(role_item.role_name,role_item.entity_text,0,0,0.0,[]))
    return packDict
|
|
|
-
|
|
|
def getPackageRoleMoney(list_sentence,list_entity):
    '''
    Extract package / role / money / contact information for one document.

    @param:
        list_sentence: the sentences of the document
        list_entity: the entities of the document
    @return: the package dict produced by findAttributeAfterEntity, or an
        empty list when getRoleList yields nothing
    '''
    role_info = getRoleList(list_sentence,list_entity)
    if not role_info:
        return []
    role_list,role_set,package_list,package_set = role_info

    # initialise the per-package structure, then attach money and contacts to it
    initial_packages = initPackageAttr(role_list, package_set)
    return findAttributeAfterEntity(initial_packages, role_set, package_list, package_set, list_entity)
|
|
|
-
|
|
|
def getOtherAttributes(list_entity):
    '''
    Collect the document-level attributes that are not tied to a package or role.

    For "bidway", "moneysource" and "serviceTime" entities the text of the
    last such entity wins. "time" entities labelled 1, 2 and 3 fill
    time_release, time_bidopen and time_bidclose (passed through timeFormat).
    "person" entities labelled 4 are appended to person_review, and "product"
    texts are collected without duplicates.

    @param:
        list_entity: the entities of the document; each exposes entity_type,
            entity_text and (for time / person entities) label
    @return: dict with the keys bidway, moneysource, person_review,
        time_release, time_bidopen, time_bidclose, serviceTime and product
    '''
    dict_other = {"bidway":"",
                  "moneysource":"",
                  "person_review":[],
                  "time_release":"",
                  "time_bidopen":"",
                  "time_bidclose":"",
                  "serviceTime":"",
                  "product":[]}
    # entity types copied verbatim; the result key equals the entity type
    plain_types = ("bidway", "moneysource", "serviceTime")
    # label of a "time" entity -> result key
    time_keys = {1: "time_release", 2: "time_bidopen", 3: "time_bidclose"}

    for entity in list_entity:
        entity_type = entity.entity_type
        if entity_type in plain_types:
            dict_other[entity_type] = entity.entity_text
        elif entity_type == "time" and entity.label in time_keys:
            dict_other[time_keys[entity.label]] = timeFormat(entity.entity_text)
        elif entity_type == "person" and entity.label == 4:
            dict_other["person_review"].append(entity.entity_text)
        elif entity_type == "product":
            dict_other["product"].append(entity.entity_text)

    # De-duplicate while keeping first-occurrence order: list(set(...)) returned
    # the products in an order that could differ from one run to the next.
    dict_other["product"] = list(dict.fromkeys(dict_other["product"]))
    return dict_other
|
|
|
-
|
|
|
def getMoneyRange(RoleList):
    '''
    Placeholder that is not implemented: RoleList is ignored and None is returned.
    '''
    return None
|
|
|
-
|
|
|
def getPREMs(list_sentences,list_entitys,list_articles):
    '''
    Build one result record per document.

    @param:
        list_sentences: for every document, its list of sentences
        list_entitys: for every document, its list of entities
        list_articles: for every document, its article object (doc_id,
            fingerprint, match_enterprise and match_enterprise_type are read)
    @return: list of dicts, one per document, holding "prem" (the output of
        getPackageRoleMoney), "docid", the keys produced by getOtherAttributes,
        "fingerprint", "match_enterprise" and "match_enterprise_type"
    '''
    result = []
    # the three inputs are walked in lockstep; zip stops at the shortest one
    for sentences,entities,article in zip(list_sentences,list_entitys,list_articles):
        record = {"prem":getPackageRoleMoney(sentences,entities),"docid":article.doc_id}
        record.update(getOtherAttributes(entities))
        record["fingerprint"] = article.fingerprint
        record["match_enterprise"] = article.match_enterprise
        record["match_enterprise_type"] = article.match_enterprise_type
        result.append(record)
    return result
|
|
|
-
|
|
|
-
|
|
|
if __name__=="__main__":
    # The body below is only string literals (two adjacent triple-quoted strings),
    # so running this module as a script executes nothing. The text is a disabled
    # ad-hoc script that read rows from a database and wrote an HTML table to
    # "getAttribute.html".
    # NOTE(review): it cannot be re-enabled as is - getConnection and codecs are
    # not among the imports at the top of this file, and it calls
    # getPackageRoleMoney with a single doc_id argument while the function
    # takes (list_sentence, list_entity).
    '''
    conn = getConnection()
    cursor = conn.cursor()
    #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
    sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "

    result = []

    cursor.execute(sql)
    rows = cursor.fetchall()
    count = 0
    for row in rows:

        count += 1
        print(count)
        doc_id = row[0]

        roleList = getPackageRoleMoney(doc_id)
        result.append([doc_id,str(roleList),row[1]])
    ''''''
    with codecs.open("getAttribute.html","w",encoding="utf8") as f:
        f.write('<html><head>\
        <meta http-equiv="Content-Type"\
        content="text/html; charset=UTF-8">\
        </head>\
        <body bgcolor="#FFFFFF">\
        <table border="1">\
        <tr>\
        <td>doc_id</td>\
        <td>角色</td>\
        </tr>')
        for item in result:
            f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
        f.write("</table></body>")
    '''
|