|
@@ -888,7 +888,10 @@ def dispatch(match_list):
|
|
|
main_roles = list(set([match.main_role for match in match_list]))
|
|
|
# print('main_roles',[i.entity_text for i in main_roles])
|
|
|
attributes = list(set([match.attribute for match in match_list]))
|
|
|
- # print('attributes',[i.entity_text for i in attributes])
|
|
|
+ # try:
|
|
|
+ # print('attributes',[i.entity_text for i in attributes])
|
|
|
+ # except:
|
|
|
+ # pass
|
|
|
|
|
|
label = np.zeros(shape=(len(main_roles), len(attributes)))
|
|
|
for match in match_list:
|
|
@@ -907,7 +910,7 @@ def dispatch(match_list):
|
|
|
from BiddingKG.dl.common.Utils import getUnifyMoney
|
|
|
from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
|
|
|
relationExtraction_model = Model_relation_extraction()
|
|
|
-def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,on_value = 0.5,on_value_person=0.5,sentence_len=4):
|
|
|
+def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,winter_scope,on_value = 0.5,on_value_person=0.5,sentence_len=4):
|
|
|
'''
|
|
|
@param:
|
|
|
PackDict:文章包dict
|
|
@@ -1286,6 +1289,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
agency_contact = set()
|
|
|
agency_phone = set()
|
|
|
winter_contact = set()
|
|
|
+ rule_winter_phone = set()
|
|
|
for _person in person_list:
|
|
|
if _person.label == 1:
|
|
|
tenderee_contact.add(_person.entity_text)
|
|
@@ -1305,22 +1309,39 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' \
|
|
|
'[2-9]\d{6,7})'
|
|
|
re_tenderee_phone = re.compile(
|
|
|
- "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
|
|
|
+ # "(?:(?:(?:采购|招标|议价|议标|比选|业主|委托)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)(?:单位)?[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
|
|
|
+ "(?:(?:(?:遴选|寻源|采购|招标|竞价|议价|比选|(?:[^受被]|^)委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)"
|
|
|
+ "(?:人|方|商|单位|组织|用户|业主|主体|部门|公司|企业))(?:单位)?[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
|
|
|
# 电话号码
|
|
|
+ phone_pattern)
|
|
|
# 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
|
|
|
re_tenderee_phone2 = re.compile(
|
|
|
- "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
|
|
|
+ # "(?:(?:(?:采购|招标|议价|议标|比选|业主)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)(?:单位)?[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
|
|
|
+ "(?:(?:(?:遴选|寻源|采购|招标|竞价|议价|比选|(?:[^受被]|^)委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)"
|
|
|
+ "(?:人|方|商|单位|组织|用户|业主|主体|部门|公司|企业))(?:单位)?[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
|
|
|
# 电话号码
|
|
|
+ phone_pattern)
|
|
|
re_agent_phone = re.compile(
|
|
|
- "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
|
|
|
+ "(?:(?:(?:代理|[受被]委托)(?:人|方|商|机构|公司|单位|组织|企业)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,7}?)"
|
|
|
# 电话号码
|
|
|
+ phone_pattern)
|
|
|
re_agent_phone2 = re.compile(
|
|
|
- "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
|
|
|
+ "(?:(?:(?:代理|[受被]委托)(?:人|方|商|机构|公司|单位|组织|企业)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
|
|
|
# 电话号码
|
|
|
+ phone_pattern)
|
|
|
+ re_win_tenderer_phone = re.compile(
|
|
|
+ "(?:(?:(?:乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选)"
|
|
|
+ "(?:候选|投标)?(?:人|单位|(?:中介)?(?:服务)?机构|供应商|客户|方|公司|企业|厂商|商|社会资本方?)|选定单位|中[标选]银行|成交对象)[^。审核]{0,5}(?:负责人|联系人|项目)?(?:经理|电话|联系方式|联系人|负责人|联系电话|联系人和联系方式)[::]?[^。]{0,7}?)"
|
|
|
+ + phone_pattern)
|
|
|
+ re_win_tenderer_phone2 = re.compile(
|
|
|
+ "(?:(?:(?:乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选)"
|
|
|
+ "(?:候选|投标)?(?:人|单位|(?:中介)?(?:服务)?机构|供应商|客户|方|公司|企业|厂商|商|社会资本方?)|选定单位|中[标选]银行|成交对象)[^。]{0,3}(?:地址)[^。审核]{0,3}(?:负责人|联系人|项目)?(?:经理|电话|联系方式|联系人|负责人|联系电话|联系人和联系方式)[::]?[^。]{0,20}?)"
|
|
|
+ + phone_pattern)
|
|
|
+ not_win_tenderer_contact = re.compile("纪检|监察|质疑|投诉|监督|受理|请.{0,4}(联系|与)"
|
|
|
+ "|(遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求?|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选|发布|代理|拍卖|转出){1,2}"
|
|
|
+ "(人|方|商|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行|机构){0,2}"
|
|
|
+ "[\u4e00-\u9fa5]{0,4}(联系|咨询|电话)(人|电话|方式)?")
|
|
|
+
|
|
|
content = ""
|
|
|
for _sentence in list_sentence:
|
|
|
content += "".join(_sentence.tokens)
|
|
@@ -1366,6 +1387,20 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
for one_phone in _phone:
|
|
|
PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
|
|
|
agency_phone.add(one_phone)
|
|
|
+ # 中标人联系方式规则筛选
|
|
|
+ _winter_phone = re.findall(re_win_tenderer_phone, content)
|
|
|
+ if _winter_phone:
|
|
|
+ for _phone in _winter_phone:
|
|
|
+ _phone = _phone.split("/")
|
|
|
+ for one_phone in _phone:
|
|
|
+ rule_winter_phone.add(one_phone)
|
|
|
+ _winter_phone2 = re.findall(re_win_tenderer_phone2, content)
|
|
|
+ if _winter_phone2:
|
|
|
+ for _phone in _winter_phone2:
|
|
|
+ _phone = _phone.split("/")
|
|
|
+ for one_phone in _phone:
|
|
|
+ rule_winter_phone.add(one_phone)
|
|
|
+
|
|
|
# 正则提取电话号码实体
|
|
|
# key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
|
|
|
phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
|
|
@@ -1443,7 +1478,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
error_numStr_index.append(numStr_index)
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
|
- if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|费率|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
|
|
|
+ if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|费率|标价|证号|资格证|资质|价格|金额|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
|
|
|
error_numStr_index.append(numStr_index)
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
@@ -1611,7 +1646,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
|
|
|
continue
|
|
|
# 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
|
- if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
|
|
|
+ # if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
|
|
|
+ if _subject.label in [2,3,4] and re.search(not_win_tenderer_contact,list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-15):_object.wordOffset_begin]):
|
|
|
+ # print('not_win_tenderer_contact1')
|
|
|
continue
|
|
|
# 角色为招标/代理人,排除"纪检|监察"相关的联系人
|
|
|
if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
|
|
@@ -1678,7 +1715,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
if temp.begin_index>combo[0].begin_index:
|
|
|
is_continue = True
|
|
|
break
|
|
|
- if is_continue: continue
|
|
|
+ if is_continue:
|
|
|
+ continue
|
|
|
combo[0].person_phone.append(combo[1])
|
|
|
linked_connetPerson.add(combo[0])
|
|
|
linked_phone.add(combo[1])
|
|
@@ -1913,9 +1951,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
person_phone = [phone for phone in per.person_phone] if per.person_phone else []
|
|
|
if not person_phone:
|
|
|
if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
|
|
|
- PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
|
|
|
- winter_contact.add(per.entity_text)
|
|
|
- continue
|
|
|
+ # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
|
|
|
+ if re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[per.sentence_index].sentence_text[max(0, per.wordOffset_begin - 10):per.wordOffset_begin]):
|
|
|
+ PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
|
|
|
+ winter_contact.add(per.entity_text)
|
|
|
+ continue
|
|
|
for _p in person_phone:
|
|
|
if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
|
|
|
per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
|
|
@@ -1947,6 +1987,122 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
for _p in person_phone:
|
|
|
if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
|
|
|
PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
|
|
|
+
|
|
|
+ # 使用中标信息大纲提取联系人
|
|
|
+ winter_scope_group = []
|
|
|
+ if winter_scope:
|
|
|
+ winter_scope_begin = winter_scope[0]
|
|
|
+ winter_scope_end = winter_scope[1]
|
|
|
+ # print(list_sentence[winter_scope_begin[0]].sentence_text[winter_scope_begin[1]:winter_scope_end[1]])
|
|
|
+ winter_temporary_list = []
|
|
|
+ for entity in list_entity:
|
|
|
+ if entity.entity_type in ['org', 'company', 'person']:
|
|
|
+ winter_temporary_list.append(entity)
|
|
|
+ winter_temporary_list = sorted(winter_temporary_list, key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
+ winter_temporary_list2 = []
|
|
|
+ for _entity in winter_temporary_list:
|
|
|
+ if _entity.sentence_index>=winter_scope_begin[0] and _entity.sentence_index<=winter_scope_end[0]:
|
|
|
+ if (_entity.sentence_index==winter_scope_begin[0] and _entity.wordOffset_begin>=winter_scope_begin[1]) or \
|
|
|
+ _entity.sentence_index>winter_scope_begin[0]:
|
|
|
+ if (_entity.sentence_index == winter_scope_end[0] and _entity.wordOffset_end<=winter_scope_end[1]) or \
|
|
|
+ _entity.sentence_index<winter_scope_end[0]:
|
|
|
+ winter_temporary_list2.append(_entity)
|
|
|
+ # print('winter_scope_entity',[i.entity_text for i in winter_temporary_list2])
|
|
|
+ winter_scope_group = winter_temporary_list2
|
|
|
+
|
|
|
+ match_list_winter = []
|
|
|
+ for index in range(len(winter_scope_group)):
|
|
|
+ entity = winter_scope_group[index]
|
|
|
+ if entity.entity_type in ['company','org']:
|
|
|
+ match_nums = 0
|
|
|
+ for after_index in range(index + 1, min(len(winter_scope_group), index + 4)):
|
|
|
+ after_entity = winter_scope_group[after_index]
|
|
|
+ if match_nums > 2:
|
|
|
+ break
|
|
|
+ if after_entity.entity_type == 'person':
|
|
|
+ distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
|
|
|
+ tokens_num_dict[entity.sentence_index] + entity.end_index)
|
|
|
+ # 实体为中标人/候选人,联系人已确定类别【1,2】
|
|
|
+ if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
|
|
|
+ break
|
|
|
+ if entity.label in [2, 3, 4] and distance >= 30:
|
|
|
+ break
|
|
|
+ # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
|
+ if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 15):after_entity.wordOffset_begin]):
|
|
|
+ break
|
|
|
+ # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
|
|
|
+ if entity.label in [2, 3, 4] and not after_entity.person_phone and not re.search(
|
|
|
+ "联系人|联系方式|电话|负责人|经理|法人|法定代表人", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
+ continue
|
|
|
+ # 角色为招标/代理人,排除"纪检|监察"相关的联系人
|
|
|
+ if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
+ break
|
|
|
+ if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
|
|
|
+ if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] +
|
|
|
+ list_sentence[after_entity.sentence_index].sentence_text[0:after_entity.wordOffset_begin]):
|
|
|
+ continue
|
|
|
+ if distance < 80:
|
|
|
+ if (entity.label == 0 and after_entity.label == 1) or (
|
|
|
+ entity.label == 1 and after_entity.label == 2):
|
|
|
+ distance = distance / 100
|
|
|
+ value = (-1 / 2 * (distance ** 2)) / 10000
|
|
|
+ match_list_winter.append(Match(entity, after_entity, value))
|
|
|
+ match_nums += 1
|
|
|
+ # 前向查找匹配
|
|
|
+ if index != 0:
|
|
|
+ previous_entity = winter_scope_group[index - 1]
|
|
|
+ if previous_entity.entity_type == 'person' and previous_entity.label in [1,2,3]:
|
|
|
+ if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
|
|
|
+ continue
|
|
|
+ # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
|
+ if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[previous_entity.sentence_index].sentence_text[
|
|
|
+ max(0,previous_entity.wordOffset_begin - 15):previous_entity.wordOffset_begin]):
|
|
|
+ break
|
|
|
+ # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
|
|
|
+ if entity.label in [2, 3, 4] and not previous_entity.person_phone and not re.search(
|
|
|
+ "联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[previous_entity.sentence_index].sentence_text[
|
|
|
+ max(0, previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
|
|
|
+ continue
|
|
|
+ # 角色为招标/代理人,排除"纪检|监察"相关的联系人
|
|
|
+ if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
|
|
|
+ max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
|
|
|
+ break
|
|
|
+ if previous_entity.sentence_index == entity.sentence_index:
|
|
|
+ distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
|
|
|
+ tokens_num_dict[
|
|
|
+ previous_entity.sentence_index] + previous_entity.end_index)
|
|
|
+ if distance < 30:
|
|
|
+ # 距离相等时,前向添加处罚值
|
|
|
+ # distance += 1
|
|
|
+ # 前向 没有 /10000
|
|
|
+ value = (-1 / 2 * (distance ** 2))
|
|
|
+ match_list_winter.append(Match(entity, previous_entity, value))
|
|
|
+ # test
|
|
|
+ # match_list_winter = company_contact_link([winter_scope_group])
|
|
|
+ # km算法分配求解
|
|
|
+ result_winter = dispatch(match_list_winter)
|
|
|
+ for match in result_winter:
|
|
|
+ _company = match[0]
|
|
|
+ _person = match[1]
|
|
|
+ _person = _person.entity_text
|
|
|
+ # 更新中标人联系方式
|
|
|
+ if _company.label==2:
|
|
|
+ phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
|
|
|
+ for k in PackDict.keys():
|
|
|
+ for i in range(len(PackDict[k]["roleList"])):
|
|
|
+ if PackDict[k]["roleList"][i].role_name == "win_tenderer":
|
|
|
+ if PackDict[k]["roleList"][i].entity_text == _company.entity_text:
|
|
|
+ if _person not in tenderee_contact and len(set(phone_) & set(tenderee_phone)) == 0 and \
|
|
|
+ _person not in agency_contact and len(set(phone_) & set(agency_phone)) == 0:
|
|
|
+ if not phone_:
|
|
|
+ PackDict[k]["roleList"][i].linklist.append((_person, ""))
|
|
|
+ for p in phone_:
|
|
|
+ PackDict[k]["roleList"][i].linklist.append((_person, p))
|
|
|
+ if phone_:
|
|
|
+ for p in phone_:
|
|
|
+ rule_winter_phone.add(p)
|
|
|
+ # print('rule_winter_phone',rule_winter_phone)
|
|
|
+
|
|
|
re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
|
|
|
split_list = [0] * 16
|
|
|
split_dict = {
|
|
@@ -2045,11 +2201,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
# 实体为中标人/候选人,联系人已确定类别【1,2】
|
|
|
if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
|
|
|
break
|
|
|
- if entity.label in [2, 3, 4] and distance>=20:
|
|
|
+ if entity.label in [2, 3, 4] and distance>=30:
|
|
|
break
|
|
|
# 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
|
- if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|(采购|招标)人?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
+ # if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
+ if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 15):after_entity.wordOffset_begin]):
|
|
|
+ # print('not_win_tenderer_contact2')
|
|
|
break
|
|
|
+ # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
|
|
|
+ # print('test',after_entity.entity_text,after_entity.person_phone,list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin])
|
|
|
+ if entity.label in [2, 3, 4] and not after_entity.person_phone and not re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
+ continue
|
|
|
# 角色为招标/代理人,排除"纪检|监察"相关的联系人
|
|
|
if entity.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
break
|
|
@@ -2136,7 +2298,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
new_split_list[split_index][1]:
|
|
|
mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
|
|
|
if re.search(key_phone, mid_sentence):
|
|
|
- if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
|
|
|
+ # if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系",mid_sentence[-10:]):
|
|
|
+ if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact,mid_sentence[-15:]):
|
|
|
+ # print('not_win_tenderer_contact3')
|
|
|
pass
|
|
|
else:
|
|
|
distance = 1
|
|
@@ -2189,7 +2353,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
|
|
|
if next_entity.entity_type == 'person' and _phone in p_phone:
|
|
|
pass
|
|
|
- elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
|
|
|
+ # elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位|业主)?联系|(采购|招标)人?联系|请.{0,4}联系", mid_sentence[-10:]):
|
|
|
+ elif entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact, mid_sentence[-15:]):
|
|
|
+ # print('not_win_tenderer_contact4')
|
|
|
pass
|
|
|
else:
|
|
|
distance = (tokens_num_dict[
|
|
@@ -2213,6 +2379,19 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
|
|
|
if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
|
|
|
continue
|
|
|
+ # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
|
+ if entity.label in [2, 3, 4] and re.search(not_win_tenderer_contact,list_sentence[previous_entity.sentence_index].sentence_text[
|
|
|
+ max(0,previous_entity.wordOffset_begin - 15):previous_entity.wordOffset_begin]):
|
|
|
+ # print('not_win_tenderer_contact2')
|
|
|
+ break
|
|
|
+ # 角色为中标候选人,联系人无号码且上文没有联系关键词时排除
|
|
|
+ if entity.label in [2, 3,4] and not previous_entity.person_phone and not re.search("联系人|联系方式|电话|负责人|经理|法人|法定代表人",
|
|
|
+ list_sentence[previous_entity.sentence_index].sentence_text[max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
|
|
|
+ continue
|
|
|
+ # 角色为招标/代理人,排除"纪检|监察"相关的联系人
|
|
|
+ if entity.label in [0, 1] and re.search("纪检|监察|乙方|中标", list_sentence[previous_entity.sentence_index].sentence_text[
|
|
|
+ max(0,previous_entity.wordOffset_begin - 10):previous_entity.wordOffset_begin]):
|
|
|
+ break
|
|
|
if previous_entity.sentence_index == entity.sentence_index:
|
|
|
distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
|
|
|
tokens_num_dict[
|
|
@@ -2384,10 +2563,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
|
|
|
# "roleList"中联系人电话去重
|
|
|
tenderee_agency_phone = []
|
|
|
+ tenderee_agency_contact = []
|
|
|
for k in PackDict.keys():
|
|
|
for i in range(len(PackDict[k]["roleList"])):
|
|
|
if PackDict[k]["roleList"][i].role_name in ['agency','tenderee']:
|
|
|
tenderee_agency_phone.extend([person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]])
|
|
|
+ tenderee_agency_contact.extend([person_phone[0]+'-'+person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist])
|
|
|
# 带有联系人的电话
|
|
|
with_person = [person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[0]]
|
|
|
# 带有电话的联系人
|
|
@@ -2407,22 +2588,25 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
for k in PackDict.keys():
|
|
|
for i in range(len(PackDict[k]["roleList"])):
|
|
|
if PackDict[k]["roleList"][i].role_name in ['win_tenderer', 'second_tenderer','third_tenderer']:
|
|
|
- if tenderee_agency_phone:
|
|
|
+ if tenderee_agency_phone or tenderee_agency_contact:
|
|
|
remove_list = []
|
|
|
for item in PackDict[k]["roleList"][i].linklist:
|
|
|
if item[1] and item[1] in tenderee_agency_phone:
|
|
|
remove_list.append(item)
|
|
|
+ elif item[0]+'-'+item[1] in tenderee_agency_contact:
|
|
|
+ remove_list.append(item)
|
|
|
+ for _item in remove_list:
|
|
|
+ PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
|
+ elif not tenderee_agency_phone:
|
|
|
+ # 公告中无招标代理联系方式时,可排除中标联系方式
|
|
|
+ remove_list = []
|
|
|
+ for _item in PackDict[k]["roleList"][i].linklist:
|
|
|
+ # 排除非正则规则识别的联系方式
|
|
|
+ if _item[1] not in rule_winter_phone:
|
|
|
+ remove_list.append(_item)
|
|
|
+ # print('remove_list',remove_list)
|
|
|
for _item in remove_list:
|
|
|
PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
|
- # else:
|
|
|
- # # 公告中无招标代理联系方式时,可排除中标联系方式
|
|
|
- # remove_list = []
|
|
|
- # for _item in PackDict[k]["roleList"][i].linklist:
|
|
|
- # # 有联系方式
|
|
|
- # if _item[1]:
|
|
|
- # remove_list.append(_item)
|
|
|
- # for _item in remove_list:
|
|
|
- # PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
|
# PackDict更新company/org地址
|
|
|
last_role_prob = {}
|
|
|
for ent in pre_entity:
|
|
@@ -2934,7 +3118,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
# 公告中只有"招标人"且无"联系人"链接时
|
|
|
if len(PackDict)==1:
|
|
|
k = list(PackDict.keys())[0]
|
|
|
- tenderee_agency_role = [role for role in PackDict[k]["roleList"] if role.role_name in ['tenderee','agency']]
|
|
|
+ tenderee_agency_role = [role for role in PackDict[k]["roleList"] if role.role_name in ['tenderee','agency','win_tenderer']]
|
|
|
if len(tenderee_agency_role)==1:
|
|
|
exist_person = []
|
|
|
exist_phone = []
|
|
@@ -2951,7 +3135,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
if not get_contacts:
|
|
|
# 根据大纲Outline类召回联系人
|
|
|
for outline in list_outline:
|
|
|
- if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary):
|
|
|
+ if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary) and \
|
|
|
+ not re.search("代理|乙方|竞得|受让|买受|签约|供货|供应|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选",outline.outline_summary):
|
|
|
for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
|
|
|
if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
|
|
|
t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
|
|
@@ -2995,7 +3180,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
for _start, _end in new_split_list:
|
|
|
temp_sentence = _content[_start:_end]
|
|
|
sentence_outline = temp_sentence.split(",::")[0]
|
|
|
- if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
|
|
|
+ if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline) and \
|
|
|
+ not re.search("代理|乙方|竞得|受让|买受|签约|供货|供应|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选",sentence_outline):
|
|
|
sentence_phone = phone.findall(temp_sentence)
|
|
|
if sentence_phone:
|
|
|
if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in ",".join(exist_phone):
|
|
@@ -3069,7 +3255,7 @@ def initPackageAttr(RoleList,PackageSet,win_tenderer_set,tenderee_or_agency_set,
|
|
|
packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,item.role_prob,0,0.0,[],set(item.multi_winner)-win_tenderer_set-tenderee_or_agency_set)) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,多中标人)
|
|
|
return packDict
|
|
|
|
|
|
-def getPackageRoleMoney(list_sentence,list_entity,list_outline):
|
|
|
+def getPackageRoleMoney(list_sentence,list_entity,list_outline,winter_scope):
|
|
|
'''
|
|
|
@param:
|
|
|
list_sentence:文章的句子list
|
|
@@ -3089,7 +3275,7 @@ def getPackageRoleMoney(list_sentence,list_entity,list_outline):
|
|
|
# PackDict = initPackageAttr(RoleList, PackageSet)
|
|
|
PackDict = initPackageAttr(RoleList, PackageSet, win_tenderer_set,tenderee_or_agency_set,main_body_pack)
|
|
|
|
|
|
- PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline)
|
|
|
+ PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline, winter_scope)
|
|
|
return PackDict
|
|
|
|
|
|
def turnBidWay(bidway):
|
|
@@ -4272,7 +4458,7 @@ def getProjectContacts(list_entity, list_sentence):
|
|
|
|
|
|
return {'project_contacts':project_contacts_list}
|
|
|
|
|
|
-def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
|
|
|
+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time,winter_scope):
|
|
|
'''
|
|
|
@param:
|
|
|
list_sentence:所有文章的句子list
|
|
@@ -4281,7 +4467,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
|
|
|
'''
|
|
|
result = []
|
|
|
for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
|
|
|
- RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
|
|
|
+ RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline,winter_scope)
|
|
|
result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
|
|
|
**getTimeAttributes(list_entity, list_sentence,page_time),
|
|
|
**getProjectContacts(list_entity, list_sentence),
|