|
@@ -31,6 +31,12 @@ dict_role_id = {"0":"tenderee",
|
|
"3":"second_tenderer",
|
|
"3":"second_tenderer",
|
|
"4":"third_tenderer"}
|
|
"4":"third_tenderer"}
|
|
|
|
|
|
|
|
# Maps a role name to its integer id (the reverse direction of dict_role_id,
# whose keys are the ids as strings). Elsewhere in this file the id is used
# as an index into an entity's `values` sequence to read the score for that
# role -- presumably a per-role probability; confirm against the model output.
role2id_dict = {
    "tenderee": 0,
    "agency": 1,
    "win_tenderer": 2,
    "second_tenderer": 3,
    "third_tenderer": 4,
}
|
|
|
|
+
|
|
def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
|
|
def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
|
|
'''
|
|
'''
|
|
@param:
|
|
@param:
|
|
@@ -851,7 +857,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
def addRatioByEntity(packDict,packageName,entity,ratio):
    """Attach a ratio value to every role of a package that matches an entity.

    @param:
        packDict: mapping of package name -> package record; each record
            holds its roles under the "roleList" key.
        packageName: key of the package in ``packDict`` to update.
        entity: entity text to match against each role's ``entity_text``.
        ratio: object exposing a ``ratio_value`` attribute; that value (not
            the raw ``entity_text`` of the ratio entity) is stored on the role.
    @return:
        None. Matching roles are modified in place.

    Raises KeyError if ``packageName`` is missing or has no "roleList".
    """
    for role in packDict[packageName]["roleList"]:
        # No early exit: every role with the same entity text gets the ratio.
        if role.entity_text == entity:
            role.ratio = ratio.ratio_value
|
|
def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
|
|
def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
|
|
for i in range(len(packDict[packageName]["roleList"])):
|
|
for i in range(len(packDict[packageName]["roleList"])):
|
|
if packDict[packageName]["roleList"][i].entity_text==entity:
|
|
if packDict[packageName]["roleList"][i].entity_text==entity:
|
|
@@ -1253,8 +1259,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
|
|
'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
|
|
'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
|
|
|
|
+ '400\d{7}转\d{1,4}|'
|
|
'[2-9]\d{6,7}')
|
|
'[2-9]\d{6,7}')
|
|
- url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
|
|
|
|
|
|
+ url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[#$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
|
|
email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
|
|
email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
|
|
"[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
|
|
"[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
|
|
phone_entitys = []
|
|
phone_entitys = []
|
|
@@ -1308,7 +1315,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
pass
|
|
pass
|
|
else:
|
|
else:
|
|
# 排除“传真号”和其它错误项
|
|
# 排除“传真号”和其它错误项
|
|
- if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
|
|
|
|
|
|
+ if re.search("传,?真|信,?箱|邮,?[编箱件]|QQ|qq", phone_left):
|
|
if not re.search("电,?话", phone_left):
|
|
if not re.search("电,?话", phone_left):
|
|
error_numStr_index.append(numStr_index)
|
|
error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
@@ -1350,6 +1357,20 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
error_numStr_index.append(numStr_index)
|
|
error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
|
|
+ left_context = re.search("[\da-zA-Z\-—-―]+$",sentence_text[:item[1]])
|
|
|
|
+ if left_context:
|
|
|
|
+ if len(left_context.group()) != len("".join(re.findall(phone, left_context.group()))):
|
|
|
|
+ # if not re.search("(" + phone.pattern + ")$", left_context.group()):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
|
|
+ last_phone_mask = False
|
|
|
|
+ continue
|
|
|
|
+ right_context = re.search("^[\da-zA-Z\-—-―]+", sentence_text[item[2]:])
|
|
|
|
+ if right_context:
|
|
|
|
+ if len(right_context.group()) != len("".join(re.findall(phone, right_context.group()))):
|
|
|
|
+ # if not re.search("^(" + phone.pattern + ")", right_context.group()):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
|
|
+ last_phone_mask = False
|
|
|
|
+ continue
|
|
# if:上一个phone实体不符合条件
|
|
# if:上一个phone实体不符合条件
|
|
if not last_phone_mask:
|
|
if not last_phone_mask:
|
|
item_start = item[1]
|
|
item_start = item[1]
|
|
@@ -1525,52 +1546,58 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
break
|
|
break
|
|
# print(3,combo[0].entity_text,combo[1].entity_text)
|
|
# print(3,combo[0].entity_text,combo[1].entity_text)
|
|
|
|
|
|
- # "公司——地址" 链接规则补充
|
|
|
|
- company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
|
|
|
|
- company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
|
- t_match_list = []
|
|
|
|
- for ent_idx in range(len(company_lacation_EntityList)):
|
|
|
|
- entity = company_lacation_EntityList[ent_idx]
|
|
|
|
- if entity.entity_type in ['company', 'org']:
|
|
|
|
- match_nums = 0
|
|
|
|
- company_nums = 0 # 经过其他公司的数量
|
|
|
|
- location_nums = 0 # 经过电话的数量
|
|
|
|
- for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
|
|
|
|
- after_entity = company_lacation_EntityList[after_index]
|
|
|
|
- if after_entity.entity_type == "location":
|
|
|
|
- distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
|
|
|
|
- tokens_num_dict[entity.sentence_index] + entity.end_index)
|
|
|
|
- location_nums += 1
|
|
|
|
- if distance > 100 or location_nums >= 3:
|
|
|
|
- break
|
|
|
|
- sentence_distance = after_entity.sentence_index - entity.sentence_index
|
|
|
|
- value = (-1 / 2 * (distance ** 2)) / 10000
|
|
|
|
- if sentence_distance == 0:
|
|
|
|
- if distance < 80:
|
|
|
|
- t_match_list.append(Match(entity, after_entity, value))
|
|
|
|
- match_nums += 1
|
|
|
|
- if company_nums:
|
|
|
|
- break
|
|
|
|
- else:
|
|
|
|
- if distance < 50:
|
|
|
|
- t_match_list.append(Match(entity, after_entity, value))
|
|
|
|
- match_nums += 1
|
|
|
|
- if company_nums:
|
|
|
|
- break
|
|
|
|
|
|
+ # "公司——地址" 链接规则补充
|
|
|
|
+ company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
|
|
|
|
+ # company_lacation_EntityList = [ent for ent in pre_entity if (ent.entity_type in ['company', 'org'] and ent.label!=5) or ent.entity_type=="location"]
|
|
|
|
+ company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
|
+ t_match_list = []
|
|
|
|
+ for ent_idx in range(len(company_lacation_EntityList)):
|
|
|
|
+ entity = company_lacation_EntityList[ent_idx]
|
|
|
|
+ if entity.entity_type in ['company', 'org'] and entity.label!=5:
|
|
|
|
+ match_nums = 0
|
|
|
|
+ company_nums = 0 # 经过其他公司的数量
|
|
|
|
+ location_nums = 0 # 经过电话的数量
|
|
|
|
+ for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
|
|
|
|
+ after_entity = company_lacation_EntityList[after_index]
|
|
|
|
+ if after_entity.entity_type == "location":
|
|
|
|
+ distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
|
|
|
|
+ tokens_num_dict[entity.sentence_index] + entity.end_index)
|
|
|
|
+ location_nums += 1
|
|
|
|
+ if distance > 100 or location_nums >= 3:
|
|
|
|
+ break
|
|
|
|
+ sentence_distance = after_entity.sentence_index - entity.sentence_index
|
|
|
|
+ value = (-1 / 2 * (distance ** 2)) / 10000
|
|
|
|
+ if sentence_distance == 0:
|
|
|
|
+ if distance < 80:
|
|
|
|
+ t_match_list.append(Match(entity, after_entity, value))
|
|
|
|
+ match_nums += 1
|
|
|
|
+ if company_nums:
|
|
|
|
+ break
|
|
else:
|
|
else:
|
|
- # type:company/org
|
|
|
|
- company_nums += 1
|
|
|
|
- if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
|
|
|
|
- break
|
|
|
|
|
|
+ if distance < 50:
|
|
|
|
+ t_match_list.append(Match(entity, after_entity, value))
|
|
|
|
+ match_nums += 1
|
|
|
|
+ if company_nums:
|
|
|
|
+ break
|
|
|
|
+ else:
|
|
|
|
+ # type:company/org
|
|
|
|
+ company_nums += 1
|
|
|
|
+ if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
|
|
|
|
+ break
|
|
|
|
+ if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
|
|
|
|
+ break
|
|
|
|
|
|
- # km算法分配求解
|
|
|
|
- relate_location_result = dispatch(t_match_list)
|
|
|
|
- relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
|
|
|
|
- for match in relate_location_result:
|
|
|
|
- _company = match[0]
|
|
|
|
- _relation = match[1]
|
|
|
|
- if not _company.pointer_address:
|
|
|
|
- _company.pointer_address = _relation
|
|
|
|
|
|
+ # km算法分配求解
|
|
|
|
+ # for item in t_match_list:
|
|
|
|
+ # print("loc_rela",item.main_role.entity_text,item.attribute.entity_text)
|
|
|
|
+ relate_location_result = dispatch(t_match_list)
|
|
|
|
+ relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
|
|
|
|
+ for match in relate_location_result:
|
|
|
|
+ _company = match[0]
|
|
|
|
+ _relation = match[1]
|
|
|
|
+ # print("loc_rela2", _company.entity_text, _relation.entity_text, )
|
|
|
|
+ if not _company.pointer_address:
|
|
|
|
+ _company.pointer_address = _relation
|
|
# "联系人——联系电话" 链接规则补充
|
|
# "联系人——联系电话" 链接规则补充
|
|
person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
|
|
person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
|
|
person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
|
|
person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
|
|
@@ -2182,6 +2209,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
|
|
|
|
# PackDict更新company/org地址
|
|
# PackDict更新company/org地址
|
|
|
|
+ last_role_prob = {}
|
|
for ent in pre_entity:
|
|
for ent in pre_entity:
|
|
if ent.entity_type in ['company','org']:
|
|
if ent.entity_type in ['company','org']:
|
|
if ent.pointer_address:
|
|
if ent.pointer_address:
|
|
@@ -2190,9 +2218,16 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
if PackDict[k]["roleList"][i].entity_text == ent.entity_text:
|
|
if PackDict[k]["roleList"][i].entity_text == ent.entity_text:
|
|
if not PackDict[k]["roleList"][i].address:
|
|
if not PackDict[k]["roleList"][i].address:
|
|
PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
|
|
PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
|
|
|
|
+ last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
|
|
else:
|
|
else:
|
|
- if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
|
|
|
|
- PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
|
|
|
|
|
|
+ if PackDict[k]["roleList"][i].role_name in ['tenderee','agency']:
|
|
|
|
+ # 角色为招标/代理人时,取其实体概率高的链接地址作为角色address
|
|
|
|
+ if ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] > last_role_prob[PackDict[k]["roleList"][i].role_name]:
|
|
|
|
+ PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
|
|
|
|
+ last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
|
|
|
|
+ else:
|
|
|
|
+ if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
|
|
|
|
+ PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
|
|
|
|
|
|
# 联系人——电子邮箱链接
|
|
# 联系人——电子邮箱链接
|
|
temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
|
|
temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
|