|
@@ -1148,32 +1148,32 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
agency_contact.add(_person.entity_text)
|
|
|
# 正则匹配无 '主体/联系人' 的电话
|
|
|
# 例:"采购人联系方式:0833-5226788,"
|
|
|
- phone_pattern = '(1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
|
|
|
- '\+86.?1[3|4|5|6|7|8|9]\d{9}|' \
|
|
|
+ phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
|
|
|
+ '\+86.?1[3-9]\d{9}|' \
|
|
|
'0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' \
|
|
|
'0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' \
|
|
|
'0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' \
|
|
|
- '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|' \
|
|
|
+ '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|' \
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' \
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' \
|
|
|
'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' \
|
|
|
'[2-9]\d{6,7})'
|
|
|
re_tenderee_phone = re.compile(
|
|
|
- "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
|
|
|
+ "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
|
|
|
# 电话号码
|
|
|
+ phone_pattern)
|
|
|
# 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
|
|
|
re_tenderee_phone2 = re.compile(
|
|
|
- "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
|
|
|
+ "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
|
|
|
# 电话号码
|
|
|
+ phone_pattern)
|
|
|
re_agent_phone = re.compile(
|
|
|
- "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人)[::]?[^。]{0,7}?)"
|
|
|
+ "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
|
|
|
# 电话号码
|
|
|
+ phone_pattern)
|
|
|
re_agent_phone2 = re.compile(
|
|
|
- "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[::]?[^。]{0,20}?)"
|
|
|
+ "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
|
|
|
# 电话号码
|
|
|
+ phone_pattern)
|
|
|
content = ""
|
|
@@ -1224,18 +1224,22 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
|
|
|
# 正则提取电话号码实体
|
|
|
# key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
|
|
|
- phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
|
|
|
- '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
|
|
|
+ phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
|
|
|
+ '\+86.?1[3-9]\d{9}|'
|
|
|
# '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
|
|
|
'0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
|
|
|
'0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
|
|
|
- '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
|
|
|
+ '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
|
|
|
'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
|
|
|
'[2-9]\d{6,7}')
|
|
|
+ url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
|
|
|
+ email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
|
|
|
+ "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
|
|
|
phone_entitys = []
|
|
|
+ code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
|
|
|
for _sentence in list_sentence:
|
|
|
sentence_text = _sentence.sentence_text
|
|
|
list_tokenbegin = []
|
|
@@ -1244,9 +1248,23 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
list_tokenbegin.append(begin)
|
|
|
begin += len(str(_sentence.tokens[i]))
|
|
|
list_tokenbegin.append(begin + 1)
|
|
|
-
|
|
|
+ # 排除网址、邮箱、项目编号实体
|
|
|
+ error_list = []
|
|
|
+ for i in re.finditer(url_pattern, sentence_text):
|
|
|
+ error_list.append((i.start(), i.end()))
|
|
|
+ for i in re.finditer(email_pattern, sentence_text):
|
|
|
+ error_list.append((i.start(), i.end()))
|
|
|
+ for code_ent in [ent for ent in code_entitys if ent.sentence_index==_sentence.sentence_index]:
|
|
|
+ error_list.append((code_ent.wordOffset_begin,code_ent.wordOffset_end))
|
|
|
res_set = set()
|
|
|
for i in re.finditer(phone, sentence_text):
|
|
|
+ is_continue = False
|
|
|
+ for error_ent in error_list:
|
|
|
+ if i.start()>=error_ent[0] and i.end()<=error_ent[1]:
|
|
|
+ is_continue = True
|
|
|
+ break
|
|
|
+ if is_continue:
|
|
|
+ continue
|
|
|
res_set.add((i.group(), i.start(), i.end()))
|
|
|
res_set = sorted(list(res_set),key=lambda x:x[1])
|
|
|
last_phone_mask = True
|
|
@@ -1254,24 +1272,43 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
item = res_set[item_idx]
|
|
|
phone_left = sentence_text[max(0, item[1] - 10):item[1]]
|
|
|
phone_right = sentence_text[item[2]:item[2] + 8]
|
|
|
- # 排除“传真号”和其它错误项
|
|
|
- if re.search("传,?真|信,?箱|邮,?[箱件]", phone_left):
|
|
|
- if not re.search("电,?话", phone_left):
|
|
|
+ if re.search("电话|手机|联系人|联系方式”",re.sub(",","",phone_left)):
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ # 排除“传真号”和其它错误项
|
|
|
+ if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
|
|
|
+ if not re.search("电,?话", phone_left):
|
|
|
+ last_phone_mask = False
|
|
|
+ continue
|
|
|
+ if re.search("注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
|
- if re.search("注册[证号]|帐,?号|编,?[号码]|报,?价|标,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
|
|
|
- last_phone_mask = False
|
|
|
- continue
|
|
|
- if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
|
|
|
- last_phone_mask = False
|
|
|
- continue
|
|
|
- # if:上一个phone实体不符合条件
|
|
|
- if not last_phone_mask:
|
|
|
- item_start = item[1]
|
|
|
- last_item_end = res_set[item_idx-1][2]
|
|
|
- if item_start - last_item_end<=1 or re.search("^\d+$",sentence_text[last_item_end:item_start]):
|
|
|
+ if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
|
|
|
+ last_phone_mask = False
|
|
|
+ continue
|
|
|
+ # 前后跟着字母
|
|
|
+ if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
|
+ # 前后跟着长度小于一定值数字的正则排除
|
|
|
+ if re.search("\d+[-—-―]?\d*$",phone_left) or re.search("^\d+[-—-―]?\d*",phone_right):
|
|
|
+ phone_left_number = re.search("\d+[-—-―]?\d*$",phone_left)
|
|
|
+ phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
|
|
|
+ if phone_left_number:
|
|
|
+ if len(phone_left_number.group())<7:
|
|
|
+ last_phone_mask = False
|
|
|
+ continue
|
|
|
+ if phone_right_number:
|
|
|
+ if len(phone_right_number.group())<7:
|
|
|
+ last_phone_mask = False
|
|
|
+ continue
|
|
|
+ # if:上一个phone实体不符合条件
|
|
|
+ if not last_phone_mask:
|
|
|
+ item_start = item[1]
|
|
|
+ last_item_end = res_set[item_idx-1][2]
|
|
|
+ if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―]+$",sentence_text[last_item_end:item_start]):
|
|
|
+ last_phone_mask = False
|
|
|
+ continue
|
|
|
for j in range(len(list_tokenbegin)):
|
|
|
if list_tokenbegin[j] == item[1]:
|
|
|
begin_index = j
|
|
@@ -1295,7 +1332,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
if ent.is_tail==True:
|
|
|
return False
|
|
|
entity_left = text[max(0,entity.wordOffset_begin-10):entity.wordOffset_begin]
|
|
|
- entity_left = re.sub(",()\(\)::","",entity_left)
|
|
|
+ entity_left = re.sub(",()\(\)","",entity_left)
|
|
|
entity_left = entity_left[-5:]
|
|
|
if re.search("地址|地点|银行[::]",entity_left):
|
|
|
return False
|
|
@@ -1410,6 +1447,43 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
|
|
|
break
|
|
|
# print(3,combo[0].entity_text,combo[1].entity_text)
|
|
|
+ # 2022/01/25 固定电话可连多个联系人
|
|
|
+ temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person']
|
|
|
+ temp_person_entitys2 = [] #和固定电话相连的联系人
|
|
|
+ for entity in temp_person_entitys:
|
|
|
+ if entity.person_phone:
|
|
|
+ for _phone in entity.person_phone:
|
|
|
+ if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
|
|
|
+ temp_person_entitys2.append(entity)
|
|
|
+ break
|
|
|
+ for index in range(len(temp_person_entitys)):
|
|
|
+ entity = temp_person_entitys[index]
|
|
|
+ if entity in temp_person_entitys2:
|
|
|
+ last_person = entity
|
|
|
+ for after_index in range(index + 1, min(len(temp_person_entitys), index + 5)):
|
|
|
+ after_entity = temp_person_entitys[after_index]
|
|
|
+ if after_entity.sentence_index == last_person.sentence_index and after_entity.begin_index - last_person.end_index < 3:
|
|
|
+ for _phone in entity.person_phone:
|
|
|
+ if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
|
|
|
+ if _phone not in after_entity.person_phone:
|
|
|
+ after_entity.person_phone.append(_phone)
|
|
|
+ last_person = after_entity
|
|
|
+ else:
|
|
|
+ break
|
|
|
+ if index==0:
|
|
|
+ continue
|
|
|
+ last_person = entity
|
|
|
+ for before_index in range(index-1, max(-1,index-5), -1):
|
|
|
+ before_entity = temp_person_entitys[before_index]
|
|
|
+ if before_entity.sentence_index == last_person.sentence_index and last_person.begin_index - before_entity.end_index < 3:
|
|
|
+ for _phone in entity.person_phone:
|
|
|
+ if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
|
|
|
+ if _phone not in before_entity.person_phone:
|
|
|
+ before_entity.person_phone.append(_phone)
|
|
|
+ last_person = before_entity
|
|
|
+ else:
|
|
|
+ break
|
|
|
+
|
|
|
# 更新 PackDict
|
|
|
not_sure_linked = []
|
|
|
for link_p in list(linked_company):
|
|
@@ -1527,12 +1601,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
last_words_num = len(sentence.sentence_text)
|
|
|
|
|
|
# 公司-联系人连接(km算法)
|
|
|
- re_phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
|
|
|
- '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
|
|
|
+ re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
|
|
|
+ '\+86.?1[3-9]\d{9}|'
|
|
|
'0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
|
|
|
'0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
|
|
|
'0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
|
|
|
- '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
|
|
|
+ '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
|
|
|
'0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
|
|
@@ -1661,6 +1735,13 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
match_nums += 1
|
|
|
else:
|
|
|
next_entity = split_entitys[index + 1]
|
|
|
+ if next_entity.entity_type in ["org","company"]:
|
|
|
+ _entity_left = list_sentence[next_entity.sentence_index].sentence_text[max(0, next_entity.wordOffset_begin - 10):next_entity.wordOffset_begin]
|
|
|
+ _entity_left = re.sub(",()\(\)::", "", _entity_left)
|
|
|
+ _entity_left = _entity_left[-5:]
|
|
|
+ if re.search("地址|地点", _entity_left):
|
|
|
+ if index + 2<= len(split_entitys) - 1:
|
|
|
+ next_entity = split_entitys[index + 2]
|
|
|
if entity.sentence_index == next_entity.sentence_index:
|
|
|
mid_tokens += list_sentence[entity.sentence_index].tokens[
|
|
|
entity.end_index + 1:next_entity.begin_index]
|
|
@@ -2271,6 +2352,51 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
for _p in _phone:
|
|
|
PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
|
|
|
break
|
|
|
+ # 公告中只有"招标人",无"联系人"链接且上一条规则无效时,如果文中只有一个“phone”实体,则直接取为联系人电话
|
|
|
+ if len(PackDict)==1:
|
|
|
+ k = list(PackDict.keys())[0]
|
|
|
+ if len(PackDict[k]["roleList"])==1:
|
|
|
+ if PackDict[k]["roleList"][0].role_name == "tenderee":
|
|
|
+ if not PackDict[k]["roleList"][0].linklist:
|
|
|
+ if len(phone_entitys)==1:
|
|
|
+ PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
|
|
|
+ # 公告中只有"招标人",无"联系人"链接且上一条规则无效时,通过大纲直接取电话
|
|
|
+ if len(PackDict)==1:
|
|
|
+ k = list(PackDict.keys())[0]
|
|
|
+ if len(PackDict[k]["roleList"])==1:
|
|
|
+ if PackDict[k]["roleList"][0].role_name == "tenderee":
|
|
|
+ if not PackDict[k]["roleList"][0].linklist:
|
|
|
+ if len(new_split_list)>1:
|
|
|
+ for _start,_end in new_split_list:
|
|
|
+ temp_sentence = _content[_start:_end]
|
|
|
+ sentence_outline = temp_sentence.split(",")[0]
|
|
|
+ if re.search("联系人|联系方|联系方式|联系电话|电话|负责人",sentence_outline):
|
|
|
+ sentence_phone = phone.findall(temp_sentence)
|
|
|
+ if sentence_phone:
|
|
|
+ PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
|
|
|
+ break
|
|
|
+ # 公告中只有"招标人",无"联系人"链接且上一条规则无效时,通过正则提取句子段落进行提取电话
|
|
|
+ if len(PackDict)==1:
|
|
|
+ k = list(PackDict.keys())[0]
|
|
|
+ if len(PackDict[k]["roleList"])==1:
|
|
|
+ if PackDict[k]["roleList"][0].role_name == "tenderee":
|
|
|
+ if not PackDict[k]["roleList"][0].linklist:
|
|
|
+ contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?"
|
|
|
+ tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}"
|
|
|
+ contact_pattern_list = [tenderee_pattern + contacts_person,"(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person,
|
|
|
+ "(?:项目|采购)[^。,]{0,4}"+contacts_person,"(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}"+contacts_person,]
|
|
|
+ for _pattern in contact_pattern_list:
|
|
|
+ get_tenderee_contacts = False
|
|
|
+ for regular_match in re.finditer(_pattern,_content):
|
|
|
+ match_text = _content[regular_match.end():regular_match.end()+40]
|
|
|
+ match_text = match_text.split("。")[0]
|
|
|
+ sentence_phone = phone.findall(match_text)
|
|
|
+ if sentence_phone:
|
|
|
+ PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
|
|
|
+ get_tenderee_contacts = True
|
|
|
+ break
|
|
|
+ if get_tenderee_contacts:
|
|
|
+ break
|
|
|
|
|
|
for pack in PackDict.keys():
|
|
|
for i in range(len(PackDict[pack]["roleList"])):
|