|
@@ -1283,30 +1283,52 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
res_set.add((i.group(), i.start(), i.end()))
|
|
res_set.add((i.group(), i.start(), i.end()))
|
|
res_set = sorted(list(res_set),key=lambda x:x[1])
|
|
res_set = sorted(list(res_set),key=lambda x:x[1])
|
|
last_phone_mask = True
|
|
last_phone_mask = True
|
|
|
|
+ error_numStr_index = []
|
|
|
|
+ sentence_phone_list = []
|
|
for item_idx in range(len(res_set)):
|
|
for item_idx in range(len(res_set)):
|
|
item = res_set[item_idx]
|
|
item = res_set[item_idx]
|
|
phone_left = sentence_text[max(0, item[1] - 10):item[1]]
|
|
phone_left = sentence_text[max(0, item[1] - 10):item[1]]
|
|
- phone_right = sentence_text[item[2]:item[2] + 8]
|
|
|
|
|
|
+ phone_right = sentence_text[item[2]:item[2] + 10]
|
|
|
|
+ phone_left_num = re.search("[\da-zA-Z\-—-―]+$",phone_left)
|
|
|
|
+ numStr_left = item[1]
|
|
|
|
+ if phone_left_num:
|
|
|
|
+ numStr_left -= len(phone_left_num.group())
|
|
|
|
+ phone_right_num = re.search("^[\da-zA-Z\-—-―]+",phone_right)
|
|
|
|
+ numStr_right = item[2]
|
|
|
|
+ if phone_right_num:
|
|
|
|
+ numStr_right += len(phone_right_num.group())
|
|
|
|
+ numStr_index = (numStr_left,numStr_right)
|
|
|
|
+
|
|
if re.search("电话|手机|联系[人方]|联系方式",re.sub(",","",phone_left)):
|
|
if re.search("电话|手机|联系[人方]|联系方式",re.sub(",","",phone_left)):
|
|
pass
|
|
pass
|
|
else:
|
|
else:
|
|
# 排除“传真号”和其它错误项
|
|
# 排除“传真号”和其它错误项
|
|
if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
|
|
if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
|
|
if not re.search("电,?话", phone_left):
|
|
if not re.search("电,?话", phone_left):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
- if re.search("注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
|
|
|
|
|
|
+ if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
|
|
if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
# 号码含有0过多,不符合规则
|
|
# 号码含有0过多,不符合规则
|
|
- if re.search("0{5,}",item[0]):
|
|
|
|
|
|
+ if re.search("0{6,}",item[0]):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
# 前后跟着字母
|
|
# 前后跟着字母
|
|
if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
|
|
if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
|
|
+ last_phone_mask = False
|
|
|
|
+ continue
|
|
|
|
+ # 时间日期类排除
|
|
|
|
+ if re.search("时间|日期", phone_left):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
# 前后跟着长度小于一定值数字的正则排除
|
|
# 前后跟着长度小于一定值数字的正则排除
|
|
@@ -1315,10 +1337,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
|
|
phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
|
|
if phone_left_number:
|
|
if phone_left_number:
|
|
if len(phone_left_number.group())<7:
|
|
if len(phone_left_number.group())<7:
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
if phone_right_number:
|
|
if phone_right_number:
|
|
if len(phone_right_number.group())<7:
|
|
if len(phone_right_number.group())<7:
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
# if:上一个phone实体不符合条件
|
|
# if:上一个phone实体不符合条件
|
|
@@ -1326,8 +1350,21 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
item_start = item[1]
|
|
item_start = item[1]
|
|
last_item_end = res_set[item_idx-1][2]
|
|
last_item_end = res_set[item_idx-1][2]
|
|
if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]):
|
|
if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]):
|
|
|
|
+ error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
|
|
+ sentence_phone_list.append(item)
|
|
|
|
+ last_phone_mask = True
|
|
|
|
+ if error_numStr_index:
|
|
|
|
+ drop_list = []
|
|
|
|
+ for item in sentence_phone_list:
|
|
|
|
+ for err_index in error_numStr_index:
|
|
|
|
+ if (item[1]>=err_index[0] and item[1]<=err_index[1]) or (item[2]>=err_index[0] and item[2]<=err_index[1]) or (item[1]<=err_index[0] and item[2]>=err_index[1]):
|
|
|
|
+ drop_list.append(item)
|
|
|
|
+ break
|
|
|
|
+ for _drop_item in drop_list:
|
|
|
|
+ sentence_phone_list.remove(_drop_item)
|
|
|
|
+ for item in sentence_phone_list:
|
|
for j in range(len(list_tokenbegin)):
|
|
for j in range(len(list_tokenbegin)):
|
|
if list_tokenbegin[j] == item[1]:
|
|
if list_tokenbegin[j] == item[1]:
|
|
begin_index = j
|
|
begin_index = j
|
|
@@ -1342,8 +1379,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
_entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
|
|
_entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
|
|
item[2],in_attachment=in_attachment)
|
|
item[2],in_attachment=in_attachment)
|
|
phone_entitys.append(_entity)
|
|
phone_entitys.append(_entity)
|
|
- last_phone_mask = True
|
|
|
|
-
|
|
|
|
|
|
+ print('phone_set:',set([ent.entity_text for ent in phone_entitys]))
|
|
def is_company(entity,text):
|
|
def is_company(entity,text):
|
|
# 判断"公司"实体是否为地址地点
|
|
# 判断"公司"实体是否为地址地点
|
|
if entity.label!=5 and entity.values[entity.label]>0.5:
|
|
if entity.label!=5 and entity.values[entity.label]>0.5:
|