|
@@ -1078,9 +1078,11 @@ def segment(soup,final=True):
|
|
|
text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
|
|
|
#替换为中文分号
|
|
|
text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
|
|
|
+ # 感叹号替换为中文句号
|
|
|
+ text = re.sub("(?<=[\u4e00-\u9fa5])[!!]|[!!](?=[\u4e00-\u9fa5])","。",text)
|
|
|
#替换"?"为 " " ,update:2021/7/20
|
|
|
text = re.sub("?"," ",text)
|
|
|
-
|
|
|
+
|
|
|
|
|
|
#替换"""为"“",否则导入deepdive出错
|
|
|
text = text.replace('"',"“").replace("\r","").replace("\n",",")
|
|
@@ -1925,10 +1927,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
|
|
|
# "联系人"正则补充提取 2021/11/15 新增
|
|
|
list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
|
|
|
- error_text = ['传真','网址','电子邮','联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
|
|
|
+ error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
|
|
|
+ '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理','代理人','采购','附件']
|
|
|
list_person_text = set(list_person_text + error_text)
|
|
|
- re_person = re.compile("联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
|
|
|
- "联系人[::]([\u4e00-\u9fa5]工)|"
|
|
|
+ re_person = re.compile("联系人[::]([\u4e00-\u9fa5]工)|"
|
|
|
+ "联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
|
|
|
"联系人[::]([\u4e00-\u9fa5]{2,3})")
|
|
|
list_person = []
|
|
|
for match_result in re_person.finditer(sentence_text):
|
|
@@ -1937,6 +1940,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
wordOffset_begin = match_result.start() + 4
|
|
|
wordOffset_end = match_result.end()
|
|
|
# print(text[wordOffset_begin:wordOffset_end])
|
|
|
+ # 排除一些不为人名的实体
|
|
|
+ if re.search("^[\u4e00-\u9fa5]{7,}([,。]|$)",sentence_text[wordOffset_begin:wordOffset_begin+20]):
|
|
|
+ continue
|
|
|
if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
|
|
|
_person = dict()
|
|
|
_person['body'] = entity_text
|