|
@@ -2114,7 +2114,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
|
doc_id = article[0]
|
|
|
sourceContent = article[1]
|
|
|
sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
|
|
|
-
|
|
|
+ sourceContent = re.sub("##attachment##","",sourceContent)
|
|
|
sourceContent = sourceContent.replace('<br/>', '<br>')
|
|
|
sourceContent = re.sub("<br>(\s{0,}<br>)+","<br>",sourceContent)
|
|
|
# for br_match in re.findall("[^>]+?<br>",sourceContent):
|
|
@@ -2493,16 +2493,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
continue
|
|
|
elif entity_type=="person" and len(entity_text)>10 and len(re.findall("[\u4e00-\u9fa5]",entity_text))<len(entity_text)/2:
|
|
|
continue
|
|
|
- # 组织机构实体名称补充
|
|
|
- if entity_type in ["org", "company"]:
|
|
|
- fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
|
|
|
- if fix_name:
|
|
|
- if len(fix_name.group(2))>0:
|
|
|
- entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
|
|
|
- elif len(fix_name.group(3))>0:
|
|
|
- entity_text = entity_text.replace(fix_name.group(), "有限公司")
|
|
|
- elif re.search("有限$", entity_text):
|
|
|
- entity_text = re.sub("有限$","有限公司",entity_text)
|
|
|
+
|
|
|
|
|
|
for j in range(len(list_tokenbegin)):
|
|
|
if list_tokenbegin[j]==begin_index_temp:
|
|
@@ -2521,6 +2512,16 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
#去掉标点符号
|
|
|
entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
|
|
|
entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
|
|
|
+ # 组织机构实体名称补充
|
|
|
+ if entity_type in ["org", "company"]:
|
|
|
+ fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
|
|
|
+ if fix_name:
|
|
|
+ if len(fix_name.group(2))>0:
|
|
|
+ entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
|
|
|
+ elif len(fix_name.group(3))>0:
|
|
|
+ entity_text = entity_text.replace(fix_name.group(), "有限公司")
|
|
|
+ elif re.search("有限$", entity_text):
|
|
|
+ entity_text = re.sub("有限$","有限公司",entity_text)
|
|
|
list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
|
|
|
# 标记文章末尾的"发布人”、“发布时间”实体
|
|
|
if sentence_index==len(list_sentence)-1:
|