Browse Source

预处理错误修复

znj 2 years ago
parent
commit
c34a5aa603
1 changed file with 12 additions and 11 deletions
  1. 12 11
      BiddingKG/dl/interface/Preprocessing.py

+ 12 - 11
BiddingKG/dl/interface/Preprocessing.py

@@ -2114,7 +2114,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         doc_id = article[0]
         sourceContent = article[1]
         sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
-
+        sourceContent = re.sub("##attachment##","",sourceContent)
         sourceContent = sourceContent.replace('<br/>', '<br>')
         sourceContent = re.sub("<br>(\s{0,}<br>)+","<br>",sourceContent)
         # for br_match in re.findall("[^>]+?<br>",sourceContent):
@@ -2493,16 +2493,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     continue
                 elif entity_type=="person" and len(entity_text)>10 and len(re.findall("[\u4e00-\u9fa5]",entity_text))<len(entity_text)/2:
                     continue
-                # 组织机构实体名称补充
-                if entity_type in ["org", "company"]:
-                    fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
-                    if fix_name:
-                        if len(fix_name.group(2))>0:
-                            entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
-                        elif len(fix_name.group(3))>0:
-                            entity_text = entity_text.replace(fix_name.group(), "有限公司")
-                        elif re.search("有限$", entity_text):
-                            entity_text = re.sub("有限$","有限公司",entity_text)
+
 
                 for j in range(len(list_tokenbegin)):
                     if list_tokenbegin[j]==begin_index_temp:
@@ -2521,6 +2512,16 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 #去掉标点符号
                 entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
+                # 组织机构实体名称补充
+                if entity_type in ["org", "company"]:
+                    fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
+                    if fix_name:
+                        if len(fix_name.group(2))>0:
+                            entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
+                        elif len(fix_name.group(3))>0:
+                            entity_text = entity_text.replace(fix_name.group(), "有限公司")
+                        elif re.search("有限$", entity_text):
+                            entity_text = re.sub("有限$","有限公司",entity_text)
                 list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1: