2 years ago · c34a5aa603
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2114,7 +2114,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         doc_id = article[0]
			
 
				         sourceContent = article[1]
			
 
				         sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
			
 
				-
			
 
				+        sourceContent = re.sub("##attachment##","",sourceContent)
			
 
				         sourceContent = sourceContent.replace('<br/>', '<br>')
			
 
				         sourceContent = re.sub("<br>(\s{0,}<br>)+","<br>",sourceContent)
			
 
				         # for br_match in re.findall("[^>]+?<br>",sourceContent):
			
@@ -2493,16 +2493,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                     continue
			
 
				                 elif entity_type=="person" and len(entity_text)>10 and len(re.findall("[\u4e00-\u9fa5]",entity_text))<len(entity_text)/2:
			
 
				                     continue
			
 
				-                # 组织机构实体名称补充
			
 
				-                if entity_type in ["org", "company"]:
			
 
				-                    fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
			
 
				-                    if fix_name:
			
 
				-                        if len(fix_name.group(2))>0:
			
 
				-                            entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
			
 
				-                        elif len(fix_name.group(3))>0:
			
 
				-                            entity_text = entity_text.replace(fix_name.group(), "有限公司")
			
 
				-                        elif re.search("有限$", entity_text):
			
 
				-                            entity_text = re.sub("有限$","有限公司",entity_text)
			
 
				+
			
 
				 
			
 
				                 for j in range(len(list_tokenbegin)):
			
 
				                     if list_tokenbegin[j]==begin_index_temp:
			
@@ -2521,6 +2512,16 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                 #去掉标点符号
			
 
				                 entity_text = re.sub("[,，。：!&@$\*]","",entity_text)
			
 
				                 entity_text = entity_text.replace("(","（").replace(")","）") if isinstance(entity_text,str) else entity_text
			
 
				+                # 组织机构实体名称补充
			
 
				+                if entity_type in ["org", "company"]:
			
 
				+                    fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
			
 
				+                    if fix_name:
			
 
				+                        if len(fix_name.group(2))>0:
			
 
				+                            entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
			
 
				+                        elif len(fix_name.group(3))>0:
			
 
				+                            entity_text = entity_text.replace(fix_name.group(), "有限公司")
			
 
				+                        elif re.search("有限$", entity_text):
			
 
				+                            entity_text = re.sub("有限$","有限公司",entity_text)
			
 
				                 list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
			
 
				             # 标记文章末尾的"发布人”、“发布时间”实体
			
 
				             if sentence_index==len(list_sentence)-1: