2 years ago · 4d95eed5c3
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1899,7 +1899,8 @@ def attachment_filelink(soup):
 
				                     break
			
 
				             if "filelink" in child.attrs and child['filelink'] in attachment_dict:
			
 
				                 if re.search(attachment_type,str(child.string).strip()) or \
			
 
				-                        ('original' in child.attrs and re.search(attachment_type,str(child['original']).strip())):
			
 
				+                        ('original' in child.attrs and re.search(attachment_type,str(child['original']).strip())) or \
			
 
				+                        ('href' in child.attrs and re.search(attachment_type,str(child['href']).strip())):
			
 
				                     # 附件插入正文标识
			
 
				                     child.insert_before("。##attachment_begin##")
			
 
				                     child.insert_after("。##attachment_end##")
			
@@ -1972,7 +1973,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				             for _match in re.finditer("\d。\d{2}",attachment_text):
			
 
				                 _match_text = _match.group()
			
 
				                 attachment_text = attachment_text.replace(_match_text,_match_text.replace("。","."),1)
			
 
				-            for _match in re.finditer("(\d，\d{3})[,，.]",attachment_text):
			
 
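				+            # Old pattern "(\d，\d{3})[,，.]" required (and consumed) a trailing , ， or . after the three digits; the lookahead below accepts any non-digit and consumes nothing after the comma.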
			
 
				+            for _match in re.finditer("\d，(?=\d{3}[^\d])",attachment_text):
			
 
				                 _match_text = _match.group()
			
 
				                 attachment_text = attachment_text.replace(_match_text,_match_text.replace("，",","),1)
			
 
				             article_processed_list[1] = attachment_text