znj 2 years ago
parent
commit
4d95eed5c3
1 changed file with 4 additions and 2 deletions
  1. +4 -2
      BiddingKG/dl/interface/Preprocessing.py

+ 4 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -1899,7 +1899,8 @@ def attachment_filelink(soup):
                     break
             if "filelink" in child.attrs and child['filelink'] in attachment_dict:
                 if re.search(attachment_type,str(child.string).strip()) or \
-                        ('original' in child.attrs and re.search(attachment_type,str(child['original']).strip())):
+                        ('original' in child.attrs and re.search(attachment_type,str(child['original']).strip())) or \
+                        ('href' in child.attrs and re.search(attachment_type,str(child['href']).strip())):
                     # 附件插入正文标识
                     child.insert_before("。##attachment_begin##")
                     child.insert_after("。##attachment_end##")
@@ -1972,7 +1973,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             for _match in re.finditer("\d。\d{2}",attachment_text):
                 _match_text = _match.group()
                 attachment_text = attachment_text.replace(_match_text,_match_text.replace("。","."),1)
-            for _match in re.finditer("(\d,\d{3})[,,.]",attachment_text):
+            # for _match in re.finditer("(\d,\d{3})[,,.]",attachment_text):
+            for _match in re.finditer("\d,(?=\d{3}[^\d])",attachment_text):
                 _match_text = _match.group()
                 attachment_text = attachment_text.replace(_match_text,_match_text.replace(",",","),1)
             article_processed_list[1] = attachment_text