|
@@ -1899,7 +1899,8 @@ def attachment_filelink(soup):
|
|
|
break
|
|
|
if "filelink" in child.attrs and child['filelink'] in attachment_dict:
|
|
|
if re.search(attachment_type,str(child.string).strip()) or \
|
|
|
- ('original' in child.attrs and re.search(attachment_type,str(child['original']).strip())):
|
|
|
+ ('original' in child.attrs and re.search(attachment_type,str(child['original']).strip())) or \
|
|
|
+ ('href' in child.attrs and re.search(attachment_type,str(child['href']).strip())):
|
|
|
# Markers delimiting where the attachment is inserted into the body text
|
|
|
child.insert_before("。##attachment_begin##")
|
|
|
child.insert_after("。##attachment_end##")
|
|
@@ -1972,7 +1973,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
|
for _match in re.finditer("\d。\d{2}",attachment_text):
|
|
|
_match_text = _match.group()
|
|
|
attachment_text = attachment_text.replace(_match_text,_match_text.replace("。","."),1)
|
|
|
- for _match in re.finditer("(\d,\d{3})[,,.]",attachment_text):
|
|
|
+ # for _match in re.finditer("(\d,\d{3})[,,.]",attachment_text):
|
|
|
+ for _match in re.finditer("\d,(?=\d{3}[^\d])",attachment_text):
|
|
|
_match_text = _match.group()
|
|
|
attachment_text = attachment_text.replace(_match_text,_match_text.replace(",",","),1)
|
|
|
article_processed_list[1] = attachment_text
|