|
@@ -1066,13 +1066,33 @@ def segment(soup,final=True):
|
|
text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
|
|
text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
|
|
#替换为中文分号
|
|
#替换为中文分号
|
|
text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
|
|
text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
|
|
- #替换"?"为 "" ,update:2021/7/20
|
|
|
|
- text = re.sub("?+","",text)
|
|
|
|
|
|
+ #替换"?"为 " " ,update:2021/7/20
|
|
|
|
+ text = re.sub("?+"," ",text)
|
|
|
|
|
|
|
|
|
|
#替换"""为"“",否则导入deepdive出错
|
|
#替换"""为"“",否则导入deepdive出错
|
|
text = text.replace('"',"“").replace("\r","").replace("\n",",")
|
|
text = text.replace('"',"“").replace("\r","").replace("\n",",")
|
|
- text = re.sub("\s{4,}",",",text)
|
|
|
|
|
|
+ # print('==1',text)
|
|
|
|
+ # text = re.sub("\s{4,}",",",text)
|
|
|
|
+ # 解决公告中的" "空格替换问题
|
|
|
|
+ if re.search("\s{4,}",text):
|
|
|
|
+ _text = ""
|
|
|
|
+ for _sent in re.split("。+",text):
|
|
|
|
+ for _sent2 in re.split(',+',_sent):
|
|
|
|
+ for _sent3 in re.split(":+",_sent2):
|
|
|
|
+ for _t in re.split("\s{4,}",_sent3):
|
|
|
|
+ if len(_t)<3:
|
|
|
|
+ _text += _t
|
|
|
|
+ else:
|
|
|
|
+ _text += ","+_t
|
|
|
|
+ _text += ":"
|
|
|
|
+ _text = _text[:-1]
|
|
|
|
+ _text += ","
|
|
|
|
+ _text = _text[:-1]
|
|
|
|
+ _text += "。"
|
|
|
|
+ _text = _text[:-1]
|
|
|
|
+ text = _text
|
|
|
|
+ # print('==2',text)
|
|
#替换标点
|
|
#替换标点
|
|
|
|
|
|
#替换连续的标点
|
|
#替换连续的标点
|
|
@@ -1451,7 +1471,17 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
key_preprocess = "tableToText"
|
|
key_preprocess = "tableToText"
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
|
|
article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
|
|
-
|
|
|
|
|
|
+ # 修正被","逗号分隔的时间
|
|
|
|
+ repair_time = re.compile("20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
|
|
|
|
+ "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d,?分?|"
|
|
|
|
+ "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
|
|
|
|
+ "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?|"
|
|
|
|
+ "[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
|
|
|
|
+ )
|
|
|
|
+ for _time in set(re.findall(repair_time,article_processed)):
|
|
|
|
+ if re.search(",",_time):
|
|
|
|
+ article_processed = article_processed.replace(_time,re.sub(",","",_time))
|
|
|
|
+ # print('re_rtime',re.findall(repair_time,article_processed))
|
|
# log(article_processed)
|
|
# log(article_processed)
|
|
|
|
|
|
if key_preprocess not in cost_time:
|
|
if key_preprocess not in cost_time:
|