|
@@ -1472,10 +1472,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
|
|
article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
|
|
# 修正被","逗号分隔的时间
|
|
# 修正被","逗号分隔的时间
|
|
- repair_time = re.compile("[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
|
|
|
|
- "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d,?分?|"
|
|
|
|
- "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
|
|
|
|
- "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?|"
|
|
|
|
|
|
+ repair_time = re.compile("[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[日号]?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
|
|
|
|
+ "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[日号]?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d分?|"
|
|
|
|
+ "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[日号]?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
|
|
|
|
+ "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[日号]|"
|
|
"[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
|
|
"[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
|
|
)
|
|
)
|
|
for _time in set(re.findall(repair_time,article_processed)):
|
|
for _time in set(re.findall(repair_time,article_processed)):
|