|
@@ -419,7 +419,7 @@ def tableToText(soup):
|
|
else:
|
|
else:
|
|
inner_table[_h][_w][1] = 0
|
|
inner_table[_h][_w][1] = 0
|
|
_item = inner_table[_h][_w][0]
|
|
_item = inner_table[_h][_w][0]
|
|
- if re.search(pat_head,_item) is not None and len(item)<8:
|
|
|
|
|
|
+ if re.search(pat_head,_item) is not None and len(_item)<8:
|
|
inner_table[_h][_w][1] = 1
|
|
inner_table[_h][_w][1] = 1
|
|
|
|
|
|
# print("=====")
|
|
# print("=====")
|
|
@@ -430,7 +430,7 @@ def tableToText(soup):
|
|
width = len(inner_table[0])
|
|
width = len(inner_table[0])
|
|
for i in range(height):
|
|
for i in range(height):
|
|
for j in range(width):
|
|
for j in range(width):
|
|
- if re.search("[::]$", inner_table[i][j][0]):
|
|
|
|
|
|
+ if re.search("[::]$", inner_table[i][j][0]) and len(inner_table[i][j][0])<8:
|
|
inner_table[i][j][1] = 1
|
|
inner_table[i][j][1] = 1
|
|
|
|
|
|
repairTable(inner_table)
|
|
repairTable(inner_table)
|
|
@@ -1472,15 +1472,24 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
|
|
article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
|
|
# 修正被","逗号分隔的时间
|
|
# 修正被","逗号分隔的时间
|
|
- repair_time = re.compile("20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
|
|
|
|
- "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d,?分?|"
|
|
|
|
- "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
|
|
|
|
- "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?|"
|
|
|
|
|
|
+ repair_time = re.compile("[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
|
|
|
|
+ "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d,?分?|"
|
|
|
|
+ "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
|
|
|
|
+ "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?|"
|
|
"[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
|
|
"[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
|
|
)
|
|
)
|
|
for _time in set(re.findall(repair_time,article_processed)):
|
|
for _time in set(re.findall(repair_time,article_processed)):
|
|
if re.search(",",_time):
|
|
if re.search(",",_time):
|
|
- article_processed = article_processed.replace(_time,re.sub(",","",_time))
|
|
|
|
|
|
+ _time2 = re.sub(",", "", _time)
|
|
|
|
+ item = re.search("[12]\d{3}[-—-―/][0-1]?\d[-—-―/][0-3]\d(?=\d)", _time2)
|
|
|
|
+ if item:
|
|
|
|
+ _time2 = _time2.replace(item.group(),item.group() + " ")
|
|
|
|
+ article_processed = article_processed.replace(_time, _time2)
|
|
|
|
+ else:
|
|
|
|
+ item = re.search("[12]\d{3}[-—-―/][0-1]?\d[-—-―/][0-3]\d(?=\d)", _time)
|
|
|
|
+ if item:
|
|
|
|
+ _time2 = _time.replace(item.group(),item.group() + " ")
|
|
|
|
+ article_processed = article_processed.replace(_time, _time2)
|
|
# print('re_rtime',re.findall(repair_time,article_processed))
|
|
# print('re_rtime',re.findall(repair_time,article_processed))
|
|
# log(article_processed)
|
|
# log(article_processed)
|
|
|
|
|