|
@@ -1048,8 +1048,12 @@ def tableToText(soup):
|
|
|
_td_len_list.append(len_td)
|
|
|
if _td_len_list:
|
|
|
if len(list(set(_td_len_list))) >= 8 or max(_td_len_list) > 100:
|
|
|
+ string_list = [re.sub("\s+","",i)for i in tbody.strings if i and i!='\n']
|
|
|
+ tbody.string = ",".join(string_list)
|
|
|
+ table_max_len = 30000
|
|
|
+ tbody.string = tbody.string[:table_max_len]
|
|
|
+ tbody.name = "turntable"
|
|
|
return None
|
|
|
-
|
|
|
# fixSpan(tbody)
|
|
|
# inner_table = getTable(tbody)
|
|
|
# inner_table = fixTable(inner_table)
|
|
@@ -1059,7 +1063,8 @@ def tableToText(soup):
|
|
|
inner_table = fixTable(inner_table)
|
|
|
|
|
|
if inner_table == []:
|
|
|
- tbody.string = segment(tbody,final=False)
|
|
|
+ string_list = [re.sub("\s+", "", i) for i in tbody.strings if i and i != '\n']
|
|
|
+ tbody.string = ",".join(string_list)
|
|
|
table_max_len = 30000
|
|
|
tbody.string = tbody.string[:table_max_len]
|
|
|
# log('异常表格直接取全文')
|
|
@@ -1119,7 +1124,7 @@ def tableToText(soup):
|
|
|
tag.extract()
|
|
|
for ul in soup.find_all('ul'): #例子 156439663 多个不同channel 类别的标题
|
|
|
if ul.find_all('li') == ul.findChildren(recursive=False) and len(set(re.findall(
|
|
|
- '招标公告|中标结果公示|中标候选人公示|招标答疑|开标评标|合同履?约?公示|开标评标|资格评审',
|
|
|
+ '招标公告|中标结果公示|中标候选人公示|招标答疑|开标评标|合同履?约?公示|资格评审',
|
|
|
ul.get_text(), re.S)))>3:
|
|
|
ul.extract()
|
|
|
|
|
@@ -1307,7 +1312,6 @@ def segment(soup,final=True):
|
|
|
commaList = ["div","br","td","p","li"]
|
|
|
#commaList = []
|
|
|
spaceList = ["span"]
|
|
|
-
|
|
|
tbodies = soup.find_all('tbody')
|
|
|
if len(tbodies) == 0:
|
|
|
tbodies = soup.find_all('table')
|