|
@@ -8,7 +8,7 @@ import time
|
|
import codecs
|
|
import codecs
|
|
|
|
|
|
from BiddingKG.dl.ratio.re_ratio import extract_ratio
|
|
from BiddingKG.dl.ratio.re_ratio import extract_ratio
|
|
-# from BiddingKG.dl.table_head.predict import predict
|
|
|
|
|
|
+from BiddingKG.dl.table_head.predict import predict
|
|
|
|
|
|
sys.setrecursionlimit(1000000)
|
|
sys.setrecursionlimit(1000000)
|
|
sys.path.append(os.path.abspath("../.."))
|
|
sys.path.append(os.path.abspath("../.."))
|
|
@@ -117,7 +117,9 @@ def tableToText(soup):
|
|
if len(tds)==0:
|
|
if len(tds)==0:
|
|
tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
|
|
tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
|
|
for td in tds:
|
|
for td in tds:
|
|
|
|
+ # print("td", td)
|
|
tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
|
|
tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
|
|
|
|
+ # print("segment td", segment(td,final=False))
|
|
#tr_line.append([td.get_text(),0])
|
|
#tr_line.append([td.get_text(),0])
|
|
inner_table.append(tr_line)
|
|
inner_table.append(tr_line)
|
|
return inner_table
|
|
return inner_table
|
|
@@ -422,7 +424,11 @@ def tableToText(soup):
|
|
def set_head_model(inner_table):
|
|
def set_head_model(inner_table):
|
|
for i in range(len(inner_table)):
|
|
for i in range(len(inner_table)):
|
|
for j in range(len(inner_table[i])):
|
|
for j in range(len(inner_table[i])):
|
|
- inner_table[i][j] = inner_table[i][j][0]
|
|
|
|
|
|
+ # 删掉单格前后符号,以免影响表头预测
|
|
|
|
+ col = inner_table[i][j][0]
|
|
|
|
+ col = re.sub("^[^\u4e00-\u9fa5a-zA-Z0-9]+", "", col)
|
|
|
|
+ col = re.sub("[^\u4e00-\u9fa5a-zA-Z0-9]+$", "", col)
|
|
|
|
+ inner_table[i][j] = col
|
|
|
|
|
|
# 模型预测表头
|
|
# 模型预测表头
|
|
predict_list = predict(inner_table)
|
|
predict_list = predict(inner_table)
|
|
@@ -990,7 +996,7 @@ def tableToText(soup):
|
|
inner_table[h][w][0] = ""
|
|
inner_table[h][w][0] = ""
|
|
|
|
|
|
def trunTable(tbody,in_attachment):
|
|
def trunTable(tbody,in_attachment):
|
|
- # print(tbody.find('tbody'))
|
|
|
|
|
|
+ # print("tbody", tbody.find('tbody'))
|
|
# 附件中的表格,排除异常错乱的表格
|
|
# 附件中的表格,排除异常错乱的表格
|
|
if in_attachment:
|
|
if in_attachment:
|
|
if tbody.name=='table':
|
|
if tbody.name=='table':
|
|
@@ -1011,10 +1017,10 @@ def tableToText(soup):
|
|
if len(inner_table)>0 and len(inner_table[0])>0:
|
|
if len(inner_table)>0 and len(inner_table[0])>0:
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
- inner_table, head_list = setHead_initem(inner_table,pat_head)
|
|
|
|
- # inner_table, head_list = set_head_model(inner_table)
|
|
|
|
|
|
+ # inner_table, head_list = setHead_initem(inner_table,pat_head)
|
|
|
|
+ inner_table, head_list = set_head_model(inner_table)
|
|
# inner_table,head_list = setHead_incontext(inner_table,pat_head)
|
|
# inner_table,head_list = setHead_incontext(inner_table,pat_head)
|
|
- # print(inner_table)
|
|
|
|
|
|
+ # print("table_head", inner_table)
|
|
# for begin in range(len(head_list[:-1])):
|
|
# for begin in range(len(head_list[:-1])):
|
|
# for item in inner_table[head_list[begin]:head_list[begin+1]]:
|
|
# for item in inner_table[head_list[begin]:head_list[begin+1]]:
|
|
# print(item)
|
|
# print(item)
|