|
@@ -384,34 +384,38 @@ def tableToText(soup):
|
|
set_item = set()
|
|
set_item = set()
|
|
height = len(inner_table)
|
|
height = len(inner_table)
|
|
width = len(inner_table[0])
|
|
width = len(inner_table[0])
|
|
|
|
+ empty_set = set()
|
|
for i in range(height):
|
|
for i in range(height):
|
|
for j in range(width):
|
|
for j in range(width):
|
|
item = inner_table[i][j][0]
|
|
item = inner_table[i][j][0]
|
|
- set_item.add(item)
|
|
|
|
|
|
+ if item.strip()=="":
|
|
|
|
+ empty_set.add(item)
|
|
|
|
+ else:
|
|
|
|
+ set_item.add(item)
|
|
list_item = list(set_item)
|
|
list_item = list(set_item)
|
|
- x = []
|
|
|
|
- for item in list_item:
|
|
|
|
- x.append(getPredictor("form").encode(item))
|
|
|
|
- predict_y = getPredictor("form").predict(np.array(x),type="item")
|
|
|
|
- _dict = dict()
|
|
|
|
-
|
|
|
|
- for item,values in zip(list_item,list(predict_y)):
|
|
|
|
- _dict[item] = values[1]
|
|
|
|
- # print("##",item,values)
|
|
|
|
- #print(_dict)
|
|
|
|
- for i in range(height):
|
|
|
|
- for j in range(width):
|
|
|
|
- item = inner_table[i][j][0]
|
|
|
|
- inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
|
|
|
|
|
|
+ if list_item:
|
|
|
|
+ x = []
|
|
|
|
+ for item in list_item:
|
|
|
|
+ x.append(getPredictor("form").encode(item))
|
|
|
|
+ predict_y = getPredictor("form").predict(np.array(x),type="item")
|
|
|
|
+ _dict = dict()
|
|
|
|
+
|
|
|
|
+ for item,values in zip(list_item,list(predict_y)):
|
|
|
|
+ _dict[item] = values[1]
|
|
|
|
+ # print("##",item,values)
|
|
|
|
+ #print(_dict)
|
|
|
|
+ for i in range(height):
|
|
|
|
+ for j in range(width):
|
|
|
|
+ item = inner_table[i][j][0]
|
|
|
|
+ if item not in empty_set:
|
|
|
|
+ inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
|
|
|
|
|
|
# print("=====")
|
|
# print("=====")
|
|
# for item in inner_table:
|
|
# for item in inner_table:
|
|
# print(item)
|
|
# print(item)
|
|
# print("======")
|
|
# print("======")
|
|
-
|
|
|
|
repairTable(inner_table)
|
|
repairTable(inner_table)
|
|
head_list = sliceTable(inner_table)
|
|
head_list = sliceTable(inner_table)
|
|
-
|
|
|
|
|
|
|
|
return inner_table,head_list
|
|
return inner_table,head_list
|
|
|
|
|
|
@@ -985,15 +989,28 @@ def tableToText(soup):
|
|
if inner_table[h][w][0]==fix_value:
|
|
if inner_table[h][w][0]==fix_value:
|
|
inner_table[h][w][0] = ""
|
|
inner_table[h][w][0] = ""
|
|
|
|
|
|
- def trunTable(tbody):
|
|
|
|
|
|
+ def trunTable(tbody,in_attachment):
|
|
|
|
+ # print(tbody.find('tbody'))
|
|
|
|
+ # 附件中的表格,排除异常错乱的表格
|
|
|
|
+ if in_attachment:
|
|
|
|
+ if tbody.name=='table':
|
|
|
|
+ _tbody = tbody.find('tbody')
|
|
|
|
+ else:
|
|
|
|
+ _tbody = tbody
|
|
|
|
+ _td_len_list = []
|
|
|
|
+ for _tr in _tbody.find_all(recursive=False):
|
|
|
|
+ len_td = len(_tr.find_all(recursive=False))
|
|
|
|
+ _td_len_list.append(len_td)
|
|
|
|
+ if len(list(set(_td_len_list)))>8:
|
|
|
|
+ return None
|
|
fixSpan(tbody)
|
|
fixSpan(tbody)
|
|
inner_table = getTable(tbody)
|
|
inner_table = getTable(tbody)
|
|
inner_table = fixTable(inner_table)
|
|
inner_table = fixTable(inner_table)
|
|
if len(inner_table)>0 and len(inner_table[0])>0:
|
|
if len(inner_table)>0 and len(inner_table[0])>0:
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
- # inner_table, head_list = setHead_initem(inner_table,pat_head)
|
|
|
|
- inner_table, head_list = set_head_model(inner_table)
|
|
|
|
|
|
+ inner_table, head_list = setHead_initem(inner_table,pat_head)
|
|
|
|
+ # inner_table, head_list = set_head_model(inner_table)
|
|
# inner_table,head_list = setHead_incontext(inner_table,pat_head)
|
|
# inner_table,head_list = setHead_incontext(inner_table,pat_head)
|
|
# print(inner_table)
|
|
# print(inner_table)
|
|
# for begin in range(len(head_list[:-1])):
|
|
# for begin in range(len(head_list[:-1])):
|
|
@@ -1033,20 +1050,36 @@ def tableToText(soup):
|
|
ul.get_text(), re.S)))>3:
|
|
ul.get_text(), re.S)))>3:
|
|
ul.extract()
|
|
ul.extract()
|
|
|
|
|
|
- tbodies = soup.find_all('table')
|
|
|
|
|
|
+ # tbodies = soup.find_all('table')
|
|
# 遍历表格中的每个tbody
|
|
# 遍历表格中的每个tbody
|
|
|
|
+ tbodies = []
|
|
|
|
+ in_attachment = False
|
|
|
|
+ for _part in soup.find_all():
|
|
|
|
+ if _part.name=='table':
|
|
|
|
+ tbodies.append((_part,in_attachment))
|
|
|
|
+ elif _part.name=='div':
|
|
|
|
+ if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
|
|
+ in_attachment = True
|
|
#逆序处理嵌套表格
|
|
#逆序处理嵌套表格
|
|
for tbody_index in range(1,len(tbodies)+1):
|
|
for tbody_index in range(1,len(tbodies)+1):
|
|
- tbody = tbodies[len(tbodies)-tbody_index]
|
|
|
|
- inner_table = trunTable(tbody)
|
|
|
|
|
|
+ tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
|
|
|
|
+ inner_table = trunTable(tbody,_in_attachment)
|
|
list_innerTable.append(inner_table)
|
|
list_innerTable.append(inner_table)
|
|
|
|
|
|
- tbodies = soup.find_all('tbody')
|
|
|
|
|
|
+ # tbodies = soup.find_all('tbody')
|
|
# 遍历表格中的每个tbody
|
|
# 遍历表格中的每个tbody
|
|
|
|
+ tbodies = []
|
|
|
|
+ in_attachment = False
|
|
|
|
+ for _part in soup.find_all():
|
|
|
|
+ if _part.name == 'tbody':
|
|
|
|
+ tbodies.append((_part, in_attachment))
|
|
|
|
+ elif _part.name == 'div':
|
|
|
|
+ if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
|
|
+ in_attachment = True
|
|
#逆序处理嵌套表格
|
|
#逆序处理嵌套表格
|
|
for tbody_index in range(1,len(tbodies)+1):
|
|
for tbody_index in range(1,len(tbodies)+1):
|
|
- tbody = tbodies[len(tbodies)-tbody_index]
|
|
|
|
- inner_table = trunTable(tbody)
|
|
|
|
|
|
+ tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
|
|
|
|
+ inner_table = trunTable(tbody,_in_attachment)
|
|
list_innerTable.append(inner_table)
|
|
list_innerTable.append(inner_table)
|
|
|
|
|
|
return soup
|
|
return soup
|
|
@@ -1785,15 +1818,20 @@ def article_limit(soup,limit_words=30000):
|
|
while n_soup:
|
|
while n_soup:
|
|
text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
|
|
text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
|
|
if len(_text_split[1])>limit_words:
|
|
if len(_text_split[1])>limit_words:
|
|
- attachment_text_nums = 0
|
|
|
|
- attachment_skip = False
|
|
|
|
- for part in attachment_part.find_all(recursive=False):
|
|
|
|
- if not attachment_skip:
|
|
|
|
- attachment_text_nums += len(re.sub(sub_space, "", part.get_text()))
|
|
|
|
- if attachment_text_nums>=limit_words:
|
|
|
|
- attachment_skip = True
|
|
|
|
- else:
|
|
|
|
- part.decompose()
|
|
|
|
|
|
+ # attachment_html纯文本,无子结构
|
|
|
|
+ if len(attachment_part.find_all(recursive=False))==0:
|
|
|
|
+ attachment_part.string = str(attachment_part.get_text())[:limit_words]
|
|
|
|
+ else:
|
|
|
|
+ attachment_text_nums = 0
|
|
|
|
+ attachment_skip = False
|
|
|
|
+ for part in attachment_part.find_all(recursive=False):
|
|
|
|
+ if not attachment_skip:
|
|
|
|
+ attachment_text_nums += len(re.sub(sub_space, "", part.get_text()))
|
|
|
|
+ if attachment_text_nums>=limit_words:
|
|
|
|
+ part.string = str(part.get_text())[:attachment_text_nums-limit_words]
|
|
|
|
+ attachment_skip = True
|
|
|
|
+ else:
|
|
|
|
+ part.decompose()
|
|
|
|
|
|
return soup
|
|
return soup
|
|
|
|
|
|
@@ -1843,8 +1881,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
# 正文和附件内容限制字数30000
|
|
# 正文和附件内容限制字数30000
|
|
article_processed = article_limit(article_processed,limit_words=30000)
|
|
article_processed = article_limit(article_processed,limit_words=30000)
|
|
article_processed = get_preprocessed_outline(article_processed)
|
|
article_processed = get_preprocessed_outline(article_processed)
|
|
|
|
+ # print('article_processed')
|
|
article_processed = tableToText(article_processed)
|
|
article_processed = tableToText(article_processed)
|
|
- # print(article_processed)
|
|
|
|
article_processed = segment(article_processed)
|
|
article_processed = segment(article_processed)
|
|
article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
|
|
article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
|
|
article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
|
|
article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
|