|
@@ -8,7 +8,7 @@ import time
|
|
import codecs
|
|
import codecs
|
|
|
|
|
|
from BiddingKG.dl.ratio.re_ratio import extract_ratio
|
|
from BiddingKG.dl.ratio.re_ratio import extract_ratio
|
|
-# from BiddingKG.dl.table_head.predict import predict
|
|
|
|
|
|
+from BiddingKG.dl.table_head.predict import predict
|
|
|
|
|
|
sys.setrecursionlimit(1000000)
|
|
sys.setrecursionlimit(1000000)
|
|
sys.path.append(os.path.abspath("../.."))
|
|
sys.path.append(os.path.abspath("../.."))
|
|
@@ -422,7 +422,11 @@ def tableToText(soup):
|
|
def set_head_model(inner_table):
|
|
def set_head_model(inner_table):
|
|
for i in range(len(inner_table)):
|
|
for i in range(len(inner_table)):
|
|
for j in range(len(inner_table[i])):
|
|
for j in range(len(inner_table[i])):
|
|
- inner_table[i][j] = inner_table[i][j][0]
|
|
|
|
|
|
+ # 删掉单格前后符号,以免影响表头预测
|
|
|
|
+ col = inner_table[i][j][0]
|
|
|
|
+ col = re.sub("^[^\u4e00-\u9fa5a-zA-Z0-9]+", "", col)
|
|
|
|
+ col = re.sub("[^\u4e00-\u9fa5a-zA-Z0-9]+$", "", col)
|
|
|
|
+ inner_table[i][j] = col
|
|
|
|
|
|
# 模型预测表头
|
|
# 模型预测表头
|
|
predict_list = predict(inner_table)
|
|
predict_list = predict(inner_table)
|
|
@@ -1012,9 +1016,9 @@ def tableToText(soup):
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
# inner_table, head_list = setHead_initem(inner_table,pat_head)
|
|
# inner_table, head_list = setHead_initem(inner_table,pat_head)
|
|
- # inner_table, head_list = set_head_model(inner_table)
|
|
|
|
- inner_table,head_list = setHead_incontext(inner_table,pat_head) # 发现setHead_initem挺多明显的表头识别不到,换回此方法
|
|
|
|
- # print(inner_table)
|
|
|
|
|
|
+ inner_table, head_list = set_head_model(inner_table)
|
|
|
|
+ # inner_table,head_list = setHead_incontext(inner_table,pat_head)
|
|
|
|
+ # print("table_head", inner_table)
|
|
# for begin in range(len(head_list[:-1])):
|
|
# for begin in range(len(head_list[:-1])):
|
|
# for item in inner_table[head_list[begin]:head_list[begin+1]]:
|
|
# for item in inner_table[head_list[begin]:head_list[begin+1]]:
|
|
# print(item)
|
|
# print(item)
|
|
@@ -1029,6 +1033,8 @@ def tableToText(soup):
|
|
|
|
|
|
|
|
|
|
tbody.string = getTableText(inner_table,head_list)
|
|
tbody.string = getTableText(inner_table,head_list)
|
|
|
|
+ table_max_len = 30000
|
|
|
|
+ tbody.string = tbody.string[:table_max_len]
|
|
#print(tbody.string)
|
|
#print(tbody.string)
|
|
tbody.name = "turntable"
|
|
tbody.name = "turntable"
|
|
return inner_table
|
|
return inner_table
|
|
@@ -1347,6 +1353,11 @@ def segment(soup,final=True):
|
|
text = _text
|
|
text = _text
|
|
# 附件标识前修改为句号,避免正文和附件内容混合在一起
|
|
# 附件标识前修改为句号,避免正文和附件内容混合在一起
|
|
text = re.sub("[^。](?=##attachment##)","。",text)
|
|
text = re.sub("[^。](?=##attachment##)","。",text)
|
|
|
|
+ text = re.sub("[^。](?=##attachment_begin##)","。",text)
|
|
|
|
+ text = re.sub("[^。](?=##attachment_end##)","。",text)
|
|
|
|
+ text = re.sub("##attachment_begin##。","##attachment_begin##",text)
|
|
|
|
+ text = re.sub("##attachment_end##。","##attachment_end##",text)
|
|
|
|
+
|
|
return text
|
|
return text
|
|
|
|
|
|
'''
|
|
'''
|
|
@@ -1862,9 +1873,43 @@ def article_limit(soup,limit_words=30000):
|
|
attachment_skip = True
|
|
attachment_skip = True
|
|
else:
|
|
else:
|
|
part.decompose()
|
|
part.decompose()
|
|
-
|
|
|
|
return soup
|
|
return soup
|
|
|
|
|
|
|
|
def attachment_filelink(soup):
    """Move recognised attachment content to the place where the body links to it.

    The attachment container is the first ``<div>`` whose ``class`` contains
    ``"richTextFetch"``. Its direct ``<div>`` children that carry a ``filemd5``
    attribute are the individual attachments. Every element located before the
    container that has a ``filelink`` attribute equal to one of those
    ``filemd5`` values, and whose text or ``original`` attribute ends with an
    image/spreadsheet extension, is replaced by the matching attachment element
    and surrounded by the text markers ``。##attachment_begin##`` and
    ``。##attachment_end##``.

    Each attachment is relocated at most once: when several links reference the
    same ``filemd5`` only the first link is replaced, so no empty
    begin/end marker pair is left behind for the remaining links.

    :param soup: parsed document tree (used through ``find_all``, ``attrs``,
        ``insert_before``, ``insert_after`` and ``replace_with``;
        presumably a BeautifulSoup object -- confirm against the caller)
    :return: the same ``soup`` object, modified in place
    """
    # Locate the attachment container (first div with class "richTextFetch").
    attachment_part = None
    for child in soup.find_all(recursive=True):
        if child.name == 'div' and "richTextFetch" in child.attrs.get('class', ()):
            attachment_part = child
            break
    if attachment_part is None:
        # No attachment section: nothing to relocate.
        return soup

    # Attachment types that are inlined: images and spreadsheets.
    # Raw string so that "\." is a regex escape, not an invalid string escape.
    attachment_type = re.compile(r"\.(?:png|jpg|jpeg|tif|bmp|xlsx|xls)$")

    # Map filemd5 -> attachment element, direct children of the container only.
    attachment_dict = dict()
    for _attachment in attachment_part.find_all(recursive=False):
        if _attachment.name == 'div' and 'filemd5' in _attachment.attrs:
            attachment_dict[_attachment['filemd5']] = _attachment

    for child in soup.find_all(recursive=True):
        # Stop at the attachment container: only elements before it are
        # treated as body links.
        if child.name == 'div' and "richTextFetch" in child.attrs.get('class', ()):
            break
        if 'filelink' not in child.attrs:
            continue
        filelink = child['filelink']
        if filelink not in attachment_dict:
            continue
        # The file name may be the link text or the "original" attribute.
        name_matches = attachment_type.search(str(child.string).strip()) or (
            'original' in child.attrs
            and attachment_type.search(str(child['original']).strip())
        )
        if not name_matches:
            continue
        # Mark the attachment boundaries in the body text, then put the
        # attachment element in place of the link. pop() ensures the same
        # element is not moved a second time by a later duplicate link.
        child.insert_before("。##attachment_begin##")
        child.insert_after("。##attachment_end##")
        child.replace_with(attachment_dict.pop(filelink))

    return soup
|
|
|
|
+
|
|
def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
'''
|
|
'''
|
|
:param articles: 待处理的article source html
|
|
:param articles: 待处理的article source html
|
|
@@ -1909,7 +1954,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
_soup.wrap(article_processed.new_tag("span"))
|
|
_soup.wrap(article_processed.new_tag("span"))
|
|
# print(article_processed)
|
|
# print(article_processed)
|
|
# 正文和附件内容限制字数30000
|
|
# 正文和附件内容限制字数30000
|
|
- article_processed = article_limit(article_processed,limit_words=30000)
|
|
|
|
|
|
+ article_processed = article_limit(article_processed, limit_words=30000)
|
|
|
|
+ # 把每个附件识别对应的html放回原来出现的位置
|
|
|
|
+ article_processed = attachment_filelink(article_processed)
|
|
|
|
+
|
|
article_processed = get_preprocessed_outline(article_processed)
|
|
article_processed = get_preprocessed_outline(article_processed)
|
|
# print('article_processed')
|
|
# print('article_processed')
|
|
article_processed = tableToText(article_processed)
|
|
article_processed = tableToText(article_processed)
|
|
@@ -1919,6 +1967,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
|
|
article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
|
|
article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
|
|
article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
|
|
article_processed = article_processed.replace('成交工程价款', '成交工程价') # 2021/12/21 修正为中标价
|
|
article_processed = article_processed.replace('成交工程价款', '成交工程价') # 2021/12/21 修正为中标价
|
|
|
|
+ article_processed = re.sub('任务(?=编号[::])', '项目',article_processed) # 2022/08/10 修正为项目编号
|
|
article_processed = article_processed.replace('招标(建设)单位', '招标单位') #2022/8/10 修正预测不到表达
|
|
article_processed = article_processed.replace('招标(建设)单位', '招标单位') #2022/8/10 修正预测不到表达
|
|
article_processed = re.sub('(招标|采购)人(概况|信息)[,。]', '采购人信息:', article_processed) # 2022/8/10统一表达
|
|
article_processed = re.sub('(招标|采购)人(概况|信息)[,。]', '采购人信息:', article_processed) # 2022/8/10统一表达
|
|
# 修复OCR金额中“,”、“。”识别错误
|
|
# 修复OCR金额中“,”、“。”识别错误
|
|
@@ -2096,16 +2145,23 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
|
|
#限流执行
|
|
#限流执行
|
|
key_nerToken = "nerToken"
|
|
key_nerToken = "nerToken"
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
- tokens_all = getTokens(sentences,useselffool=useselffool)
|
|
|
|
|
|
+ # tokens_all = getTokens(sentences,useselffool=useselffool)
|
|
|
|
+ tokens_all = getTokens([re.sub("##attachment_begin##|##attachment_end##","",_sen) for _sen in sentences],useselffool=useselffool)
|
|
if key_nerToken not in cost_time:
|
|
if key_nerToken not in cost_time:
|
|
cost_time[key_nerToken] = 0
|
|
cost_time[key_nerToken] = 0
|
|
cost_time[key_nerToken] += round(time.time()-start_time,2)
|
|
cost_time[key_nerToken] += round(time.time()-start_time,2)
|
|
|
|
|
|
in_attachment = False
|
|
in_attachment = False
|
|
for sentence_index in range(len(sentences)):
|
|
for sentence_index in range(len(sentences)):
|
|
- if sentence_index == attachment_begin_index:
|
|
|
|
- in_attachment = True
|
|
|
|
sentence_text = sentences[sentence_index]
|
|
sentence_text = sentences[sentence_index]
|
|
|
|
+ if re.search("##attachment_begin##",sentence_text):
|
|
|
|
+ in_attachment = True
|
|
|
|
+ sentence_text = re.sub("##attachment_begin##","",sentence_text)
|
|
|
|
+ elif re.search("##attachment_end##",sentence_text):
|
|
|
|
+ in_attachment = False
|
|
|
|
+ sentence_text = re.sub("##attachment_end##", "", sentence_text)
|
|
|
|
+ if sentence_index >= attachment_begin_index and attachment_begin_index!=-1:
|
|
|
|
+ in_attachment = True
|
|
tokens = tokens_all[sentence_index]
|
|
tokens = tokens_all[sentence_index]
|
|
|
|
|
|
#pos_tag = pos_all[sentence_index]
|
|
#pos_tag = pos_all[sentence_index]
|