|
@@ -257,6 +257,28 @@ def repair_entity(prem,district_dict,list_articles):
|
|
elif re.search("族$",city):
|
|
elif re.search("族$",city):
|
|
role['role_text'] = city + role_text
|
|
role['role_text'] = city + role_text
|
|
|
|
|
|
|
|
def fix_table_structure_preserve_order(html):
    """
    Repair <table> structures where <tr> rows sit as direct children of
    <table>, as siblings of a <tbody> (malformed HTML). All rows are moved
    into a single new <tbody>, preserving the original row order.

    :param html: HTML string to repair.
    :return: the repaired HTML as a string (re-serialized by BeautifulSoup).
    """
    soup = BeautifulSoup(html, 'html.parser')
    for table in soup.find_all('table'):
        # Guard clause: only touch tables that actually have stray <tr>
        # children; well-formed tables are left untouched.
        if not table.find_all('tr', recursive=False):
            continue
        # Snapshot the children BEFORE appending the new tbody, so the
        # loop below does not see (and try to move) the tag we insert.
        children = list(table.children)
        tbody_new = soup.new_tag('tbody')
        table.append(tbody_new)
        for child in children:
            if not child.name:
                # NavigableString (whitespace/text node) — leave in place.
                continue
            if child.name == 'tbody':
                # Unwrap an existing tbody: move its rows into the new
                # tbody in order, then drop the now-empty shell.
                for tag in list(child.children):
                    tbody_new.append(tag.extract())
                child.extract()
            else:
                # Stray <tr> (or other element child) — move it into the
                # new tbody at its original relative position.
                tbody_new.append(child.extract())
    return str(soup)
|
|
|
|
|
|
def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
|
|
def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
|
|
cost_time = dict()
|
|
cost_time = dict()
|
|
@@ -269,6 +291,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
|
|
log("start process doc %s"%(str(doc_id)))
|
|
log("start process doc %s"%(str(doc_id)))
|
|
# 字符编码标准化
|
|
# 字符编码标准化
|
|
text = str_normalize(text)
|
|
text = str_normalize(text)
|
|
|
|
+ text = fix_table_structure_preserve_order(text) # 20250331 修复表格tr tbody平级问题
|
|
list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
|
|
list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
|
|
log("get preprocessed done of doc_id%s"%(doc_id))
|
|
log("get preprocessed done of doc_id%s"%(doc_id))
|
|
cost_time["preprocess"] = round(time.time()-start_time,2)
|
|
cost_time["preprocess"] = round(time.time()-start_time,2)
|
|
@@ -489,7 +512,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
|
|
|
|
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
|
|
- version_date = {'version_date': '2025-03-27'}
|
|
|
|
|
|
+ version_date = {'version_date': '2025-03-31'}
|
|
data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
|
|
data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
|
|
|
|
|
|
if original_docchannel == 302:
|
|
if original_docchannel == 302:
|