@@ -1,3 +1,4 @@
+# coding:utf-8
 import copy
 import os
 import re
@@ -879,8 +880,16 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):

     money_list = []
     all_before_sentence = ''
-    soup = BeautifulSoup(html, 'lxml')
-    tables_and_divs = soup.find_all(['table', 'div'])
+    # if 'pcontent' in html:
+    #     print('pcontent in html')
+    #     soup = BeautifulSoup(html, 'lxml').find("div", id="pcontent")
+    # else:
+    #     soup = BeautifulSoup(html, 'lxml')
+    # soup_text = soup.get_text()
+    # tables_and_divs = soup.find_all(['table', 'div'])
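+    # The element list is now built by get_table_and_div() (added at the bottom of this
+    # file), which strips <div> tags inside tables and skips wrapper <div>s around tables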
+    tables_and_divs = get_table_and_div(html)
+    # for t in tables_and_divs:
+    #     print('tttt', t)
     for i, sentence in enumerate(list_sentence):
         if show and i % 100 == 0:
             print('extract_several_money Loop', i, len(list_sentence), time.time() - start_time1)
@@ -913,6 +922,9 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
         _list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs)
         if show:
             print('extract_several_money time0.2', time.time() - start_time2)
+            print('get_several_money text', text)
+            # print('tables_and_divs', tables_and_divs)
+            print('get_several_money', _list)
             start_time2 = time.time()
         # logging.info('get_several_money _list ' + str(_list))

@@ -1306,11 +1318,13 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
         # _m = re.search(re.escape(_entity_text), str(_table))
         _m = re.search(re.escape(_entity_text), _table_text)
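+        # If the full entity text is not found in the table, retry with its first or last
+        # character trimmed, and only keep the trimmed form when that retry matches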
         if not _m:
-            _entity_text = _entity_text[1:]
             _m = re.search(re.escape(_entity_text[1:]), _table_text)
+            if _m:
+                _entity_text = _entity_text[1:]
         if not _m:
-            _entity_text = _entity_text[:-1]
             _m = re.search(re.escape(_entity_text[:-1]), _table_text)
+            if _m:
+                _entity_text = _entity_text[:-1]
         # print('_entity_text', _entity_text)
         if _m:
             rows = _table.find_all('tr')
@@ -1331,7 +1345,7 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                 # Does the first column mention performance (业绩)?
                 # if re.search(reg_yeji, str(cells[0])):
                 if re.search(reg_yeji, str(cells[0].get_text())):
-                    # logging.info('is_yeji_table 1')
+                    # print('is_yeji_table 1')
                     is_yeji = 1
                 # Do the first few rows have a column-spanning cell that mentions performance (业绩)?
                 else:
@@ -1342,7 +1356,7 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                     # print(re.search('业绩', str(row2)), str(row2))
                     if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji,
                                                                                   str(row2.get_text())):
-                        # logging.info('is_yeji_table 2')
+                        # print('is_yeji_table 2')
                         is_yeji = 1

                         break
@@ -1350,7 +1364,13 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
             # If nothing matched above, look at the two elements right before the table
             div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3 - 2):index3]]
             if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
-                # logging.info('is_yeji_table 3')
+                # print('is_yeji_table 3')
+                # # print('div_list', div_list)
+                # print('_tables_and_divs[index3]', _tables_and_divs[index3])
+                # print('_tables_and_divs[max(0, index3 - 2):index3]', _tables_and_divs[max(0, index3 - 2)])
+                # print('_tables_and_divs[max(0, index3 - 2):index3]', _tables_and_divs[index3-1])
+                # print('_table_text,', _table_text)
+                # print('_entity_text', _entity_text)
                 is_yeji = 1
             break
         if show:
@@ -1530,14 +1550,17 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
                 break
         yeji_table_flag = 0
         for k, v in _match.groupdict().items():
+            # print('kv k:', k, 'v:', v)
             if v != "" and v is not None:
                 if k == 'text_key_word':
                     notSure = True
                 if k.split("_")[0] == "money":
                     entity_text = v
                     # if is_yeji_table(tables_and_divs, entity_text):
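+                    # Check whether this amount sits inside a performance (业绩) table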
+                    # print('judge_yeji entity_text', entity_text)
                     if judge_yeji(len(sentence_text), sentence_text, 300, tables_and_divs, entity_text):
                         yeji_table_flag = 1
+                        # print('yeji_table_flag', yeji_table_flag)
                         break
         # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
         if entity_text.endswith(',00'):  # an amount never ends in ",00" after the comma: the decimal point was misread, so drop it
@@ -1998,3 +2021,85 @@ class get_service_end:
             # return ''
         except Exception as e:
             return '0'
+
+
+def get_table_and_div(html_content):
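+    """Extract the top-level <table> elements and the table-free <div> elements of html_content.
+
+    Prefers the <div id="pcontent"> container when it exists, strips <div> tags that sit
+    inside tables, and skips any <div> that contains or wraps a table so the same text is
+    not collected twice. Returns the kept elements as HTML strings (tables first, then divs).
+    """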
+    try:
+        if 'pcontent' in html_content:
+            # print('pcontent in html')
+            soup = BeautifulSoup(html_content, 'lxml').find("div", id="pcontent")
+        else:
+            soup = BeautifulSoup(html_content, 'lxml')
+
+        # Holds the extracted elements
+        extracted_elements = []
+
+        # Collect every standalone <table> and record its position
+        tables = soup.find_all('table')
+        # Strip the <div> tags that appear inside tables
+        tables_to_replace = []
+        for table in tables:
+            table_no_div = re.sub('<div[^>]*>|</div>', '', str(table))
+            table_no_div = BeautifulSoup(table_no_div, 'lxml')
+            tables_to_replace.append([table, table_no_div])
+        for original_table, cleaned_table in tables_to_replace:
+            # cleaned_table_soup = BeautifulSoup(cleaned_table_content, 'lxml')
+            original_table.replace_with(cleaned_table)
+
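+        # Re-query the tables: replace_with() swapped in freshly parsed copies above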
+        tables = soup.find_all('table')
+        for table in tables:
+            #
+            # table_no_div = re.sub('<div\b[^>]|</div>', '', str(table))
+            # table_no_div = BeautifulSoup(table_no_div, 'lxml')
+
+            # Check whether the <table> is nested inside a <div>
+            parent_div = table.find_parent('div')
+            if parent_div:
+                # If it is nested in a <div>, mark that <div> as processed
+                # print('table parent is div')
+                # If it is nested in a <div>, mark every ancestor <div> as processed
+                current_div = parent_div
+                while current_div:
+                    current_div['data-processed'] = 'true'
+                    current_div = current_div.find_parent('div')
+            # else:
+            # If the <table> is standalone, extract its content
+            # print('table parent is not div')
+            extracted_elements.append(table)
+
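+        # Every table is kept; the 'data-processed' flag only prevents the wrapper <div>s
+        # from being collected again in the loop below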
+        # Collect every qualifying <div>
+        divs = soup.find_all('div')
+        for div in divs:
+            # Skip <div>s that contain a <table> (already handled above)
+            if div.find('table') is None and 'data-processed' not in div.attrs:
+                extracted_elements.append(div)
+
+        # Extract the content in order
+        table_div_list = []
+        for element in extracted_elements:
+            if element.name == 'table':
+                table_div_list.append(str(element))
+                # print("Table content:")
+                # Ignore nested <div>s when extracting the table content
+                # for row in element.find_all('tr'):
+                #     for cell in row.find_all('td'):
+                #         # Remove the nested <div> content
+                #         for div in cell.find_all('div'):
+                #             div.decompose()
+                #         print(cell.get_text(strip=True))
+            elif element.name == 'div':
+                table_div_list.append(str(element))
+                # print("Div content:")
+                # print(element.get_text(strip=True))
+        return table_div_list
+    except Exception:
+        traceback.print_exc()
+        return []
+
+
+if __name__ == '__main__':
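+    # Quick manual check: the sentence contains a single amount (449.3万元), which
+    # get_several_money is expected to pick up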
+    rr = get_several_money(
+        '2.3项目估算总投资:449.3万元。',
+        0
+    )
+    print(rr)
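+    # A commented-out sketch for eyeballing the new helper on a made-up HTML fragment
+    # (the markup below is hypothetical test data, not taken from a real announcement):
+    # print(get_table_and_div('<div id="pcontent"><table><tr><td>金额:100万元</td></tr></table></div>'))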