@@ -1,3 +1,4 @@
+# coding:utf-8
 import copy
 import os
 import re
@@ -879,8 +880,16 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):

     money_list = []
     all_before_sentence = ''
-    soup = BeautifulSoup(html, 'lxml')
-    tables_and_divs = soup.find_all(['table', 'div'])
+    # if 'pcontent' in html:
+    #     print('pcontent in html')
+    #     soup = BeautifulSoup(html, 'lxml').find("div", id="pcontent")
+    # else:
+    #     soup = BeautifulSoup(html, 'lxml')
+    # soup_text = soup.get_text()
+    # tables_and_divs = soup.find_all(['table', 'div'])
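+    # The element list is now built by get_table_and_div() (added at the bottom of this
+    # file), which strips <div> tags inside tables and skips wrapper <div>s around tables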
+    tables_and_divs = get_table_and_div(html)
+    # for t in tables_and_divs:
+    #     print('tttt', t)
     for i, sentence in enumerate(list_sentence):
         if show and i % 100 == 0:
             print('extract_several_money Loop', i, len(list_sentence), time.time() - start_time1)
@@ -913,6 +922,9 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
         _list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs)
         if show:
             print('extract_several_money time0.2', time.time() - start_time2)
+            print('get_several_money text', text)
+            # print('tables_and_divs', tables_and_divs)
+            print('get_several_money', _list)
             start_time2 = time.time()
         # logging.info('get_several_money _list ' + str(_list))

@@ -1306,11 +1318,13 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
         # _m = re.search(re.escape(_entity_text), str(_table))
         _m = re.search(re.escape(_entity_text), _table_text)
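+        # If the full entity text is not found in the table, retry with its first or last
+        # character trimmed, and only keep the trimmed form when that retry matches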
         if not _m:
-            _entity_text = _entity_text[1:]
             _m = re.search(re.escape(_entity_text[1:]), _table_text)
+            if _m:
+                _entity_text = _entity_text[1:]
         if not _m:
-            _entity_text = _entity_text[:-1]
             _m = re.search(re.escape(_entity_text[:-1]), _table_text)
+            if _m:
+                _entity_text = _entity_text[:-1]
         # print('_entity_text', _entity_text)
         if _m:
             rows = _table.find_all('tr')
@@ -1331,7 +1345,7 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                 # Does the first column mention performance (业绩)?
                 # if re.search(reg_yeji, str(cells[0])):
                 if re.search(reg_yeji, str(cells[0].get_text())):
-                    # logging.info('is_yeji_table 1')
+                    # print('is_yeji_table 1')
                     is_yeji = 1
                 # Do the first few rows have a column-spanning cell that mentions performance (业绩)?
                 else:
@@ -1342,7 +1356,7 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                     # print(re.search('业绩', str(row2)), str(row2))
                     if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji,
                                                                                   str(row2.get_text())):
-                        # logging.info('is_yeji_table 2')
+                        # print('is_yeji_table 2')
                         is_yeji = 1

                         break
@@ -1350,7 +1364,13 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
             # If nothing matched above, look at the two elements right before the table
             div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3 - 2):index3]]
             if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
-                # logging.info('is_yeji_table 3')
+                # print('is_yeji_table 3')
+                # # print('div_list', div_list)
+                # print('_tables_and_divs[index3]', _tables_and_divs[index3])
+                # print('_tables_and_divs[max(0, index3 - 2):index3]', _tables_and_divs[max(0, index3 - 2)])
+                # print('_tables_and_divs[max(0, index3 - 2):index3]', _tables_and_divs[index3-1])
+                # print('_table_text,', _table_text)
+                # print('_entity_text', _entity_text)
                 is_yeji = 1
             break
         if show:
@@ -1530,14 +1550,17 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
                 break
         yeji_table_flag = 0
         for k, v in _match.groupdict().items():
+            # print('kv k:', k, 'v:', v)
             if v != "" and v is not None:
                 if k == 'text_key_word':
                     notSure = True
                 if k.split("_")[0] == "money":
                     entity_text = v
                     # if is_yeji_table(tables_and_divs, entity_text):
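+                    # Check whether this amount sits inside a performance (业绩) table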
+                    # print('judge_yeji entity_text', entity_text)
                     if judge_yeji(len(sentence_text), sentence_text, 300, tables_and_divs, entity_text):
                         yeji_table_flag = 1
+                        # print('yeji_table_flag', yeji_table_flag)
                         break
         # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
         if entity_text.endswith(',00'):  # an amount never ends in ",00" after the comma: the decimal point was misread, so drop it
@@ -1998,3 +2021,85 @@ class get_service_end:
             # return ''
         except Exception as e:
             return '0'
+
+
+def get_table_and_div(html_content):
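+    """Extract the top-level <table> elements and the table-free <div> elements of html_content.
+
+    Prefers the <div id="pcontent"> container when it exists, strips <div> tags that sit
+    inside tables, and skips any <div> that contains or wraps a table so the same text is
+    not collected twice. Returns the kept elements as HTML strings (tables first, then divs).
+    """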
+    try:
+        if 'pcontent' in html_content:
+            # print('pcontent in html')
+            soup = BeautifulSoup(html_content, 'lxml').find("div", id="pcontent")
+        else:
+            soup = BeautifulSoup(html_content, 'lxml')
+
+        # Holds the extracted elements
+        extracted_elements = []
+
+        # Collect every standalone <table> and record its position
+        tables = soup.find_all('table')
+        # Strip the <div> tags that appear inside tables
+        tables_to_replace = []
+        for table in tables:
+            table_no_div = re.sub('<div[^>]*>|</div>', '', str(table))
+            table_no_div = BeautifulSoup(table_no_div, 'lxml')
+            tables_to_replace.append([table, table_no_div])
+        for original_table, cleaned_table in tables_to_replace:
+            # cleaned_table_soup = BeautifulSoup(cleaned_table_content, 'lxml')
+            original_table.replace_with(cleaned_table)
+
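+        # Re-query the tables: replace_with() swapped in freshly parsed copies above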
+        tables = soup.find_all('table')
+        for table in tables:
+            #
+            # table_no_div = re.sub('<div\b[^>]|</div>', '', str(table))
+            # table_no_div = BeautifulSoup(table_no_div, 'lxml')
+
+            # Check whether the <table> is nested inside a <div>
+            parent_div = table.find_parent('div')
+            if parent_div:
+                # If it is nested in a <div>, mark that <div> as processed
+                # print('table parent is div')
+                # If it is nested in a <div>, mark every ancestor <div> as processed
+                current_div = parent_div
+                while current_div:
+                    current_div['data-processed'] = 'true'
+                    current_div = current_div.find_parent('div')
+            # else:
+            # If the <table> is standalone, extract its content
+            # print('table parent is not div')
+            extracted_elements.append(table)
+
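+        # Every table is kept; the 'data-processed' flag only prevents the wrapper <div>s
+        # from being collected again in the loop below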
+        # Collect every qualifying <div>
+        divs = soup.find_all('div')
+        for div in divs:
+            # Skip <div>s that contain a <table> (already handled above)
+            if div.find('table') is None and 'data-processed' not in div.attrs:
+                extracted_elements.append(div)
+
+        # Extract the content in order
+        table_div_list = []
+        for element in extracted_elements:
+            if element.name == 'table':
+                table_div_list.append(str(element))
+                # print("Table content:")
+                # Ignore nested <div>s when extracting the table content
+                # for row in element.find_all('tr'):
+                #     for cell in row.find_all('td'):
+                #         # Remove the nested <div> content
+                #         for div in cell.find_all('div'):
+                #             div.decompose()
+                #         print(cell.get_text(strip=True))
+            elif element.name == 'div':
+                table_div_list.append(str(element))
+                # print("Div content:")
+                # print(element.get_text(strip=True))
+        return table_div_list
+    except Exception:
+        traceback.print_exc()
+        return []
+
+
+if __name__ == '__main__':
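+    # Quick manual check: the sentence contains a single amount (449.3万元), which
+    # get_several_money is expected to pick up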
+    rr = get_several_money(
+        '2.3项目估算总投资:449.3万元。',
+        0
+    )
+    print(rr)
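+    # A commented-out sketch for eyeballing the new helper on a made-up HTML fragment
+    # (the markup below is hypothetical test data, not taken from a real announcement):
+    # print(get_table_and_div('<div id="pcontent"><table><tr><td>金额:100万元</td></tr></table></div>'))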