Selaa lähdekoodia

Merge remote-tracking branch 'origin/master'

lsm 1 viikko sitten
vanhempi
commit
2805f0eb29
2 muutettua tiedostoa jossa 116 lisäystä ja 8 poistoa
  1. 112 7
      BiddingKG/dl/proposed_building/pb_extract.py
  2. 4 1
      BiddingKG/dl/time/re_servicetime.py

+ 112 - 7
BiddingKG/dl/proposed_building/pb_extract.py

@@ -1,3 +1,4 @@
+# coding:utf-8
 import copy
 import os
 import re
@@ -879,8 +880,16 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
 
     money_list = []
     all_before_sentence = ''
-    soup = BeautifulSoup(html, 'lxml')
-    tables_and_divs = soup.find_all(['table', 'div'])
+    # if 'pcontent' in html:
+    #     print('pcontent in html')
+    #     soup = BeautifulSoup(html, 'lxml').find("div",id="pcontent")
+    # else:
+    #     soup = BeautifulSoup(html, 'lxml')
+    # soup_text = soup.get_text()
+    # tables_and_divs = soup.find_all(['table', 'div'])
+    tables_and_divs = get_table_and_div(html)
+    # for t in tables_and_divs:
+    #     print('tttt', t)
     for i, sentence in enumerate(list_sentence):
         if show and i % 100 == 0:
             print('extract_several_money Loop', i, len(list_sentence), time.time() - start_time1)
@@ -913,6 +922,9 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
         _list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs)
         if show:
             print('extract_several_money time0.2', time.time() - start_time2)
+            print('get_several_money text', text)
+            # print('tables_and_divs', tables_and_divs)
+            print('get_several_money', _list)
             start_time2 = time.time()
         # logging.info('get_several_money _list ' + str(_list))
 
@@ -1306,11 +1318,13 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
             # _m = re.search(re.escape(_entity_text), str(_table))
             _m = re.search(re.escape(_entity_text), _table_text)
             if not _m:
-                _entity_text = _entity_text[1:]
                 _m = re.search(re.escape(_entity_text[1:]), _table_text)
+                if _m:
+                    _entity_text = _entity_text[1:]
             if not _m:
-                _entity_text = _entity_text[:-1]
                 _m = re.search(re.escape(_entity_text[:-1]), _table_text)
+                if _m:
+                    _entity_text = _entity_text[:-1]
             # print('_entity_text', _entity_text)
             if _m:
                 rows = _table.find_all('tr')
@@ -1331,7 +1345,7 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                         # 第一列是否含业绩
                         # if re.search(reg_yeji, str(cells[0])):
                         if re.search(reg_yeji, str(cells[0].get_text())):
-                            # logging.info('is_yeji_table 1')
+                            # print('is_yeji_table 1')
                             is_yeji = 1
                         # 前面几行是否有占多列的有业绩
                         else:
@@ -1342,7 +1356,7 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                                     #     print(re.search('业绩', str(row2)), str(row2))
                                     if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji,
                                                                                                   str(row2.get_text())):
-                                        # logging.info('is_yeji_table 2')
+                                        # print('is_yeji_table 2')
                                         is_yeji = 1
 
                         break
@@ -1350,7 +1364,13 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                 # 前面都找不到,那么找表格上方的两行
                 div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3 - 2):index3]]
                 if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
-                    # logging.info('is_yeji_table 3')
+                    # print('is_yeji_table 3')
+                    # # print('div_list', div_list)
+                    # print('_tables_and_divs[index3]', _tables_and_divs[index3])
+                    # print('_tables_and_divs[max(0, index3 - 2):index3]', _tables_and_divs[max(0, index3 - 2)])
+                    # print('_tables_and_divs[max(0, index3 - 2):index3]', _tables_and_divs[index3-1])
+                    # print('_table_text,', _table_text)
+                    # print('_entity_text', _entity_text)
                     is_yeji = 1
                 break
         if show:
@@ -1530,14 +1550,17 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
                 break
             yeji_table_flag = 0
             for k, v in _match.groupdict().items():
+                # print('kv k:', k, 'v:', v)
                 if v != "" and v is not None:
                     if k == 'text_key_word':
                         notSure = True
                     if k.split("_")[0] == "money":
                         entity_text = v
                         # if is_yeji_table(tables_and_divs, entity_text):
+                        # print('judge_yeji entity_text', entity_text)
                         if judge_yeji(len(sentence_text), sentence_text, 300, tables_and_divs, entity_text):
                             yeji_table_flag = 1
+                            # print('yeji_table_flag', yeji_table_flag)
                             break
                         # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
                         if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
@@ -1998,3 +2021,85 @@ class get_service_end:
             #     return ''
         except Exception as e:
             return '0'
+
+
+def get_table_and_div(html_content):
+    """Collect the tables and the table-free divs of an HTML document.
+
+    When the document contains a ``<div id="pcontent">`` the search is
+    limited to that container; otherwise the whole document is scanned.
+    ``<div>`` wrappers nested inside a table are removed (their contents are
+    kept) so that each table is returned as plain table markup.
+
+    :param html_content: HTML source as a string.
+    :return: list of HTML strings -- every ``<table>`` first, followed by
+        every ``<div>`` that does not contain a table. An empty list is
+        returned for empty input or when parsing fails.
+    """
+    if not html_content:
+        return []
+    try:
+        soup = BeautifulSoup(html_content, 'lxml')
+        if 'pcontent' in html_content:
+            # The substring may also occur in plain text or in another
+            # attribute, so only narrow the scope when the div really exists.
+            content_div = soup.find('div', id='pcontent')
+            if content_div is not None:
+                soup = content_div
+
+        tables = soup.find_all('table')
+        # Strip <div> wrappers inside tables while keeping their children.
+        # unwrap() removes the whole tag; the previous regex left the closing
+        # '>' of every opening <div ...> tag behind in the table text.
+        for table in tables:
+            for inner_div in table.find_all('div'):
+                inner_div.unwrap()
+
+        # NOTE(review): tables are listed before divs, so the result is not
+        # in document order -- confirm callers do not rely on the position of
+        # a div relative to a table.
+        extracted_elements = list(tables)
+        # A div is kept only if no table lives anywhere below it.
+        extracted_elements.extend(
+            div for div in soup.find_all('div') if div.find('table') is None
+        )
+        return [str(element) for element in extracted_elements]
+    except Exception:
+        # Best effort: malformed input must not break the extraction pipeline.
+        traceback.print_exc()
+        return []
+
+
+if __name__ == '__main__':
+    # Manual smoke check: run get_several_money on a single sample sentence
+    # (second positional argument found_yeji=0) and print whatever it returns.
+    rr = get_several_money(
+        '2.3项目估算总投资:449.3万元。',
+        0
+    )
+    print(rr)

+ 4 - 1
BiddingKG/dl/time/re_servicetime.py

@@ -422,7 +422,7 @@ def calculateLen(ss, i):
 def extract_servicetime(text):
     list_servicetime = []
     word_list, text_index_list, prob = re_service_time(text)
-    # print(word_list, text_index_list)
+    # print('extract_servicetime', word_list, text_index_list)
     for i in range(len(text_index_list)):
         d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1], "prob": prob}
         if len(word_list[i]) <= 35:
@@ -454,6 +454,9 @@ def test_from_str():
     # s = '第十四条,服务时间:2023.12-2024.12服务时间'
 #     s = ''',莆田市财政局走廊及卫生间吊顶改造工程中标结果公告,莆田市财政局走廊及卫生间吊顶改造工程,工程预算价236878元,发包价194240元,招标编号为:宏福莆招字【2020】H001号,该项目招标方式为:邀请招标。2020年04月07日开标,2020年04月07日评标完成,中标主要结果公示如下:中标人名称,福建省东海伟业建设有限公司,中标价:194240元,评标办法,随机抽取法,资格评审结果,注册建造师:合格:余爱华(注册编号:闽235141578763),履约保证金(元):合格:合同金额的10%,施工工期:14日历天,工程质量,备注,被确定为废标、无效标的投标人及原因:合格:无废标,资格审查小组:合格:王宗仙、林慧灵、谢淑青,根据评标结果确定福建省东海伟业建设有限公司为中标人,现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为,可在公示期内向相关主管部门投诉,招标单位:招标代理机构:莆田市财政局,福建省宏福工程管理有限公司,联系电话:0594-2694413,联系电话:15160467775,2020年04月08日,2020年04月08日,
 # '''
+    s = """
+履约期限:2025年03月12日至2028年03月19日
+    """
     print(extract_servicetime(s))
     # print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-~~起至到—]+\d{2,4}[-.年/]', s))