Forráskód Böngészése

优化拟在建字段提取

fangjiasheng 1 éve
szülő
commit
bd9e682295

+ 4 - 4
BiddingKG/dl/interface/extract.py

@@ -353,14 +353,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     getAttributes.confirm_prem(prem[0]['prem'], channel_dic)
 
     # 提取拟在建所需字段
-    # start_time = time.time()
-    # pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
-    # cost_time["pb_extract"] = round(time.time() - start_time, 2)
+    start_time = time.time()
+    pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
+    cost_time["pb_extract"] = round(time.time() - start_time, 2)
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
     version_date = {'version_date': '2024-04-30'}
-    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     '''最终检查修正招标、中标金额'''
     getAttributes.limit_maximum_amount(data_res, list_entitys[0])

+ 12 - 61
BiddingKG/dl/proposed_building/pb_extract.py

@@ -876,8 +876,6 @@ def extract_several_money(list_sentence, html='', is_obj=True):
 
 
 def extract_max_floor(content, html=None):
-    from bs4 import BeautifulSoup
-
     def match_floor(_reg, _content, _reg2=None, _tables_and_divs=None):
         _match = re.finditer(_reg, _content)
         _floor_list = []
@@ -977,8 +975,6 @@ def extract_max_floor(content, html=None):
 
 
 def extract_structure(content, html=None, structure_keyword_list=None):
-    from bs4 import BeautifulSoup
-
     # reg = '框架结构|钢框架结构|混凝土框架结构|剪力墙结构|框架-剪力墙结构|框架+剪力墙结构|框架和剪力墙结构|框架及剪力墙结构|混凝土剪力墙结构|筒体结构|桅式结构|墙板结构|膜结构|悬索结构|板柱结构|充气结构|网架结构|壳体结构|拱形结构|穹顶结构|混凝土结构|钢筋混凝土框架结构|钢筋混凝土筒仓结构|钢结构|砌体结构|木结构|砖混结构|排架结构|束筒结构|薄壳结构|钢混结构|砖木结构|砌体结构|钢砼结构|框剪结构|钢筋混凝土框架结构|筒中筒结构|框筒结构|桁架结构|拱券结构|钢筋混凝土结构|框架核心筒结构|门式钢架结构|门钢结构|轻钢结构|钢-混凝土框架结构|木框架结构|空间网格结构|框架筒体结构|砖拱结构|钢筋砼结构|核心筒结构|框架-核心筒结构'
     reg1 = '(结构(楼层|)(形式|类型|类别|体系|结构)[为是::])([^结]{2,8}结构)'
 
@@ -1187,17 +1183,20 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
         is_yeji = 0
         reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成|中标单位信息|评标情况|类似项目|资质|候选人情况'
         for index3, obj in enumerate(_tables_and_divs):
-            if '<tr' not in str(obj):
+            # if '<tr' not in str(obj):
+            if obj.find('tr'):
                 continue
 
             _table = obj
-            _m = re.search(re.escape(_entity_text), str(_table))
+            _table_text = str(_table.get_text())
+            # _m = re.search(re.escape(_entity_text), str(_table))
+            _m = re.search(re.escape(_entity_text), _table_text)
             if not _m:
                 _entity_text = _entity_text[1:]
-                _m = re.search(re.escape(_entity_text[1:]), str(_table))
+                _m = re.search(re.escape(_entity_text[1:]), _table_text)
             if not _m:
                 _entity_text = _entity_text[:-1]
-                _m = re.search(re.escape(_entity_text[:-1]), str(_table))
+                _m = re.search(re.escape(_entity_text[:-1]), _table_text)
             # print('_entity_text', _entity_text)
             if _m:
                 rows = _table.find_all('tr')
@@ -1209,14 +1208,15 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                 # print('max_col_span', max_col_span)
 
                 for index, row in enumerate(rows):
-                    if re.search(re.escape(_entity_text), str(row)):
+                    if re.search(re.escape(_entity_text), str(row.get_text())):
                         cells = row.find_all('td')
                         if len(cells) == 0:
                             continue
 
                         # print('cells', cells[0])
                         # 第一列是否含业绩
-                        if re.search(reg_yeji, str(cells[0])):
+                        # if re.search(reg_yeji, str(cells[0])):
+                        if re.search(reg_yeji, str(cells[0].get_text())):
                             # logging.info('is_yeji_table 1')
                             is_yeji = 1
                         # 前面几行是否有占多列的有业绩
@@ -1226,14 +1226,14 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                                     # print('len(rows[index2])', len(row2.find_all('td')))
                                     # if len(row2.find_all('td')) <= max_col_span / 2:
                                     #     print(re.search('业绩', str(row2)), str(row2))
-                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2)):
+                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2.get_text())):
                                         # logging.info('is_yeji_table 2')
                                         is_yeji = 1
 
                         break
 
                 # 前面都找不到,那么找表格上方的两行
-                div_list = [str(x) for x in _tables_and_divs[max(0, index3-2):index3]]
+                div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3-2):index3]]
                 if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
                     # logging.info('is_yeji_table 3')
                     is_yeji = 1
@@ -1261,8 +1261,6 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
 
 def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
-    from bs4 import BeautifulSoup
-
     def getDigitsDic(_unit):
         '''
         @summary:拿到中文对应的数字
@@ -1336,53 +1334,6 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
             return Decimal(0)
         return result
 
-    def is_yeji_table(_tables_and_divs, _entity_text):
-        if not _tables_and_divs:
-            return 0
-
-        is_yeji = 0
-        reg_yeji = '业绩|选取原因|奖项|获奖'
-        for index3, obj in enumerate(_tables_and_divs):
-            if '<tr' not in str(obj):
-                continue
-
-            _table = obj
-            _m = re.search(re.escape(_entity_text), str(_table))
-            if _m:
-                rows = _table.find_all('tr')
-
-                max_col_span = 0
-                for row in rows:
-                    if len(row.find_all('td')) > max_col_span:
-                        max_col_span = len(row.find_all('td'))
-                # print('max_col_span', max_col_span)
-
-                for index, row in enumerate(rows):
-                    if re.search(re.escape(_entity_text), str(row)):
-                        cells = row.find_all('td')
-                        # 第一列是否含业绩
-                        if re.search(re.escape(_entity_text), str(cells[0])):
-                            is_yeji = 1
-                        # 前面几行是否有占多列的有业绩
-                        else:
-                            if index > 0:
-                                for row2 in rows[:index][::-1]:
-                                    # print('len(rows[index2])', len(row2.find_all('td')))
-                                    # if len(row2.find_all('td')) <= max_col_span / 2:
-                                    #     print(re.search('业绩', str(row2)), str(row2))
-                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2)):
-                                        is_yeji = 1
-
-                        break
-
-                # 前面都找不到,那么找表格上方的两行
-                if not is_yeji and re.search(reg_yeji, ' '.join(_tables_and_divs[index3-2:index3])):
-                    is_yeji = 1
-                break
-        # print('is_yeji', is_yeji)
-        return is_yeji
-
-
     # 提取表格用于判断业绩
     if html:
         soup = BeautifulSoup(html, 'lxml')