소스 검색

优化拟在建字段提取慢问题

fangjiasheng 1 년 전
부모
커밋
b71a2dfa3b
2개의 변경된 파일, 111개의 추가 그리고 11개의 삭제
  1. 1 0
      BiddingKG/dl/interface/extract.py
  2. 110 11
      BiddingKG/dl/proposed_building/pb_extract.py

+ 1 - 0
BiddingKG/dl/interface/extract.py

@@ -363,6 +363,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # 提取拟在建所需字段
     start_time = time.time()
     pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
+    log("pb_extract done of doc_id%s"%(doc_id))
     cost_time["pb_extract"] = round(time.time() - start_time, 2)
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]

+ 110 - 11
BiddingKG/dl/proposed_building/pb_extract.py

@@ -2,6 +2,7 @@ import copy
 import os
 import re
 import sys
+import time
 import traceback
 from decimal import Decimal
 import pandas as pd
@@ -37,7 +38,8 @@ class PBPredictor:
             end_time = item.get('time_completion')
         return tenderee, agency, product, begin_time, end_time
 
-    def predict(self, prem, list_articles, list_sentences, list_entitys, doctitle, code_name_dict, dochtmlcon):
+    def predict(self, prem, list_articles, list_sentences, list_entitys, doctitle,
+                code_name_dict, dochtmlcon, show=0):
         try:
             for list_article, list_sentence, list_entity in zip(list_articles, list_sentences, list_entitys):
                 list_sentence.sort(key=lambda x: x.sentence_index)
@@ -60,24 +62,70 @@ class PBPredictor:
                 else:
                     project_code = None
 
+                start_time = time.time()
                 stage = extract_legal_stage(project_name+doctitle, self.stage_pattern, self.stage_priority_dict, product, tenderee=tenderee, agency=agency)
+                if show:
+                    print('extract_legal_stage time', time.time()-start_time)
+                    start_time = time.time()
                 industry1 = extract_industry(doctitle+content, self.industry_pattern)
+                if show:
+                    print('extract_industry time', time.time()-start_time)
+                    start_time = time.time()
                 industry = extract_industry(doctitle+content_no_att, self.industry_pattern)
+                if show:
+                    print('extract_industry time', time.time()-start_time)
+                    start_time = time.time()
                 # print('industry', industry, industry1)
                 if not industry and industry1:
                     industry = industry1
                 proportion1, proportion = extract_proportion(content)
+                if show:
+                    print('extract_proportion time', time.time()-start_time)
+                    start_time = time.time()
                 project_digest = extract_project_digest(content)
+                if show:
+                    print('extract_project_digest time', time.time()-start_time)
+                    start_time = time.time()
                 project_address = extract_project_address(list_sentence, list_entity)
+                if show:
+                    print('extract_project_address time', time.time()-start_time)
+                    start_time = time.time()
                 location = get_bid_location(doctitle+"\t"+project_name)
+                if show:
+                    print('get_bid_location time', time.time()-start_time)
+                    start_time = time.time()
                 project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee, agency)
+                if show:
+                    print('get_project_name_refind time', time.time()-start_time)
+                    start_time = time.time()
                 has_elevator = extract_has_elevator(content)
+                if show:
+                    print('extract_has_elevator time', time.time()-start_time)
+                    start_time = time.time()
                 project_property = extract_project_property(doctitle+"\t"+project_name, self.property_pattern, self.property_priority_dict)
+                if show:
+                    print('extract_project_property time', time.time()-start_time)
+                    start_time = time.time()
                 total_invest, construct_install_fee, engineer_cost = extract_several_money(list_sentence, dochtmlcon)
+                if show:
+                    print('extract_several_money time', time.time()-start_time)
+                    start_time = time.time()
                 max_floor = extract_max_floor(content, dochtmlcon)
+                if show:
+                    print('extract_max_floor time', time.time()-start_time)
+                    start_time = time.time()
                 structure = extract_structure(content, dochtmlcon, self.structure_keyword_list)
+                if show:
+                    print('extract_structure time', time.time()-start_time)
+                    start_time = time.time()
                 has_steel = extract_has_steel_structure(structure)
+                if show:
+                    print('extract_has_steel_structure time', time.time()-start_time)
+                    start_time = time.time()
                 wall_type, wall_type2 = extract_wall_type(doctitle+"\t"+project_name, content)
+                if show:
+                    print('extract_wall_type time', time.time()-start_time)
+                    start_time = time.time()
 
                 if stage is not None:
                     has_stage = 1
@@ -787,12 +835,19 @@ def extract_project_property(content, property_pattern, property_priority_dict):
     return _property
 
 
-def extract_several_money(list_sentence, html='', is_obj=True):
+def extract_several_money(list_sentence, html='', is_obj=True, show=0):
+    start_time = time.time()
+    start_time1 = time.time()
     money_type_list = ['总投资', '建安费', '工程造价']
 
     money_list = []
     all_before_sentence = ''
+    soup = BeautifulSoup(html, 'lxml')
+    tables_and_divs = soup.find_all(['table', 'div'])
     for i, sentence in enumerate(list_sentence):
+        if show and i % 100 == 0:
+            print('extract_several_money Loop', i, len(list_sentence), time.time()-start_time1)
+            start_time1 = time.time()
         last_text = ''
         next_text = ''
         if is_obj:
@@ -810,11 +865,18 @@ def extract_several_money(list_sentence, html='', is_obj=True):
             if i < len(list_sentence) - 1:
                 next_text = list_sentence[i+1][:30]
 
+        start_time2 = time.time()
         if judge_yeji(len(all_before_sentence), all_before_sentence, 300+len(text)):
             # print('sentence yeji before ' + text)
             continue
+        if show:
+            print('extract_several_money time0.1', time.time()-start_time2)
+            start_time2 = time.time()
         # if '项目概算总投资为' in text:
-        _list, _ = get_several_money(text, 0, False, html=html)
+        _list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs)
+        if show:
+            print('extract_several_money time0.2', time.time()-start_time2)
+            start_time2 = time.time()
         # logging.info('get_several_money _list ' + str(_list))
 
         temp_list = []
@@ -824,11 +886,18 @@ def extract_several_money(list_sentence, html='', is_obj=True):
                     continue
             temp_list.append(l)
         _list = temp_list
+        if show:
+            print('extract_several_money time0.3', time.time()-start_time2)
+            start_time2 = time.time()
 
         money_list += _list
         # if money_list:
         #     break
 
+    if show:
+        print('extract_several_money time1', time.time()-start_time)
+        start_time = time.time()
+
     money_type_dict = {}
     for money, _, _, _, money_type in money_list:
         for _type in money_type_list:
@@ -851,6 +920,9 @@ def extract_several_money(list_sentence, html='', is_obj=True):
                 money_type_dict[_type] = [_money]
 
     # logging.info('money_type_dict ' + str(money_type_dict))
+    if show:
+        print('extract_several_money time2', time.time()-start_time)
+        start_time = time.time()
 
     result_list = []
     for _type in money_type_list:
@@ -867,6 +939,10 @@ def extract_several_money(list_sentence, html='', is_obj=True):
         else:
             result_list.append(None)
 
+    if show:
+        print('extract_several_money time3', time.time()-start_time)
+        start_time = time.time()
+
     for i in range(len(result_list)):
         if result_list[i] is None:
             result_list[i] = 0
@@ -1176,7 +1252,8 @@ def cut_win_bid_part(_str):
 
 
 def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=None):
-    def is_yeji_table(_tables_and_divs, _entity_text):
+    def is_yeji_table(_tables_and_divs, _entity_text, show=0):
+        start_time = time.time()
         if not _tables_and_divs:
             return 0
 
@@ -1238,6 +1315,8 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
                     # logging.info('is_yeji_table 3')
                     is_yeji = 1
                 break
+        if show:
+            print('is_yeji_table time', time.time()-start_time)
         return is_yeji
 
     # 先判断表格业绩
@@ -1260,7 +1339,8 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
         return 0
 
 
-def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
+def get_several_money(sentence_text, found_yeji, in_attachment=False,
+                      tables_and_divs=[], show=0):
     def getDigitsDic(_unit):
         '''
         @summary:拿到中文对应的数字
@@ -1334,13 +1414,19 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
             return Decimal(0)
         return result
 
+    start_time = time.time()
+
     # 提取表格用于判断业绩
-    if html:
-        soup = BeautifulSoup(html, 'lxml')
-        tables = soup.find_all('table')
-        tables_and_divs = soup.find_all(['table', 'div'])
-    else:
-        tables_and_divs = []
+    # if tables_and_divs:
+    #     soup = BeautifulSoup(html, 'lxml')
+    #     # tables = soup.find_all('table')
+    #     tables_and_divs = soup.find_all(['table', 'div'])
+    # else:
+    #     tables_and_divs = []
+
+    # if show:
+    #     print('get_several_money time1', time.time()-start_time)
+    #     start_time = time.time()
 
     money_list = []
     # 使用正则识别金额
@@ -1365,6 +1451,10 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
     if match:
         sentence_text = re.sub(re.escape(match.group()), match.group()[0] + match.group()[2:], sentence_text)
 
+    if show:
+        print('get_several_money time2', time.time()-start_time)
+        start_time = time.time()
+
     if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
         found_yeji += 1
     if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
@@ -1375,6 +1465,11 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
             all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))
         else:
             all_match = re.finditer(pattern_money, sentence_text)
+
+    if show:
+        print('get_several_money time3', time.time()-start_time)
+        start_time = time.time()
+
     for _match in all_match:
         if len(_match.group()) > 0:
             notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
@@ -1531,6 +1626,10 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
                 continue
             money_list.append((entity_text, start_index, end_index, unit, notes))
 
+    if show:
+        print('get_several_money time4', time.time()-start_time)
+        start_time = time.time()
+
     # 排除过小的金额
     temp_list = []
     for money in money_list: