před 1 rokem · b71a2dfa3b
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -363,6 +363,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     # 提取拟在建所需字段
			
 
				     start_time = time.time()
			
 
				     pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
			
 
				+    log("pb_extract done of doc_id%s"%(doc_id))
			
 
				     cost_time["pb_extract"] = round(time.time() - start_time, 2)
			
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
--- a/BiddingKG/dl/proposed_building/pb_extract.py
+++ b/BiddingKG/dl/proposed_building/pb_extract.py
@@ -2,6 +2,7 @@ import copy
 
				 import os
			
 
				 import re
			
 
				 import sys
			
 
				+import time
			
 
				 import traceback
			
 
				 from decimal import Decimal
			
 
				 import pandas as pd
			
@@ -37,7 +38,8 @@ class PBPredictor:
 
				             end_time = item.get('time_completion')
			
 
				         return tenderee, agency, product, begin_time, end_time
			
 
				 
			
 
				-    def predict(self, prem, list_articles, list_sentences, list_entitys, doctitle, code_name_dict, dochtmlcon):
			
 
				+    def predict(self, prem, list_articles, list_sentences, list_entitys, doctitle,
			
 
				+                code_name_dict, dochtmlcon, show=0):
			
 
				         try:
			
 
				             for list_article, list_sentence, list_entity in zip(list_articles, list_sentences, list_entitys):
			
 
				                 list_sentence.sort(key=lambda x: x.sentence_index)
			
@@ -60,24 +62,70 @@ class PBPredictor:
 
				                 else:
			
 
				                     project_code = None
			
 
				 
			
 
				+                start_time = time.time()
			
 
				                 stage = extract_legal_stage(project_name+doctitle, self.stage_pattern, self.stage_priority_dict, product, tenderee=tenderee, agency=agency)
			
 
				+                if show:
			
 
				+                    print('extract_legal_stage time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 industry1 = extract_industry(doctitle+content, self.industry_pattern)
			
 
				+                if show:
			
 
				+                    print('extract_industry time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 industry = extract_industry(doctitle+content_no_att, self.industry_pattern)
			
 
				+                if show:
			
 
				+                    print('extract_industry time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 # print('industry', industry, industry1)
			
 
				                 if not industry and industry1:
			
 
				                     industry = industry1
			
 
				                 proportion1, proportion = extract_proportion(content)
			
 
				+                if show:
			
 
				+                    print('extract_proportion time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 project_digest = extract_project_digest(content)
			
 
				+                if show:
			
 
				+                    print('extract_project_digest time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 project_address = extract_project_address(list_sentence, list_entity)
			
 
				+                if show:
			
 
				+                    print('extract_project_address time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 location = get_bid_location(doctitle+"\t"+project_name)
			
 
				+                if show:
			
 
				+                    print('get_bid_location time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee, agency)
			
 
				+                if show:
			
 
				+                    print('get_project_name_refind time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 has_elevator = extract_has_elevator(content)
			
 
				+                if show:
			
 
				+                    print('extract_has_elevator time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 project_property = extract_project_property(doctitle+"\t"+project_name, self.property_pattern, self.property_priority_dict)
			
 
				+                if show:
			
 
				+                    print('extract_project_property time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 total_invest, construct_install_fee, engineer_cost = extract_several_money(list_sentence, dochtmlcon)
			
 
				+                if show:
			
 
				+                    print('extract_several_money time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 max_floor = extract_max_floor(content, dochtmlcon)
			
 
				+                if show:
			
 
				+                    print('extract_max_floor time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 structure = extract_structure(content, dochtmlcon, self.structure_keyword_list)
			
 
				+                if show:
			
 
				+                    print('extract_structure time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 has_steel = extract_has_steel_structure(structure)
			
 
				+                if show:
			
 
				+                    print('extract_has_steel_structure time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				                 wall_type, wall_type2 = extract_wall_type(doctitle+"\t"+project_name, content)
			
 
				+                if show:
			
 
				+                    print('extract_wall_type time', time.time()-start_time)
			
 
				+                    start_time = time.time()
			
 
				 
			
 
				                 if stage is not None:
			
 
				                     has_stage = 1
			
@@ -787,12 +835,19 @@ def extract_project_property(content, property_pattern, property_priority_dict):
 
				     return _property
			
 
				 
			
 
				 
			
 
				-def extract_several_money(list_sentence, html='', is_obj=True):
			
 
				+def extract_several_money(list_sentence, html='', is_obj=True, show=0):
			
 
				+    start_time = time.time()
			
 
				+    start_time1 = time.time()
			
 
				     money_type_list = ['总投资', '建安费', '工程造价']
			
 
				 
			
 
				     money_list = []
			
 
				     all_before_sentence = ''
			
 
				+    soup = BeautifulSoup(html, 'lxml')
			
 
				+    tables_and_divs = soup.find_all(['table', 'div'])
			
 
				     for i, sentence in enumerate(list_sentence):
			
 
				+        if show and i % 100 == 0:
			
 
				+            print('extract_several_money Loop', i, len(list_sentence), time.time()-start_time1)
			
 
				+            start_time1 = time.time()
			
 
				         last_text = ''
			
 
				         next_text = ''
			
 
				         if is_obj:
			
@@ -810,11 +865,18 @@ def extract_several_money(list_sentence, html='', is_obj=True):
 
				             if i < len(list_sentence) - 1:
			
 
				                 next_text = list_sentence[i+1][:30]
			
 
				 
			
 
				+        start_time2 = time.time()
			
 
				         if judge_yeji(len(all_before_sentence), all_before_sentence, 300+len(text)):
			
 
				             # print('sentence yeji before ' + text)
			
 
				             continue
			
 
				+        if show:
			
 
				+            print('extract_several_money time0.1', time.time()-start_time2)
			
 
				+            start_time2 = time.time()
			
 
				         # if '项目概算总投资为' in text:
			
 
				-        _list, _ = get_several_money(text, 0, False, html=html)
			
 
				+        _list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs)
			
 
				+        if show:
			
 
				+            print('extract_several_money time0.2', time.time()-start_time2)
			
 
				+            start_time2 = time.time()
			
 
				         # logging.info('get_several_money _list ' + str(_list))
			
 
				 
			
 
				         temp_list = []
			
@@ -824,11 +886,18 @@ def extract_several_money(list_sentence, html='', is_obj=True):
 
				                     continue
			
 
				             temp_list.append(l)
			
 
				         _list = temp_list
			
 
				+        if show:
			
 
				+            print('extract_several_money time0.3', time.time()-start_time2)
			
 
				+            start_time2 = time.time()
			
 
				 
			
 
				         money_list += _list
			
 
				         # if money_list:
			
 
				         #     break
			
 
				 
			
 
				+    if show:
			
 
				+        print('extract_several_money time1', time.time()-start_time)
			
 
				+        start_time = time.time()
			
 
				+
			
 
				     money_type_dict = {}
			
 
				     for money, _, _, _, money_type in money_list:
			
 
				         for _type in money_type_list:
			
@@ -851,6 +920,9 @@ def extract_several_money(list_sentence, html='', is_obj=True):
 
				                 money_type_dict[_type] = [_money]
			
 
				 
			
 
				     # logging.info('money_type_dict ' + str(money_type_dict))
			
 
				+    if show:
			
 
				+        print('extract_several_money time2', time.time()-start_time)
			
 
				+        start_time = time.time()
			
 
				 
			
 
				     result_list = []
			
 
				     for _type in money_type_list:
			
@@ -867,6 +939,10 @@ def extract_several_money(list_sentence, html='', is_obj=True):
 
				         else:
			
 
				             result_list.append(None)
			
 
				 
			
 
				+    if show:
			
 
				+        print('extract_several_money time3', time.time()-start_time)
			
 
				+        start_time = time.time()
			
 
				+
			
 
				     for i in range(len(result_list)):
			
 
				         if result_list[i] is None:
			
 
				             result_list[i] = 0
			
@@ -1176,7 +1252,8 @@ def cut_win_bid_part(_str):
 
				 
			
 
				 
			
 
				 def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=None):
			
 
				-    def is_yeji_table(_tables_and_divs, _entity_text):
			
 
				+    def is_yeji_table(_tables_and_divs, _entity_text, show=0):
			
 
				+        start_time = time.time()
			
 
				         if not _tables_and_divs:
			
 
				             return 0
			
 
				 
			
@@ -1238,6 +1315,8 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
				                     # logging.info('is_yeji_table 3')
			
 
				                     is_yeji = 1
			
 
				                 break
			
 
				+        if show:
			
 
				+            print('is_yeji_table time', time.time()-start_time)
			
 
				         return is_yeji
			
 
				 
			
 
				     # 先判断表格业绩
			
@@ -1260,7 +1339,8 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
				         return 0
			
 
				 
			
 
				 
			
 
				-def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
			
 
				+def get_several_money(sentence_text, found_yeji, in_attachment=False,
			
 
				+                      tables_and_divs=[], show=0):
			
 
				     def getDigitsDic(_unit):
			
 
				         '''
			
 
				         @summary:拿到中文对应的数字
			
@@ -1334,13 +1414,19 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
 
				             return Decimal(0)
			
 
				         return result
			
 
				 
			
 
				+    start_time = time.time()
			
 
				+
			
 
				     # 提取表格用于判断业绩
			
 
				-    if html:
			
 
				-        soup = BeautifulSoup(html, 'lxml')
			
 
				-        tables = soup.find_all('table')
			
 
				-        tables_and_divs = soup.find_all(['table', 'div'])
			
 
				-    else:
			
 
				-        tables_and_divs = []
			
 
				+    # if tables_and_divs:
			
 
				+    #     soup = BeautifulSoup(html, 'lxml')
			
 
				+    #     # tables = soup.find_all('table')
			
 
				+    #     tables_and_divs = soup.find_all(['table', 'div'])
			
 
				+    # else:
			
 
				+    #     tables_and_divs = []
			
 
				+
			
 
				+    # if show:
			
 
				+    #     print('get_several_money time1', time.time()-start_time)
			
 
				+    #     start_time = time.time()
			
 
				 
			
 
				     money_list = []
			
 
				     # 使用正则识别金额
			
@@ -1365,6 +1451,10 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
 
				     if match:
			
 
				         sentence_text = re.sub(re.escape(match.group()), match.group()[0] + match.group()[2:], sentence_text)
			
 
				 
			
 
				+    if show:
			
 
				+        print('get_several_money time2', time.time()-start_time)
			
 
				+        start_time = time.time()
			
 
				+
			
 
				     if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
			
 
				         found_yeji += 1
			
 
				     if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
			
@@ -1375,6 +1465,11 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
 
				             all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))
			
 
				         else:
			
 
				             all_match = re.finditer(pattern_money, sentence_text)
			
 
				+
			
 
				+    if show:
			
 
				+        print('get_several_money time3', time.time()-start_time)
			
 
				+        start_time = time.time()
			
 
				+
			
 
				     for _match in all_match:
			
 
				         if len(_match.group()) > 0:
			
 
				             notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
			
@@ -1531,6 +1626,10 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
 
				                 continue
			
 
				             money_list.append((entity_text, start_index, end_index, unit, notes))
			
 
				 
			
 
				+    if show:
			
 
				+        print('get_several_money time4', time.time()-start_time)
			
 
				+        start_time = time.time()
			
 
				+
			
 
				     # 排除过小的金额
			
 
				     temp_list = []
			
 
				     for money in money_list: