|
@@ -2,6 +2,7 @@ import copy
|
|
|
import os
|
|
|
import re
|
|
|
import sys
|
|
|
+import time
|
|
|
import traceback
|
|
|
from decimal import Decimal
|
|
|
import pandas as pd
|
|
@@ -37,7 +38,8 @@ class PBPredictor:
|
|
|
end_time = item.get('time_completion')
|
|
|
return tenderee, agency, product, begin_time, end_time
|
|
|
|
|
|
- def predict(self, prem, list_articles, list_sentences, list_entitys, doctitle, code_name_dict, dochtmlcon):
|
|
|
+ def predict(self, prem, list_articles, list_sentences, list_entitys, doctitle,
|
|
|
+ code_name_dict, dochtmlcon, show=0):
|
|
|
try:
|
|
|
for list_article, list_sentence, list_entity in zip(list_articles, list_sentences, list_entitys):
|
|
|
list_sentence.sort(key=lambda x: x.sentence_index)
|
|
@@ -60,24 +62,70 @@ class PBPredictor:
|
|
|
else:
|
|
|
project_code = None
|
|
|
|
|
|
+ start_time = time.time()
|
|
|
stage = extract_legal_stage(project_name+doctitle, self.stage_pattern, self.stage_priority_dict, product, tenderee=tenderee, agency=agency)
|
|
|
+ if show:
|
|
|
+ print('extract_legal_stage time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
industry1 = extract_industry(doctitle+content, self.industry_pattern)
|
|
|
+ if show:
|
|
|
+ print('extract_industry time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
industry = extract_industry(doctitle+content_no_att, self.industry_pattern)
|
|
|
+ if show:
|
|
|
+ print('extract_industry time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
# print('industry', industry, industry1)
|
|
|
if not industry and industry1:
|
|
|
industry = industry1
|
|
|
proportion1, proportion = extract_proportion(content)
|
|
|
+ if show:
|
|
|
+ print('extract_proportion time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
project_digest = extract_project_digest(content)
|
|
|
+ if show:
|
|
|
+ print('extract_project_digest time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
project_address = extract_project_address(list_sentence, list_entity)
|
|
|
+ if show:
|
|
|
+ print('extract_project_address time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
location = get_bid_location(doctitle+"\t"+project_name)
|
|
|
+ if show:
|
|
|
+ print('get_bid_location time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee, agency)
|
|
|
+ if show:
|
|
|
+ print('get_project_name_refind time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
has_elevator = extract_has_elevator(content)
|
|
|
+ if show:
|
|
|
+ print('extract_has_elevator time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
project_property = extract_project_property(doctitle+"\t"+project_name, self.property_pattern, self.property_priority_dict)
|
|
|
+ if show:
|
|
|
+ print('extract_project_property time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
total_invest, construct_install_fee, engineer_cost = extract_several_money(list_sentence, dochtmlcon)
|
|
|
+ if show:
|
|
|
+ print('extract_several_money time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
max_floor = extract_max_floor(content, dochtmlcon)
|
|
|
+ if show:
|
|
|
+ print('extract_max_floor time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
structure = extract_structure(content, dochtmlcon, self.structure_keyword_list)
|
|
|
+ if show:
|
|
|
+ print('extract_structure time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
has_steel = extract_has_steel_structure(structure)
|
|
|
+ if show:
|
|
|
+ print('extract_has_steel_structure time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
wall_type, wall_type2 = extract_wall_type(doctitle+"\t"+project_name, content)
|
|
|
+ if show:
|
|
|
+ print('extract_wall_type time', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
|
|
|
if stage is not None:
|
|
|
has_stage = 1
|
|
@@ -787,12 +835,19 @@ def extract_project_property(content, property_pattern, property_priority_dict):
|
|
|
return _property
|
|
|
|
|
|
|
|
|
-def extract_several_money(list_sentence, html='', is_obj=True):
|
|
|
+def extract_several_money(list_sentence, html='', is_obj=True, show=0):
|
|
|
+ start_time = time.time()
|
|
|
+ start_time1 = time.time()
|
|
|
money_type_list = ['总投资', '建安费', '工程造价']
|
|
|
|
|
|
money_list = []
|
|
|
all_before_sentence = ''
|
|
|
+ soup = BeautifulSoup(html, 'lxml')
|
|
|
+ tables_and_divs = soup.find_all(['table', 'div'])
|
|
|
for i, sentence in enumerate(list_sentence):
|
|
|
+ if show and i % 100 == 0:
|
|
|
+ print('extract_several_money Loop', i, len(list_sentence), time.time()-start_time1)
|
|
|
+ start_time1 = time.time()
|
|
|
last_text = ''
|
|
|
next_text = ''
|
|
|
if is_obj:
|
|
@@ -810,11 +865,18 @@ def extract_several_money(list_sentence, html='', is_obj=True):
|
|
|
if i < len(list_sentence) - 1:
|
|
|
next_text = list_sentence[i+1][:30]
|
|
|
|
|
|
+ start_time2 = time.time()
|
|
|
if judge_yeji(len(all_before_sentence), all_before_sentence, 300+len(text)):
|
|
|
# print('sentence yeji before ' + text)
|
|
|
continue
|
|
|
+ if show:
|
|
|
+ print('extract_several_money time0.1', time.time()-start_time2)
|
|
|
+ start_time2 = time.time()
|
|
|
# if '项目概算总投资为' in text:
|
|
|
- _list, _ = get_several_money(text, 0, False, html=html)
|
|
|
+ _list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs)
|
|
|
+ if show:
|
|
|
+ print('extract_several_money time0.2', time.time()-start_time2)
|
|
|
+ start_time2 = time.time()
|
|
|
# logging.info('get_several_money _list ' + str(_list))
|
|
|
|
|
|
temp_list = []
|
|
@@ -824,11 +886,18 @@ def extract_several_money(list_sentence, html='', is_obj=True):
|
|
|
continue
|
|
|
temp_list.append(l)
|
|
|
_list = temp_list
|
|
|
+ if show:
|
|
|
+ print('extract_several_money time0.3', time.time()-start_time2)
|
|
|
+ start_time2 = time.time()
|
|
|
|
|
|
money_list += _list
|
|
|
# if money_list:
|
|
|
# break
|
|
|
|
|
|
+ if show:
|
|
|
+ print('extract_several_money time1', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
money_type_dict = {}
|
|
|
for money, _, _, _, money_type in money_list:
|
|
|
for _type in money_type_list:
|
|
@@ -851,6 +920,9 @@ def extract_several_money(list_sentence, html='', is_obj=True):
|
|
|
money_type_dict[_type] = [_money]
|
|
|
|
|
|
# logging.info('money_type_dict ' + str(money_type_dict))
|
|
|
+ if show:
|
|
|
+ print('extract_several_money time2', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
|
|
|
result_list = []
|
|
|
for _type in money_type_list:
|
|
@@ -867,6 +939,10 @@ def extract_several_money(list_sentence, html='', is_obj=True):
|
|
|
else:
|
|
|
result_list.append(None)
|
|
|
|
|
|
+ if show:
|
|
|
+ print('extract_several_money time3', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
for i in range(len(result_list)):
|
|
|
if result_list[i] is None:
|
|
|
result_list[i] = 0
|
|
@@ -1176,7 +1252,8 @@ def cut_win_bid_part(_str):
|
|
|
|
|
|
|
|
|
def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=None):
|
|
|
- def is_yeji_table(_tables_and_divs, _entity_text):
|
|
|
+ def is_yeji_table(_tables_and_divs, _entity_text, show=0):
|
|
|
+ start_time = time.time()
|
|
|
if not _tables_and_divs:
|
|
|
return 0
|
|
|
|
|
@@ -1238,6 +1315,8 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
|
|
|
# logging.info('is_yeji_table 3')
|
|
|
is_yeji = 1
|
|
|
break
|
|
|
+ if show:
|
|
|
+ print('is_yeji_table time', time.time()-start_time)
|
|
|
return is_yeji
|
|
|
|
|
|
# 先判断表格业绩
|
|
@@ -1260,7 +1339,8 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
|
|
|
return 0
|
|
|
|
|
|
|
|
|
-def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
|
|
|
+def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
+ tables_and_divs=[], show=0):
|
|
|
def getDigitsDic(_unit):
|
|
|
'''
|
|
|
@summary:拿到中文对应的数字
|
|
@@ -1334,13 +1414,19 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
|
|
|
return Decimal(0)
|
|
|
return result
|
|
|
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
# 提取表格用于判断业绩
|
|
|
- if html:
|
|
|
- soup = BeautifulSoup(html, 'lxml')
|
|
|
- tables = soup.find_all('table')
|
|
|
- tables_and_divs = soup.find_all(['table', 'div'])
|
|
|
- else:
|
|
|
- tables_and_divs = []
|
|
|
+ # if tables_and_divs:
|
|
|
+ # soup = BeautifulSoup(html, 'lxml')
|
|
|
+ # # tables = soup.find_all('table')
|
|
|
+ # tables_and_divs = soup.find_all(['table', 'div'])
|
|
|
+ # else:
|
|
|
+ # tables_and_divs = []
|
|
|
+
|
|
|
+ # if show:
|
|
|
+ # print('get_several_money time1', time.time()-start_time)
|
|
|
+ # start_time = time.time()
|
|
|
|
|
|
money_list = []
|
|
|
# 使用正则识别金额
|
|
@@ -1365,6 +1451,10 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
|
|
|
if match:
|
|
|
sentence_text = re.sub(re.escape(match.group()), match.group()[0] + match.group()[2:], sentence_text)
|
|
|
|
|
|
+ if show:
|
|
|
+ print('get_several_money time2', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
|
|
|
found_yeji += 1
|
|
|
if found_yeji >= 2: # 过滤掉业绩后面的所有金额
|
|
@@ -1375,6 +1465,11 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
|
|
|
all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))
|
|
|
else:
|
|
|
all_match = re.finditer(pattern_money, sentence_text)
|
|
|
+
|
|
|
+ if show:
|
|
|
+ print('get_several_money time3', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
for _match in all_match:
|
|
|
if len(_match.group()) > 0:
|
|
|
notes = '' # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
|
|
@@ -1531,6 +1626,10 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
|
|
|
continue
|
|
|
money_list.append((entity_text, start_index, end_index, unit, notes))
|
|
|
|
|
|
+ if show:
|
|
|
+ print('get_several_money time4', time.time()-start_time)
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
# 排除过小的金额
|
|
|
temp_list = []
|
|
|
for money in money_list:
|