|
@@ -23,6 +23,7 @@ class PBPredictor:
|
|
|
|
|
|
def get_col_from_prem(self, prem):
|
|
|
tenderee, agency, product = None, None, None
|
|
|
+ begin_time, end_time = None, None
|
|
|
for item in prem:
|
|
|
prem = item.get('prem')
|
|
|
for key in prem.keys():
|
|
@@ -63,68 +64,71 @@ class PBPredictor:
|
|
|
project_code = None
|
|
|
|
|
|
start_time = time.time()
|
|
|
- stage = extract_legal_stage(project_name+doctitle, self.stage_pattern, self.stage_priority_dict, product, tenderee=tenderee, agency=agency)
|
|
|
+ stage = extract_legal_stage(project_name + doctitle, self.stage_pattern, self.stage_priority_dict,
|
|
|
+ product, tenderee=tenderee, agency=agency)
|
|
|
if show:
|
|
|
- print('extract_legal_stage time', time.time()-start_time)
|
|
|
+ print('extract_legal_stage time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
- industry1 = extract_industry(doctitle+content, self.industry_pattern)
|
|
|
+ industry1 = extract_industry(doctitle + content, self.industry_pattern)
|
|
|
if show:
|
|
|
- print('extract_industry time', time.time()-start_time)
|
|
|
+ print('extract_industry time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
- industry = extract_industry(doctitle+content_no_att, self.industry_pattern)
|
|
|
+ industry = extract_industry(doctitle + content_no_att, self.industry_pattern)
|
|
|
if show:
|
|
|
- print('extract_industry time', time.time()-start_time)
|
|
|
+ print('extract_industry time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
# print('industry', industry, industry1)
|
|
|
if not industry and industry1:
|
|
|
industry = industry1
|
|
|
proportion1, proportion = extract_proportion(content)
|
|
|
if show:
|
|
|
- print('extract_proportion time', time.time()-start_time)
|
|
|
+ print('extract_proportion time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
project_digest = extract_project_digest(content)
|
|
|
if show:
|
|
|
- print('extract_project_digest time', time.time()-start_time)
|
|
|
+ print('extract_project_digest time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
project_address = extract_project_address(list_sentence, list_entity)
|
|
|
if show:
|
|
|
- print('extract_project_address time', time.time()-start_time)
|
|
|
+ print('extract_project_address time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
- location = get_bid_location(doctitle+"\t"+project_name)
|
|
|
+ location = get_bid_location(doctitle + "\t" + project_name)
|
|
|
if show:
|
|
|
- print('get_bid_location time', time.time()-start_time)
|
|
|
+ print('get_bid_location time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
- project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee, agency)
|
|
|
+ project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee,
|
|
|
+ agency)
|
|
|
if show:
|
|
|
- print('get_project_name_refind time', time.time()-start_time)
|
|
|
+ print('get_project_name_refind time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
has_elevator = extract_has_elevator(content)
|
|
|
if show:
|
|
|
- print('extract_has_elevator time', time.time()-start_time)
|
|
|
+ print('extract_has_elevator time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
- project_property = extract_project_property(doctitle+"\t"+project_name, self.property_pattern, self.property_priority_dict)
|
|
|
+ project_property = extract_project_property(doctitle + "\t" + project_name, self.property_pattern,
|
|
|
+ self.property_priority_dict)
|
|
|
if show:
|
|
|
- print('extract_project_property time', time.time()-start_time)
|
|
|
+ print('extract_project_property time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
total_invest, construct_install_fee, engineer_cost = extract_several_money(list_sentence, dochtmlcon)
|
|
|
if show:
|
|
|
- print('extract_several_money time', time.time()-start_time)
|
|
|
+ print('extract_several_money time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
max_floor = extract_max_floor(content, dochtmlcon)
|
|
|
if show:
|
|
|
- print('extract_max_floor time', time.time()-start_time)
|
|
|
+ print('extract_max_floor time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
structure = extract_structure(content, dochtmlcon, self.structure_keyword_list)
|
|
|
if show:
|
|
|
- print('extract_structure time', time.time()-start_time)
|
|
|
+ print('extract_structure time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
has_steel = extract_has_steel_structure(structure)
|
|
|
if show:
|
|
|
- print('extract_has_steel_structure time', time.time()-start_time)
|
|
|
+ print('extract_has_steel_structure time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
- wall_type, wall_type2 = extract_wall_type(doctitle+"\t"+project_name, content)
|
|
|
+ wall_type, wall_type2 = extract_wall_type(doctitle + "\t" + project_name, content)
|
|
|
if show:
|
|
|
- print('extract_wall_type time', time.time()-start_time)
|
|
|
+ print('extract_wall_type time', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
|
|
|
if stage is not None:
|
|
@@ -191,7 +195,7 @@ def extract_legal_stage(content, _pattern, priority_dict, product='', tenderee='
|
|
|
|
|
|
list_stage = []
|
|
|
for stage_search in re.finditer(_pattern, _content):
|
|
|
- for k,v in stage_search.groupdict().items():
|
|
|
+ for k, v in stage_search.groupdict().items():
|
|
|
if v is not None:
|
|
|
list_stage.append([k, priority_dict.get(k)])
|
|
|
if len(list_stage) > 0:
|
|
@@ -211,10 +215,10 @@ def extract_legal_stage(content, _pattern, priority_dict, product='', tenderee='
|
|
|
if stage == '立项阶段':
|
|
|
sub_content = re.sub('立项目', '', _content)
|
|
|
for stage_search in re.finditer(_pattern, sub_content):
|
|
|
- for k,v in stage_search.groupdict().items():
|
|
|
+ for k, v in stage_search.groupdict().items():
|
|
|
if v is not None:
|
|
|
list_stage.append([k, priority_dict.get(k)])
|
|
|
- if len(list_stage)>0:
|
|
|
+ if len(list_stage) > 0:
|
|
|
list_stage.sort(key=lambda x: x[1])
|
|
|
stage = list_stage[0][0]
|
|
|
|
|
@@ -222,7 +226,7 @@ def extract_legal_stage(content, _pattern, priority_dict, product='', tenderee='
|
|
|
return None
|
|
|
|
|
|
|
|
|
-def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min_len=3):
|
|
|
+def get_project_name_refind(project_name, doctitle, tenderee='', agency='', min_len=3):
|
|
|
# 跳过部分
|
|
|
re_str11 = '网上超市|服务市场采购|印刷服务|复印纸|车辆维修和保养|商品房预售|办公家具定点|直接订购|定点议价' \
|
|
|
'|政府采购意向|信息技术服务定点议价|信息技术服务定点采购|法人章刻制中介机构|专用设备|办公设备采购' \
|
|
@@ -352,18 +356,21 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
|
|
|
project_word_in_org = []
|
|
|
for m in match:
|
|
|
# 混淆词,设施工程中的施工
|
|
|
- if m.span()[0] > 0 and name_refind[m.span()[0]-1] in ['设']:
|
|
|
+ if m.span()[0] > 0 and name_refind[m.span()[0] - 1] in ['设']:
|
|
|
continue
|
|
|
|
|
|
# 判断是不是公司名里的工程
|
|
|
if re.search(re_str26, name_refind[m.span()[1]:]):
|
|
|
- project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
|
|
|
+ project_word_in_org.append(
|
|
|
+ name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
|
|
|
continue
|
|
|
- if re.search(re_str17, name_refind[m.span()[1]:m.span()[1]+3]):
|
|
|
- project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
|
|
|
+ if re.search(re_str17, name_refind[m.span()[1]:m.span()[1] + 3]):
|
|
|
+ project_word_in_org.append(
|
|
|
+ name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
|
|
|
continue
|
|
|
if re.search(re_str18, name_refind[m.span()[1]:]):
|
|
|
- project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
|
|
|
+ project_word_in_org.append(
|
|
|
+ name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
|
|
|
continue
|
|
|
|
|
|
match_flag = True
|
|
@@ -377,18 +384,21 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
|
|
|
last_index = 0
|
|
|
for m in match:
|
|
|
# 混淆词,设施工程中的施工
|
|
|
- if m.span()[0] > 0 and name_refind[m.span()[0]-1] in ['设']:
|
|
|
+ if m.span()[0] > 0 and name_refind[m.span()[0] - 1] in ['设']:
|
|
|
continue
|
|
|
|
|
|
# 判断是不是公司名里的工程
|
|
|
if re.search(re_str26, name_refind[m.span()[1]:]):
|
|
|
- project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
|
|
|
+ project_word_in_org.append(
|
|
|
+ name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
|
|
|
continue
|
|
|
- if re.search(re_str17, name_refind[m.span()[1]:m.span()[1]+3]):
|
|
|
- project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
|
|
|
+ if re.search(re_str17, name_refind[m.span()[1]:m.span()[1] + 3]):
|
|
|
+ project_word_in_org.append(
|
|
|
+ name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
|
|
|
continue
|
|
|
if re.search(re_str18, name_refind[m.span()[1]:]):
|
|
|
- project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
|
|
|
+ project_word_in_org.append(
|
|
|
+ name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
|
|
|
continue
|
|
|
match_flag = True
|
|
|
prob_name_list.append(name_refind[last_index:m.span()[1]])
|
|
@@ -429,7 +439,7 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
|
|
|
match1 = re.finditer(re_str6, name)
|
|
|
for m1 in match1:
|
|
|
# 混淆词,设施工程中的施工
|
|
|
- if m1.span()[0] > 0 and name[m1.span()[0]-1] in ['设']:
|
|
|
+ if m1.span()[0] > 0 and name[m1.span()[0] - 1] in ['设']:
|
|
|
continue
|
|
|
s_index, e_index = m1.span()
|
|
|
word = name[s_index:e_index]
|
|
@@ -473,12 +483,12 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
|
|
|
for name_refind in name_refind_candidate_list:
|
|
|
# 直接判断删除数字
|
|
|
match = re.match(re_str16, name_refind)
|
|
|
- if match and not re.match('[0-9]', name_refind[match.span()[1]:match.span()[1]+1]):
|
|
|
+ if match and not re.match('[0-9]', name_refind[match.span()[1]:match.span()[1] + 1]):
|
|
|
name_refind = name_refind[match.span()[1]:]
|
|
|
|
|
|
# 删除开头奇怪数字
|
|
|
match = re.match(re_str15, name_refind)
|
|
|
- if match and not re.match('[a-zA-Z地块号]', name_refind[match.span()[1]:match.span()[1]+1]):
|
|
|
+ if match and not re.match('[a-zA-Z地块号]', name_refind[match.span()[1]:match.span()[1] + 1]):
|
|
|
name_refind = name_refind[match.span()[1]:]
|
|
|
|
|
|
# 删除期数
|
|
@@ -525,7 +535,7 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
|
|
|
# 删除区
|
|
|
match2 = re.match(re_str22, name_refind)
|
|
|
if match2:
|
|
|
- name_refind = name_refind[match2.span()[1]-1:]
|
|
|
+ name_refind = name_refind[match2.span()[1] - 1:]
|
|
|
|
|
|
# 删除'小区表达'
|
|
|
if len(name_refind) >= min_len + 2:
|
|
@@ -537,7 +547,8 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
|
|
|
if agency in [None, 'None', '-', '']:
|
|
|
agency = ''
|
|
|
try:
|
|
|
- if len(name_refind) >= 4 and (re.search(re.escape(name_refind[-4:]), tenderee) or re.search(re.escape(name_refind[-4:]), agency)):
|
|
|
+ if len(name_refind) >= 4 and (
|
|
|
+ re.search(re.escape(name_refind[-4:]), tenderee) or re.search(re.escape(name_refind[-4:]), agency)):
|
|
|
name_refind = ''
|
|
|
show_name_refind = ''
|
|
|
except:
|
|
@@ -558,14 +569,14 @@ def extract_industry(content, _pattern):
|
|
|
list_stage = []
|
|
|
stage_dict = {}
|
|
|
for stage_search in re.finditer(_pattern, content):
|
|
|
- for k,v in stage_search.groupdict().items():
|
|
|
+ for k, v in stage_search.groupdict().items():
|
|
|
if v is not None:
|
|
|
list_stage.append(k)
|
|
|
if k in stage_dict.keys():
|
|
|
stage_dict[k] += 1
|
|
|
else:
|
|
|
stage_dict[k] = 1
|
|
|
- if len(list_stage)>0:
|
|
|
+ if len(list_stage) > 0:
|
|
|
stage_cnt_list = [[x, stage_dict.get(x)] for x in stage_dict.keys()]
|
|
|
stage_cnt_list.sort(key=lambda x: x[1], reverse=True)
|
|
|
# print('extract_industry ' + str(stage_cnt_list))
|
|
@@ -598,12 +609,12 @@ def extract_tenderee(list_entity):
|
|
|
|
|
|
def extract_project_digest(content):
|
|
|
_pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"
|
|
|
- _pattern_search = re.search(_pattern,content)
|
|
|
+ _pattern_search = re.search(_pattern, content)
|
|
|
_projectDigest = ""
|
|
|
_find = ""
|
|
|
if _pattern_search is not None:
|
|
|
- _find = _pattern_search.groupdict().get("projectDigest","")
|
|
|
- if len(_find)>0:
|
|
|
+ _find = _pattern_search.groupdict().get("projectDigest", "")
|
|
|
+ if len(_find) > 0:
|
|
|
_projectDigest = "。".join(_find.split("。")[0:3])
|
|
|
|
|
|
# 截掉中标信息
|
|
@@ -620,7 +631,6 @@ def extract_project_address(list_sentence, list_entity):
|
|
|
reg3 = "(项目|建设|工程)(地址|地点)[::]?(位于|起于)"
|
|
|
reg4 = "(项目|建设|工程)(地址|地点)[为::]+"
|
|
|
|
|
|
-
|
|
|
address_list = []
|
|
|
candidate_list = []
|
|
|
for sentence in list_sentence:
|
|
@@ -637,10 +647,12 @@ def extract_project_address(list_sentence, list_entity):
|
|
|
continue
|
|
|
|
|
|
text = p_entity.entity_text
|
|
|
- if text == content[end_index:end_index+len(text)] or text in content[end_index:end_index+len(text)+10]:
|
|
|
+ if text == content[end_index:end_index + len(text)] or text in content[end_index:end_index + len(
|
|
|
+ text) + 10]:
|
|
|
address_list.append(text)
|
|
|
else:
|
|
|
- candidate_list.append(content[max(0, end_index-10):end_index] + '@@@' + content[end_index:end_index+20] + '@@@' + text)
|
|
|
+ candidate_list.append(content[max(0, end_index - 10):end_index] + '@@@' + content[
|
|
|
+ end_index:end_index + 20] + '@@@' + text)
|
|
|
|
|
|
if address_list:
|
|
|
break
|
|
@@ -665,17 +677,19 @@ def extract_begin_end_time(list_sentence, list_entity):
|
|
|
if p_entity.entity_type == "time":
|
|
|
for _sentence in list_sentence:
|
|
|
if _sentence.sentence_index == p_entity.sentence_index:
|
|
|
- _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
|
|
|
- if re.search("开工(时间|日期)",_span[0]) is not None:
|
|
|
+ _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
|
+ end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
|
|
|
+ text=p_entity.entity_text)
|
|
|
+ if re.search("开工(时间|日期)", _span[0]) is not None:
|
|
|
_time_temp = timeFormat(p_entity.entity_text)
|
|
|
- if len(_time_temp)>0:
|
|
|
+ if len(_time_temp) > 0:
|
|
|
_begin_time = _time_temp
|
|
|
- if re.search("(竣工|完工)(时间|日期)",_span[0]) is not None:
|
|
|
+ if re.search("(竣工|完工)(时间|日期)", _span[0]) is not None:
|
|
|
_time_temp = timeFormat(p_entity.entity_text)
|
|
|
- if len(_time_temp)>0:
|
|
|
+ if len(_time_temp) > 0:
|
|
|
_end_time = _time_temp
|
|
|
|
|
|
- return _begin_time,_end_time
|
|
|
+ return _begin_time, _end_time
|
|
|
|
|
|
|
|
|
def get_bid_location(content):
|
|
@@ -709,7 +723,8 @@ def get_bid_location(content):
|
|
|
find_flag = False
|
|
|
for entitys in list_entitys:
|
|
|
for entity in entitys:
|
|
|
- if entity.entity_type in ["tenderee", 'agency', 'win_tenderer', 'second_tenderer', 'third_tenderer', 'company', 'org']:
|
|
|
+ if entity.entity_type in ["tenderee", 'agency', 'win_tenderer', 'second_tenderer', 'third_tenderer',
|
|
|
+ 'company', 'org']:
|
|
|
if location in entity.entity_text:
|
|
|
find_flag = True
|
|
|
break
|
|
@@ -727,8 +742,8 @@ def extract_proportion(content, has_preffix=True):
|
|
|
# log(content)
|
|
|
suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
|
|
|
reg_dict = {
|
|
|
- 0: "(?P<proportion>(总((建筑|建设)面积|长|长度))" + suffix,
|
|
|
- 1: "(?P<proportion>((建筑|建设)面积|全长)" + suffix,
|
|
|
+ 0: "(?P<proportion>(总((建筑|建设)(面积|规模)|长|长度))" + suffix,
|
|
|
+ 1: "(?P<proportion>((建筑|建设)(面积|规模)|全长)" + suffix,
|
|
|
2: "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)" + suffix
|
|
|
}
|
|
|
|
|
@@ -743,7 +758,7 @@ def extract_proportion(content, has_preffix=True):
|
|
|
# logging.info('content ' + str(content))
|
|
|
match = re.search(_pattern, str(content))
|
|
|
if match:
|
|
|
- _proportion = match.groupdict().get("proportion","")
|
|
|
+ _proportion = match.groupdict().get("proportion", "")
|
|
|
|
|
|
if not _proportion:
|
|
|
return "", ""
|
|
@@ -820,7 +835,7 @@ def extract_has_elevator(content):
|
|
|
has_flag = 1
|
|
|
if judge_yeji(match.span()[0], content):
|
|
|
has_flag = 0
|
|
|
- elif re.search('公司', content[end_index:end_index+8]):
|
|
|
+ elif re.search('公司', content[end_index:end_index + 8]):
|
|
|
has_flag = 0
|
|
|
return has_flag
|
|
|
|
|
@@ -828,12 +843,12 @@ def extract_has_elevator(content):
|
|
|
def extract_project_property(content, property_pattern, property_priority_dict):
    """Choose the project property label mentioned in ``content``.

    Every named group of ``property_pattern`` that takes part in a match is a
    candidate label. The candidate with the smallest value in
    ``property_priority_dict`` wins; ties go to the label found first.

    :param content: text to scan
    :param property_pattern: regex (string or compiled) whose named groups are
        the candidate labels
    :param property_priority_dict: maps a group name to its priority
        (smaller value = preferred)
    :return: the winning group name, or '新建' when no group matched
    """
    candidates = []
    for m in re.finditer(property_pattern, content):
        for label, matched_text in m.groupdict().items():
            if matched_text is not None:
                candidates.append((label, property_priority_dict.get(label)))

    if not candidates:
        return '新建'

    def _rank(candidate):
        # A label missing from the priority dict has priority None; sorting
        # None against numbers raises TypeError, so rank such labels last.
        priority = candidate[1]
        return (priority is None, 0 if priority is None else priority)

    # min() returns the first minimal element, i.e. the same winner a stable
    # sort followed by [0] would give.
    return min(candidates, key=_rank)[0]
|
|
@@ -850,7 +865,7 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
|
|
|
tables_and_divs = soup.find_all(['table', 'div'])
|
|
|
for i, sentence in enumerate(list_sentence):
|
|
|
if show and i % 100 == 0:
|
|
|
- print('extract_several_money Loop', i, len(list_sentence), time.time()-start_time1)
|
|
|
+ print('extract_several_money Loop', i, len(list_sentence), time.time() - start_time1)
|
|
|
start_time1 = time.time()
|
|
|
last_text = ''
|
|
|
next_text = ''
|
|
@@ -858,40 +873,40 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
|
|
|
text = sentence.sentence_text
|
|
|
all_before_sentence += text
|
|
|
if i > 0:
|
|
|
- last_text = list_sentence[i-1].sentence_text[-30:]
|
|
|
+ last_text = list_sentence[i - 1].sentence_text[-30:]
|
|
|
if i < len(list_sentence) - 1:
|
|
|
- next_text = list_sentence[i+1].sentence_text[:30]
|
|
|
+ next_text = list_sentence[i + 1].sentence_text[:30]
|
|
|
else:
|
|
|
text = sentence
|
|
|
all_before_sentence += text
|
|
|
if i > 0:
|
|
|
- last_text = list_sentence[i-1][-30:]
|
|
|
+ last_text = list_sentence[i - 1][-30:]
|
|
|
if i < len(list_sentence) - 1:
|
|
|
- next_text = list_sentence[i+1][:30]
|
|
|
+ next_text = list_sentence[i + 1][:30]
|
|
|
|
|
|
start_time2 = time.time()
|
|
|
- if judge_yeji(len(all_before_sentence), all_before_sentence, 300+len(text)):
|
|
|
+ if judge_yeji(len(all_before_sentence), all_before_sentence, 300 + len(text)):
|
|
|
# print('sentence yeji before ' + text)
|
|
|
continue
|
|
|
if show:
|
|
|
- print('extract_several_money time0.1', time.time()-start_time2)
|
|
|
+ print('extract_several_money time0.1', time.time() - start_time2)
|
|
|
start_time2 = time.time()
|
|
|
# if '项目概算总投资为' in text:
|
|
|
_list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs)
|
|
|
if show:
|
|
|
- print('extract_several_money time0.2', time.time()-start_time2)
|
|
|
+ print('extract_several_money time0.2', time.time() - start_time2)
|
|
|
start_time2 = time.time()
|
|
|
# logging.info('get_several_money _list ' + str(_list))
|
|
|
|
|
|
temp_list = []
|
|
|
for l in _list:
|
|
|
if l[-1] == '总投资':
|
|
|
- if re.search('业绩', last_text+text+next_text):
|
|
|
+ if re.search('业绩', last_text + text + next_text):
|
|
|
continue
|
|
|
temp_list.append(l)
|
|
|
_list = temp_list
|
|
|
if show:
|
|
|
- print('extract_several_money time0.3', time.time()-start_time2)
|
|
|
+ print('extract_several_money time0.3', time.time() - start_time2)
|
|
|
start_time2 = time.time()
|
|
|
|
|
|
money_list += _list
|
|
@@ -899,7 +914,7 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
|
|
|
# break
|
|
|
|
|
|
if show:
|
|
|
- print('extract_several_money time1', time.time()-start_time)
|
|
|
+ print('extract_several_money time1', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
|
|
|
money_type_dict = {}
|
|
@@ -925,7 +940,7 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
|
|
|
|
|
|
# logging.info('money_type_dict ' + str(money_type_dict))
|
|
|
if show:
|
|
|
- print('extract_several_money time2', time.time()-start_time)
|
|
|
+ print('extract_several_money time2', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
|
|
|
result_list = []
|
|
@@ -944,7 +959,7 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
|
|
|
result_list.append(None)
|
|
|
|
|
|
if show:
|
|
|
- print('extract_several_money time3', time.time()-start_time)
|
|
|
+ print('extract_several_money time3', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
|
|
|
for i in range(len(result_list)):
|
|
@@ -961,9 +976,9 @@ def extract_max_floor(content, html=None):
|
|
|
_floor_list = []
|
|
|
for m in _match:
|
|
|
if 'reg6' in _reg:
|
|
|
- _floor1 = content[max(0, m.span('reg6')[0]-1):m.span('reg6')[1]+1]
|
|
|
+ _floor1 = content[max(0, m.span('reg6')[0] - 1):m.span('reg6')[1] + 1]
|
|
|
elif 'reg4' in _reg:
|
|
|
- _floor1 = content[max(0, m.span('reg4')[0]-1):m.span('reg4')[1]+1]
|
|
|
+ _floor1 = content[max(0, m.span('reg4')[0] - 1):m.span('reg4')[1] + 1]
|
|
|
else:
|
|
|
_floor1 = content[m.span()[0]:m.span()[1]]
|
|
|
if judge_yeji(m.span()[0], _content, 300, _tables_and_divs, _floor1):
|
|
@@ -1003,7 +1018,7 @@ def extract_max_floor(content, html=None):
|
|
|
_floor = chinese_to_arabic(_floor)
|
|
|
_floor = int(_floor)
|
|
|
if _reg2:
|
|
|
- _floor_list2 = match_floor(_reg2, _content[m.span()[1]:m.span()[1]+35])
|
|
|
+ _floor_list2 = match_floor(_reg2, _content[m.span()[1]:m.span()[1] + 35])
|
|
|
# print('@2', _floor_list2)
|
|
|
if _floor_list2:
|
|
|
_floor2 = int(_floor_list2[0])
|
|
@@ -1087,11 +1102,11 @@ def extract_structure(content, html=None, structure_keyword_list=None):
|
|
|
match = re.finditer(reg, content)
|
|
|
for m in match:
|
|
|
structure = m.group()
|
|
|
- structure1 = content[max(0, m.span()[0]-1):m.span()[1]+1]
|
|
|
+ structure1 = content[max(0, m.span()[0] - 1):m.span()[1] + 1]
|
|
|
if judge_yeji(m.span()[0], content, 300, tables_and_divs, structure1):
|
|
|
continue
|
|
|
if structure in ['钢结构']:
|
|
|
- if re.search('公司', content[m.span()[1]:m.span()[1]+8]):
|
|
|
+ if re.search('公司', content[m.span()[1]:m.span()[1] + 8]):
|
|
|
continue
|
|
|
structure_list.append(structure)
|
|
|
|
|
@@ -1234,7 +1249,7 @@ def cut_win_bid_part(_str):
|
|
|
index_start = m.span()[0]
|
|
|
cut_str = re.split("[,,。;;]", _str[index_start:])[0]
|
|
|
if len(cut_str) < 25:
|
|
|
- cut_str = _str[index_start:index_start+25]
|
|
|
+ cut_str = _str[index_start:index_start + 25]
|
|
|
# cut_str = _str[index_start:index_start+15]
|
|
|
# print("cut_str", cut_str)
|
|
|
|
|
@@ -1307,20 +1322,21 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
|
|
|
# print('len(rows[index2])', len(row2.find_all('td')))
|
|
|
# if len(row2.find_all('td')) <= max_col_span / 2:
|
|
|
# print(re.search('业绩', str(row2)), str(row2))
|
|
|
- if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2.get_text())):
|
|
|
+ if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji,
|
|
|
+ str(row2.get_text())):
|
|
|
# logging.info('is_yeji_table 2')
|
|
|
is_yeji = 1
|
|
|
|
|
|
break
|
|
|
|
|
|
# 前面都找不到,那么找表格上方的两行
|
|
|
- div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3-2):index3]]
|
|
|
+ div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3 - 2):index3]]
|
|
|
if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
|
|
|
# logging.info('is_yeji_table 3')
|
|
|
is_yeji = 1
|
|
|
break
|
|
|
if show:
|
|
|
- print('is_yeji_table time', time.time()-start_time)
|
|
|
+ print('is_yeji_table time', time.time() - start_time)
|
|
|
return is_yeji
|
|
|
|
|
|
# 先判断表格业绩
|
|
@@ -1349,15 +1365,17 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
'''
|
|
|
@summary:拿到中文对应的数字
|
|
|
'''
|
|
|
- DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
|
|
|
- "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
|
|
|
+ DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
|
|
|
+ "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
|
|
|
return DigitsDic.get(_unit)
|
|
|
|
|
|
def getMultipleFactor(_unit):
    '''
    @summary: return the multiplier that a Chinese money/number unit character stands for
    @param _unit: a single unit character such as "万", "亿", "元", "角"
    @return: the multiplier as a Decimal, or None when the unit is not in the table
    '''
    # The fractional factors are built from strings so they are exact; this
    # yields the same values as rounding a float-constructed Decimal.
    MultipleFactor = {
        "兆": Decimal(1000000000000),
        "亿": Decimal(100000000),
        "万": Decimal(10000),
        "仟": Decimal(1000),
        "千": Decimal(1000),
        "佰": Decimal(100),
        "百": Decimal(100),
        "拾": Decimal(10),
        "十": Decimal(10),
        "元": Decimal(1),
        "圆": Decimal(1),
        "角": Decimal("0.1"),
        "分": Decimal("0.01"),
    }
    return MultipleFactor.get(_unit)
|
|
|
|
|
|
def getUnifyMoney(money):
|
|
@@ -1370,45 +1388,45 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
|
|
|
MAX_MONEY = 1000000000000
|
|
|
MAX_NUM = 12
|
|
|
- #去掉逗号
|
|
|
- money = re.sub("[,,]","",money)
|
|
|
- money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
|
|
|
+ # 去掉逗号
|
|
|
+ money = re.sub("[,,]", "", money)
|
|
|
+ money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", money)
|
|
|
result = Decimal(0)
|
|
|
chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
|
|
|
# chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
|
|
|
- chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
|
|
|
+ chnFactorUnits = ["圆", "元", "兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
|
|
|
|
|
|
LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
|
|
|
- BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
|
|
|
+ BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$" % ("".join(chnDigits)))
|
|
|
try:
|
|
|
- if re.search(LowMoneypattern,money) is not None:
|
|
|
+ if re.search(LowMoneypattern, money) is not None:
|
|
|
return Decimal(money)
|
|
|
- elif re.search(BigMoneypattern,money) is not None:
|
|
|
- return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
|
|
|
+ elif re.search(BigMoneypattern, money) is not None:
|
|
|
+ return getDigitsDic(re.search(BigMoneypattern, money).group("BigMoney"))
|
|
|
for factorUnit in chnFactorUnits:
|
|
|
- if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
|
|
|
- subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
|
|
|
- if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None:
|
|
|
- if MAX_MONEY/getMultipleFactor(factorUnit)<Decimal(subMoneys[0]):
|
|
|
+ if re.search(re.compile(".*%s.*" % (factorUnit)), money) is not None:
|
|
|
+ subMoneys = re.split(re.compile("%s(?!.*%s.*)" % (factorUnit, factorUnit)), money)
|
|
|
+ if re.search(re.compile("^(\d+)(\.\d+)?$"), subMoneys[0]) is not None:
|
|
|
+ if MAX_MONEY / getMultipleFactor(factorUnit) < Decimal(subMoneys[0]):
|
|
|
return Decimal(0)
|
|
|
- result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
|
|
|
- elif len(subMoneys[0])==1:
|
|
|
- if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
|
|
|
- result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
|
|
|
+ result += Decimal(subMoneys[0]) * (getMultipleFactor(factorUnit))
|
|
|
+ elif len(subMoneys[0]) == 1:
|
|
|
+ if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[0]) is not None:
|
|
|
+ result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
|
|
|
# subMoneys[0]中无金额单位,不可再拆分
|
|
|
- elif subMoneys[0]=="":
|
|
|
+ elif subMoneys[0] == "":
|
|
|
result += 0
|
|
|
- elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
|
|
|
+ elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
|
|
|
# print(subMoneys)
|
|
|
# subMoneys[0] = subMoneys[0][0]
|
|
|
result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
|
|
|
else:
|
|
|
- result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
|
|
|
- if len(subMoneys)>1:
|
|
|
- if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
|
|
|
+ result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
|
|
|
+ if len(subMoneys) > 1:
|
|
|
+ if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None:
|
|
|
result += Decimal(subMoneys[1])
|
|
|
- elif len(subMoneys[1])==1:
|
|
|
- if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
|
|
|
+ elif len(subMoneys[1]) == 1:
|
|
|
+ if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[1]) is not None:
|
|
|
result += Decimal(getDigitsDic(subMoneys[1]))
|
|
|
else:
|
|
|
result += Decimal(getUnifyMoney(subMoneys[1]))
|
|
@@ -1456,7 +1474,7 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
sentence_text = re.sub(re.escape(match.group()), match.group()[0] + match.group()[2:], sentence_text)
|
|
|
|
|
|
if show:
|
|
|
- print('get_several_money time2', time.time()-start_time)
|
|
|
+ print('get_several_money time2', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
|
|
|
if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
|
|
@@ -1464,14 +1482,15 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
if found_yeji >= 2: # 过滤掉业绩后面的所有金额
|
|
|
all_match = []
|
|
|
else:
|
|
|
- ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text) # 过滤掉收费标准里面的金额
|
|
|
+ ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?',
|
|
|
+ sentence_text) # 过滤掉收费标准里面的金额
|
|
|
if ser:
|
|
|
all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))
|
|
|
else:
|
|
|
all_match = re.finditer(pattern_money, sentence_text)
|
|
|
|
|
|
if show:
|
|
|
- print('get_several_money time3', time.time()-start_time)
|
|
|
+ print('get_several_money time3', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
|
|
|
for _match in all_match:
|
|
@@ -1486,7 +1505,8 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
filter_unit = False
|
|
|
notSure = False
|
|
|
science = ""
|
|
|
- if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]): # 2021/7/21过滤掉业绩后面金额
|
|
|
+ if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])',
|
|
|
+ sentence_text[:_match.span()[0]]): # 2021/7/21过滤掉业绩后面金额
|
|
|
# print('金额在业绩后面: ', _match.group(0))
|
|
|
found_yeji += 1
|
|
|
break
|
|
@@ -1529,12 +1549,14 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
|
|
|
# print('过滤掉手机号码作为金额')
|
|
|
continue
|
|
|
- elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额(万元):季勇13863441880'
|
|
|
+ elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$',
|
|
|
+ text_beforeMoney): # 过滤掉类似 '13863441880', '金额(万元):季勇13863441880'
|
|
|
# print('过滤掉手机号码作为金额')
|
|
|
continue
|
|
|
|
|
|
if unit == "": # 2021/7/21 有明显金额特征的补充单位,避免被过滤
|
|
|
- if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
|
|
|
+ if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}',
|
|
|
+ entity_text)):
|
|
|
if entity_text.endswith('万元'):
|
|
|
unit = '万元'
|
|
|
entity_text = entity_text[:-2]
|
|
@@ -1550,8 +1572,10 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
|
|
|
# print('两个金额连接后面的有单位,用后面单位')
|
|
|
unit = '万元'
|
|
|
- elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
|
|
|
- if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
|
|
|
+ elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$',
|
|
|
+ text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
|
|
|
+ if re.search('^[\d,,.]+$', entity_text) and float(
|
|
|
+ re.sub('[,,]', '', entity_text)) < 500 and re.search('万元', sentence_text):
|
|
|
unit = '万元'
|
|
|
# print('金额较小且句子中有万元的,补充单位为万元')
|
|
|
elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
|
|
@@ -1568,7 +1592,7 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
elif unit == '万元':
|
|
|
if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
|
|
|
unit = '元'
|
|
|
- elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
|
|
|
+ elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
|
|
|
unit = '元'
|
|
|
|
|
|
if unit.find("万") >= 0 and entity_text.find("万") >= 0: # 2021/7/19修改为金额文本有万,不计算单位
|
|
@@ -1625,13 +1649,13 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
|
|
|
continue
|
|
|
# print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
|
|
|
# filter, filter_unit))
|
|
|
- if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
|
|
|
+ if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text) < 1000: # 过滤掉可能是费率的金额
|
|
|
# print('过滤掉可能是费率的金额')
|
|
|
continue
|
|
|
money_list.append((entity_text, start_index, end_index, unit, notes))
|
|
|
|
|
|
if show:
|
|
|
- print('get_several_money time4', time.time()-start_time)
|
|
|
+ print('get_several_money time4', time.time() - start_time)
|
|
|
start_time = time.time()
|
|
|
|
|
|
# 排除过小的金额
|
|
@@ -1727,8 +1751,8 @@ def get_stage_pattern():
|
|
|
}
|
|
|
|
|
|
list_stage_v = []
|
|
|
- for k,v in stage_dict.items():
|
|
|
- list_stage_v.append("(?P<%s>%s)"%(k,v))
|
|
|
+ for k, v in stage_dict.items():
|
|
|
+ list_stage_v.append("(?P<%s>%s)" % (k, v))
|
|
|
stage_pattern = "|".join(list_stage_v)
|
|
|
return stage_pattern, stage_priority_dict
|
|
|
|
|
@@ -1777,7 +1801,182 @@ def get_property_pattern():
|
|
|
}
|
|
|
|
|
|
list_property_v = []
|
|
|
- for k,v in property_dict.items():
|
|
|
- list_property_v.append("(?P<%s>%s)"%(k,v))
|
|
|
+ for k, v in property_dict.items():
|
|
|
+ list_property_v.append("(?P<%s>%s)" % (k, v))
|
|
|
property_pattern = "|".join(list_property_v)
|
|
|
- return property_pattern, property_priority_dict
|
|
|
+ return property_pattern, property_priority_dict
|
|
|
+
|
|
|
+
|
|
|
class get_service_end:
    """Estimate a service period, in days, from free-text duration or deadline wording.

    ``process`` is the entry point. It understands three kinds of wording,
    tried in this order:

    1. a full end date (year, month, day)   -> days between page date and end date
    2. a year + month end date              -> same, using a nominal last day
    3. a duration ("30日", "12个月", ...)    -> number * unit length in days

    Chinese numerals in the text are converted to Arabic digits first.
    """

    # Multipliers for the positional unit characters handled by ``get_num``.
    _CN_UNIT = {
        '十': 10,
        '拾': 10,
        '百': 100,
        '佰': 100,
        '千': 1000,
        '仟': 1000}

    # A run of Chinese digit and/or unit characters.
    _CN_NUMERAL_RE = re.compile(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+')

    def __init__(self):
        # Full date: year / month / day, with 年月日 or - . / separators.
        self.pattern1 = re.compile(r"\d{4}[年\-\./]\d{1,2}[月\-\./]\d{1,2}日?")
        # Number followed (within two characters) by a duration unit.
        self.pattern2 = re.compile(r"\d+(?:\.\d+)?[\((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
        # Year + month only.
        self.pattern3 = re.compile(r"\d{4}[年\-\./]\d{1,2}月?")
        # Duration unit followed (within three characters) by a number.
        self.pattern4 = re.compile(r"(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?")
        # Value of each single Chinese digit character.
        self.DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
                          "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
                          "两": 2, '貮': 2}

    def get_num(self, text):
        """Convert the first Chinese numeral found in ``text`` to an int.

        Handles digits combined with the units 十/百/千 (and their formal
        variants), e.g. "一百二十三" -> 123.

        Returns:
            The integer value, or ``""`` when ``text`` contains no Chinese
            numeral or the numeral starts with one of 零/百/佰/千/仟.
        """
        CN_UNIT = self._CN_UNIT
        found = self._CN_NUMERAL_RE.search(text)
        if not found:
            return ""
        text = found.group()

        result = 0
        result_list = []  # completed higher-order parts, summed at the end
        unit = 0  # multiplier of the most recent unit character (0 = none yet)
        control = 0  # number of parts moved into result_list so far
        for i, d in enumerate(text):
            if d in '零百佰千仟' and i == 0:
                return ""
            if d in self.DigitsDic:
                result += self.DigitsDic[d]
            elif d in CN_UNIT:
                if unit == 0:
                    # First unit character: a bare leading unit ("十") counts as 1 * unit.
                    unit_1 = CN_UNIT[d]
                    if result == 0:
                        result = CN_UNIT[d]
                    else:
                        result *= CN_UNIT[d]
                    unit = CN_UNIT[d]
                    result_1 = result
                elif unit > CN_UNIT[d]:
                    # Smaller unit after a larger one: only the digit just
                    # before it is scaled ("一百二十" -> 100 + 2 * 10).
                    prev_digit = self.DigitsDic.get(text[i - 1])
                    if prev_digit is None:
                        # Unit directly after a unit ("一百十"): no digit to
                        # scale, so count an implied one instead of raising
                        # KeyError on the unit character.
                        result += CN_UNIT[d]
                    else:
                        result -= prev_digit
                        result += prev_digit * CN_UNIT[d]
                    unit = CN_UNIT[d]
                elif unit <= CN_UNIT[d]:
                    if (CN_UNIT[d] < unit_1) and (len(result_list) == control):
                        result_list.append(result_1)
                        result = (result - result_1) * CN_UNIT[d]
                        control += 1
                    else:
                        result *= CN_UNIT[d]
                    unit = CN_UNIT[d]
                    if len(result_list) == control:
                        unit_1 = unit
                        result_1 = result
            else:
                return ""
        return sum(result_list) + result

    def _to_arabic(self, text):
        """Replace each run of Chinese numerals in ``text`` with Arabic digits."""
        for token in self._CN_NUMERAL_RE.findall(text):
            if re.search(r"[十拾百佰千仟]", token):
                # Contains positional units: evaluate as a number.
                arabic = str(self.get_num(token))
            else:
                # Plain digit sequence (e.g. a year): map character by character.
                arabic = "".join(str(self.DigitsDic.get(ch, ch)) for ch in token)
            text = text.replace(token, arabic, 1)
        return text

    @staticmethod
    def _days_between(page_timestamp, year, month, day):
        """Return the (float) number of days from ``page_timestamp`` to the given local date."""
        end_time = "%d-%d-%d" % (year, month, day)
        end_timestamp = time.mktime(time.strptime(end_time, "%Y-%m-%d"))
        return (end_timestamp - page_timestamp) / (24 * 60 * 60)

    def _duration_days(self, service_time):
        """Return whole days for a "number + unit" duration in ``service_time``, or 0."""
        service_days = 0
        for pattern in (self.pattern2, self.pattern4):
            found = pattern.findall(service_time)
            # Only trust the pattern when it yields exactly one distinct match.
            if len(set(found)) != 1:
                continue
            match_text = found[0]
            if "月" in match_text:
                unit = 30
            elif "年" in match_text:
                unit = 365
            elif "周" in match_text or "星期" in match_text:
                unit = 7
            else:
                unit = 1
            # Keep the fractional part: the patterns accept "1.5年", so the
            # extracted number must not be truncated to "1".
            match_num = float(re.search(r"\d+(?:\.\d+)?", match_text).group())
            # A number that is an exact multiple of 365 is taken to be a day count.
            if match_num % 365 == 0:
                unit = 1
            # Discard implausibly large numbers for the unit.
            if unit == 365:
                if match_num > 10:
                    match_num = 0
            elif unit == 30:
                if match_num > 60:
                    match_num = 0
            elif unit == 1:
                if match_num > 4000:
                    match_num = 0
            service_days = int(match_num * unit)
            if service_days > 0:
                break
        return service_days

    def process(self, page_time, service_time):
        """Return the service period in days, as a string.

        Args:
            page_time: text containing the publication date as ``YYYY-M-D``
                (dash separated).
            service_time: free text describing the service period or its end date.

        Returns:
            ``str(days)``. Date-based results come from a float division and
            look like ``"363.0"``; duration-based results are whole numbers
            like ``"360"``. ``"0"`` is returned when nothing usable is found,
            when the result is negative or above 4000 days, or when any
            exception occurs while parsing.
        """
        try:
            page_time = re.search(r"\d{4}\-\d{1,2}\-\d{1,2}", page_time).group()
            service_time = self._to_arabic(service_time)

            service_days = 0
            page_timestamp = time.mktime(time.strptime(page_time, "%Y-%m-%d"))
            if self.pattern1.search(service_time):
                # Full end date: use the last one mentioned.
                end_time = self.pattern1.findall(service_time)[-1]
                end_time = re.sub(r"日", "", end_time)
                end_time = re.sub(r"[年月\./]", "-", end_time)

                _year, _month, _day = (int(part) for part in end_time.split("-"))
                if not (_year > 2050 or _year <= 2000 or _month > 12 or _month <= 0 or _day <= 0 or _day > 31):
                    # Day is capped at 28 (February) or 30 (other months);
                    # presumably so the rebuilt date always parses.
                    _day = min(28 if _month == 2 else 30, _day)
                    service_days = self._days_between(page_timestamp, _year, _month, _day)
            elif self.pattern3.search(service_time):
                # Year + month only: use a nominal last day of that month.
                end_time = self.pattern3.findall(service_time)[-1]
                end_time = re.sub(r"月", "", end_time)
                end_time = re.sub(r"[年\./]", "-", end_time)

                _year, _month = (int(part) for part in end_time.split("-"))
                if not (_year > 2050 or _year <= 2000 or _month > 12 or _month <= 0):
                    _day = 28 if _month == 2 else 30
                    service_days = self._days_between(page_timestamp, _year, _month, _day)
            elif self.pattern2.search(service_time) or self.pattern4.search(service_time):
                service_days = self._duration_days(service_time)
            elif "半年" in service_time:
                service_days = 180

            if service_days > 4000 or service_days < 0:
                service_days = 0
            return str(service_days)
        except Exception:
            # Best-effort extraction: any parsing failure means "unknown".
            return '0'
|