10 ماه پیش · c2963160be
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -710,6 +710,9 @@ def tableToText(soup, docid=None):
 
				                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
			
 
				 
			
 
				         if show:
			
 
				+            print(inner_table)
			
 
				+            print("="*80)
			
 
				+
			
 
				             print("table_head before repair")
			
 
				             for r in inner_table:
			
 
				                 print('row', r)
			
@@ -725,10 +728,10 @@ def tableToText(soup, docid=None):
 
				                 inner_table[i][j] = [origin_inner_table[i][j][0], int(inner_table[i][j][1])]
			
 
				 
			
 
				         if show:
			
 
				-            print("="*80)
			
 
				             print("table_head after repair")
			
 
				             for r in inner_table:
			
 
				                 print('row', r)
			
 
				+            print("="*80)
			
 
				 
			
 
				         # 按表头分割表格
			
 
				         head_list = sliceTable(inner_table)
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -388,7 +388,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
			
 
				     log("get product done of doc_id%s"%(doc_id))
			
 
				     cost_time["product"] = round(time.time()-start_time,2)
			
 
				-    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time,prem))
			
 
				+    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time,prem,channel_dic))
			
 
				 
			
 
				     '''更新单一来源招标公告中标角色为预中标'''
			
 
				     getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -3939,7 +3939,7 @@ def extract_serviceTime(service_time,page_time):
 
				 
			
 
				     return serviceTime_dict
			
 
				 
			
 
				-def getOtherAttributes(list_entity,page_time,prem):
			
 
				+def getOtherAttributes(list_entity,page_time,prem,channel_dic):
			
 
				     dict_other = {"moneysource":"",
			
 
				                   "person_review":[],
			
 
				                   "serviceTime":"",
			
@@ -4042,7 +4042,7 @@ def getOtherAttributes(list_entity,page_time,prem):
 
				         service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
			
 
				         serviceTime_dict['service_days'] = str(service_days) + "天"
			
 
				     dict_other["serviceTime"] = serviceTime_dict
			
 
				-    if not time_contractEnd and prem[0]['docchannel']['docchannel']=='合同公告': # 用serviceTime补充合同开始结束时间,公告类型为合同公告
			
 
				+    if not time_contractEnd and channel_dic['docchannel']['docchannel']=='合同公告': # 用serviceTime补充合同开始结束时间,公告类型为合同公告
			
 
				         if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
			
 
				             prem[0]["time_contractStart"] = serviceTime_dict['service_start']
			
 
				             prem[0]["time_contractEnd"] = serviceTime_dict['service_end']
			
--- a/BiddingKG/dl/proposed_building/pb_extract.py
+++ b/BiddingKG/dl/proposed_building/pb_extract.py
@@ -23,6 +23,7 @@ class PBPredictor:
 
				 
			
 
				     def get_col_from_prem(self, prem):
			
 
				         tenderee, agency, product = None, None, None
			
 
				+        begin_time, end_time = None, None
			
 
				         for item in prem:
			
 
				             prem = item.get('prem')
			
 
				             for key in prem.keys():
			
@@ -63,68 +64,71 @@ class PBPredictor:
 
				                     project_code = None
			
 
				 
			
 
				                 start_time = time.time()
			
 
				-                stage = extract_legal_stage(project_name+doctitle, self.stage_pattern, self.stage_priority_dict, product, tenderee=tenderee, agency=agency)
			
 
				+                stage = extract_legal_stage(project_name + doctitle, self.stage_pattern, self.stage_priority_dict,
			
 
				+                                            product, tenderee=tenderee, agency=agency)
			
 
				                 if show:
			
 
				-                    print('extract_legal_stage time', time.time()-start_time)
			
 
				+                    print('extract_legal_stage time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				-                industry1 = extract_industry(doctitle+content, self.industry_pattern)
			
 
				+                industry1 = extract_industry(doctitle + content, self.industry_pattern)
			
 
				                 if show:
			
 
				-                    print('extract_industry time', time.time()-start_time)
			
 
				+                    print('extract_industry time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				-                industry = extract_industry(doctitle+content_no_att, self.industry_pattern)
			
 
				+                industry = extract_industry(doctitle + content_no_att, self.industry_pattern)
			
 
				                 if show:
			
 
				-                    print('extract_industry time', time.time()-start_time)
			
 
				+                    print('extract_industry time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				                 # print('industry', industry, industry1)
			
 
				                 if not industry and industry1:
			
 
				                     industry = industry1
			
 
				                 proportion1, proportion = extract_proportion(content)
			
 
				                 if show:
			
 
				-                    print('extract_proportion time', time.time()-start_time)
			
 
				+                    print('extract_proportion time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				                 project_digest = extract_project_digest(content)
			
 
				                 if show:
			
 
				-                    print('extract_project_digest time', time.time()-start_time)
			
 
				+                    print('extract_project_digest time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				                 project_address = extract_project_address(list_sentence, list_entity)
			
 
				                 if show:
			
 
				-                    print('extract_project_address time', time.time()-start_time)
			
 
				+                    print('extract_project_address time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				-                location = get_bid_location(doctitle+"\t"+project_name)
			
 
				+                location = get_bid_location(doctitle + "\t" + project_name)
			
 
				                 if show:
			
 
				-                    print('get_bid_location time', time.time()-start_time)
			
 
				+                    print('get_bid_location time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				-                project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee, agency)
			
 
				+                project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee,
			
 
				+                                                                                agency)
			
 
				                 if show:
			
 
				-                    print('get_project_name_refind time', time.time()-start_time)
			
 
				+                    print('get_project_name_refind time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				                 has_elevator = extract_has_elevator(content)
			
 
				                 if show:
			
 
				-                    print('extract_has_elevator time', time.time()-start_time)
			
 
				+                    print('extract_has_elevator time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				-                project_property = extract_project_property(doctitle+"\t"+project_name, self.property_pattern, self.property_priority_dict)
			
 
				+                project_property = extract_project_property(doctitle + "\t" + project_name, self.property_pattern,
			
 
				+                                                            self.property_priority_dict)
			
 
				                 if show:
			
 
				-                    print('extract_project_property time', time.time()-start_time)
			
 
				+                    print('extract_project_property time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				                 total_invest, construct_install_fee, engineer_cost = extract_several_money(list_sentence, dochtmlcon)
			
 
				                 if show:
			
 
				-                    print('extract_several_money time', time.time()-start_time)
			
 
				+                    print('extract_several_money time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				                 max_floor = extract_max_floor(content, dochtmlcon)
			
 
				                 if show:
			
 
				-                    print('extract_max_floor time', time.time()-start_time)
			
 
				+                    print('extract_max_floor time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				                 structure = extract_structure(content, dochtmlcon, self.structure_keyword_list)
			
 
				                 if show:
			
 
				-                    print('extract_structure time', time.time()-start_time)
			
 
				+                    print('extract_structure time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				                 has_steel = extract_has_steel_structure(structure)
			
 
				                 if show:
			
 
				-                    print('extract_has_steel_structure time', time.time()-start_time)
			
 
				+                    print('extract_has_steel_structure time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				-                wall_type, wall_type2 = extract_wall_type(doctitle+"\t"+project_name, content)
			
 
				+                wall_type, wall_type2 = extract_wall_type(doctitle + "\t" + project_name, content)
			
 
				                 if show:
			
 
				-                    print('extract_wall_type time', time.time()-start_time)
			
 
				+                    print('extract_wall_type time', time.time() - start_time)
			
 
				                     start_time = time.time()
			
 
				 
			
 
				                 if stage is not None:
			
@@ -191,7 +195,7 @@ def extract_legal_stage(content, _pattern, priority_dict, product='', tenderee='
 
				 
			
 
				     list_stage = []
			
 
				     for stage_search in re.finditer(_pattern, _content):
			
 
				-        for k,v in stage_search.groupdict().items():
			
 
				+        for k, v in stage_search.groupdict().items():
			
 
				             if v is not None:
			
 
				                 list_stage.append([k, priority_dict.get(k)])
			
 
				     if len(list_stage) > 0:
			
@@ -211,10 +215,10 @@ def extract_legal_stage(content, _pattern, priority_dict, product='', tenderee='
 
				         if stage == '立项阶段':
			
 
				             sub_content = re.sub('立项目', '', _content)
			
 
				             for stage_search in re.finditer(_pattern, sub_content):
			
 
				-                for k,v in stage_search.groupdict().items():
			
 
				+                for k, v in stage_search.groupdict().items():
			
 
				                     if v is not None:
			
 
				                         list_stage.append([k, priority_dict.get(k)])
			
 
				-            if len(list_stage)>0:
			
 
				+            if len(list_stage) > 0:
			
 
				                 list_stage.sort(key=lambda x: x[1])
			
 
				                 stage = list_stage[0][0]
			
 
				 
			
@@ -222,7 +226,7 @@ def extract_legal_stage(content, _pattern, priority_dict, product='', tenderee='
 
				     return None
			
 
				 
			
 
				 
			
 
				-def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min_len=3):
			
 
				+def get_project_name_refind(project_name, doctitle, tenderee='', agency='', min_len=3):
			
 
				     # 跳过部分
			
 
				     re_str11 = '网上超市|服务市场采购|印刷服务|复印纸|车辆维修和保养|商品房预售|办公家具定点|直接订购|定点议价' \
			
 
				                '|政府采购意向|信息技术服务定点议价|信息技术服务定点采购|法人章刻制中介机构|专用设备|办公设备采购' \
			
@@ -352,18 +356,21 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
 
				             project_word_in_org = []
			
 
				             for m in match:
			
 
				                 # 混淆词，设施工程中的施工
			
 
				-                if m.span()[0] > 0 and name_refind[m.span()[0]-1] in ['设']:
			
 
				+                if m.span()[0] > 0 and name_refind[m.span()[0] - 1] in ['设']:
			
 
				                     continue
			
 
				 
			
 
				                 # 判断是不是公司名里的工程
			
 
				                 if re.search(re_str26, name_refind[m.span()[1]:]):
			
 
				-                    project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
			
 
				+                    project_word_in_org.append(
			
 
				+                        name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
			
 
				                     continue
			
 
				-                if re.search(re_str17, name_refind[m.span()[1]:m.span()[1]+3]):
			
 
				-                    project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
			
 
				+                if re.search(re_str17, name_refind[m.span()[1]:m.span()[1] + 3]):
			
 
				+                    project_word_in_org.append(
			
 
				+                        name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
			
 
				                     continue
			
 
				                 if re.search(re_str18, name_refind[m.span()[1]:]):
			
 
				-                    project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
			
 
				+                    project_word_in_org.append(
			
 
				+                        name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
			
 
				                     continue
			
 
				 
			
 
				                 match_flag = True
			
@@ -377,18 +384,21 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
 
				                 last_index = 0
			
 
				                 for m in match:
			
 
				                     # 混淆词，设施工程中的施工
			
 
				-                    if m.span()[0] > 0 and name_refind[m.span()[0]-1] in ['设']:
			
 
				+                    if m.span()[0] > 0 and name_refind[m.span()[0] - 1] in ['设']:
			
 
				                         continue
			
 
				 
			
 
				                     # 判断是不是公司名里的工程
			
 
				                     if re.search(re_str26, name_refind[m.span()[1]:]):
			
 
				-                        project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
			
 
				+                        project_word_in_org.append(
			
 
				+                            name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
			
 
				                         continue
			
 
				-                    if re.search(re_str17, name_refind[m.span()[1]:m.span()[1]+3]):
			
 
				-                        project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
			
 
				+                    if re.search(re_str17, name_refind[m.span()[1]:m.span()[1] + 3]):
			
 
				+                        project_word_in_org.append(
			
 
				+                            name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
			
 
				                         continue
			
 
				                     if re.search(re_str18, name_refind[m.span()[1]:]):
			
 
				-                        project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))])
			
 
				+                        project_word_in_org.append(
			
 
				+                            name_refind[max(0, m.span()[0] - 1):min(m.span()[1] + 1, len(name_refind))])
			
 
				                         continue
			
 
				                     match_flag = True
			
 
				                     prob_name_list.append(name_refind[last_index:m.span()[1]])
			
@@ -429,7 +439,7 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
 
				                 match1 = re.finditer(re_str6, name)
			
 
				                 for m1 in match1:
			
 
				                     # 混淆词，设施工程中的施工
			
 
				-                    if m1.span()[0] > 0 and name[m1.span()[0]-1] in ['设']:
			
 
				+                    if m1.span()[0] > 0 and name[m1.span()[0] - 1] in ['设']:
			
 
				                         continue
			
 
				                     s_index, e_index = m1.span()
			
 
				                     word = name[s_index:e_index]
			
@@ -473,12 +483,12 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
 
				     for name_refind in name_refind_candidate_list:
			
 
				         # 直接判断删除数字
			
 
				         match = re.match(re_str16, name_refind)
			
 
				-        if match and not re.match('[0-9]', name_refind[match.span()[1]:match.span()[1]+1]):
			
 
				+        if match and not re.match('[0-9]', name_refind[match.span()[1]:match.span()[1] + 1]):
			
 
				             name_refind = name_refind[match.span()[1]:]
			
 
				 
			
 
				         # 删除开头奇怪数字
			
 
				         match = re.match(re_str15, name_refind)
			
 
				-        if match and not re.match('[a-zA-Z地块号]', name_refind[match.span()[1]:match.span()[1]+1]):
			
 
				+        if match and not re.match('[a-zA-Z地块号]', name_refind[match.span()[1]:match.span()[1] + 1]):
			
 
				             name_refind = name_refind[match.span()[1]:]
			
 
				 
			
 
				         # 删除期数
			
@@ -525,7 +535,7 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
 
				         # 删除区
			
 
				         match2 = re.match(re_str22, name_refind)
			
 
				         if match2:
			
 
				-            name_refind = name_refind[match2.span()[1]-1:]
			
 
				+            name_refind = name_refind[match2.span()[1] - 1:]
			
 
				 
			
 
				         # 删除'小区表达'
			
 
				         if len(name_refind) >= min_len + 2:
			
@@ -537,7 +547,8 @@ def get_project_name_refind(project_name, doctitle, tenderee='', agency= '', min
 
				         if agency in [None, 'None', '-', '']:
			
 
				             agency = ''
			
 
				         try:
			
 
				-            if len(name_refind) >= 4 and (re.search(re.escape(name_refind[-4:]), tenderee) or re.search(re.escape(name_refind[-4:]), agency)):
			
 
				+            if len(name_refind) >= 4 and (
			
 
				+                    re.search(re.escape(name_refind[-4:]), tenderee) or re.search(re.escape(name_refind[-4:]), agency)):
			
 
				                 name_refind = ''
			
 
				                 show_name_refind = ''
			
 
				         except:
			
@@ -558,14 +569,14 @@ def extract_industry(content, _pattern):
 
				     list_stage = []
			
 
				     stage_dict = {}
			
 
				     for stage_search in re.finditer(_pattern, content):
			
 
				-        for k,v in stage_search.groupdict().items():
			
 
				+        for k, v in stage_search.groupdict().items():
			
 
				             if v is not None:
			
 
				                 list_stage.append(k)
			
 
				                 if k in stage_dict.keys():
			
 
				                     stage_dict[k] += 1
			
 
				                 else:
			
 
				                     stage_dict[k] = 1
			
 
				-    if len(list_stage)>0:
			
 
				+    if len(list_stage) > 0:
			
 
				         stage_cnt_list = [[x, stage_dict.get(x)] for x in stage_dict.keys()]
			
 
				         stage_cnt_list.sort(key=lambda x: x[1], reverse=True)
			
 
				         # print('extract_industry ' + str(stage_cnt_list))
			
@@ -598,12 +609,12 @@ def extract_tenderee(list_entity):
 
				 
			
 
				 def extract_project_digest(content):
			
 
				     _pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"
			
 
				-    _pattern_search = re.search(_pattern,content)
			
 
				+    _pattern_search = re.search(_pattern, content)
			
 
				     _projectDigest = ""
			
 
				     _find = ""
			
 
				     if _pattern_search is not None:
			
 
				-        _find = _pattern_search.groupdict().get("projectDigest","")
			
 
				-    if len(_find)>0:
			
 
				+        _find = _pattern_search.groupdict().get("projectDigest", "")
			
 
				+    if len(_find) > 0:
			
 
				         _projectDigest = "。".join(_find.split("。")[0:3])
			
 
				 
			
 
				     # 截掉中标信息
			
@@ -620,7 +631,6 @@ def extract_project_address(list_sentence, list_entity):
 
				     reg3 = "(项目|建设|工程)(地址|地点)[：:]?(位于|起于)"
			
 
				     reg4 = "(项目|建设|工程)(地址|地点)[为：:]+"
			
 
				 
			
 
				-
			
 
				     address_list = []
			
 
				     candidate_list = []
			
 
				     for sentence in list_sentence:
			
@@ -637,10 +647,12 @@ def extract_project_address(list_sentence, list_entity):
 
				                         continue
			
 
				 
			
 
				                     text = p_entity.entity_text
			
 
				-                    if text == content[end_index:end_index+len(text)] or text in content[end_index:end_index+len(text)+10]:
			
 
				+                    if text == content[end_index:end_index + len(text)] or text in content[end_index:end_index + len(
			
 
				+                            text) + 10]:
			
 
				                         address_list.append(text)
			
 
				                     else:
			
 
				-                        candidate_list.append(content[max(0, end_index-10):end_index] + '@@@' + content[end_index:end_index+20] + '@@@' + text)
			
 
				+                        candidate_list.append(content[max(0, end_index - 10):end_index] + '@@@' + content[
			
 
				+                                                                                                  end_index:end_index + 20] + '@@@' + text)
			
 
				 
			
 
				             if address_list:
			
 
				                 break
			
@@ -665,17 +677,19 @@ def extract_begin_end_time(list_sentence, list_entity):
 
				         if p_entity.entity_type == "time":
			
 
				             for _sentence in list_sentence:
			
 
				                 if _sentence.sentence_index == p_entity.sentence_index:
			
 
				-                    _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
			
 
				-                    if re.search("开工(时间|日期)",_span[0]) is not None:
			
 
				+                    _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
			
 
				+                                       end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
			
 
				+                                       text=p_entity.entity_text)
			
 
				+                    if re.search("开工(时间|日期)", _span[0]) is not None:
			
 
				                         _time_temp = timeFormat(p_entity.entity_text)
			
 
				-                        if len(_time_temp)>0:
			
 
				+                        if len(_time_temp) > 0:
			
 
				                             _begin_time = _time_temp
			
 
				-                    if re.search("(竣工|完工)(时间|日期)",_span[0]) is not None:
			
 
				+                    if re.search("(竣工|完工)(时间|日期)", _span[0]) is not None:
			
 
				                         _time_temp = timeFormat(p_entity.entity_text)
			
 
				-                        if len(_time_temp)>0:
			
 
				+                        if len(_time_temp) > 0:
			
 
				                             _end_time = _time_temp
			
 
				 
			
 
				-    return _begin_time,_end_time
			
 
				+    return _begin_time, _end_time
			
 
				 
			
 
				 
			
 
				 def get_bid_location(content):
			
@@ -709,7 +723,8 @@ def get_bid_location(content):
 
				             find_flag = False
			
 
				             for entitys in list_entitys:
			
 
				                 for entity in entitys:
			
 
				-                    if entity.entity_type in ["tenderee", 'agency', 'win_tenderer', 'second_tenderer', 'third_tenderer', 'company', 'org']:
			
 
				+                    if entity.entity_type in ["tenderee", 'agency', 'win_tenderer', 'second_tenderer', 'third_tenderer',
			
 
				+                                              'company', 'org']:
			
 
				                         if location in entity.entity_text:
			
 
				                             find_flag = True
			
 
				                             break
			
@@ -727,8 +742,8 @@ def extract_proportion(content, has_preffix=True):
 
				     # log(content)
			
 
				     suffix = "[大概约为是:：【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
			
 
				     reg_dict = {
			
 
				-        0: "(?P<proportion>(总((建筑|建设)面积|长|长度))" + suffix,
			
 
				-        1: "(?P<proportion>((建筑|建设)面积|全长)" + suffix,
			
 
				+        0: "(?P<proportion>(总((建筑|建设)(面积|规模)|长|长度))" + suffix,
			
 
				+        1: "(?P<proportion>((建筑|建设)(面积|规模)|全长)" + suffix,
			
 
				         2: "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)" + suffix
			
 
				     }
			
 
				 
			
@@ -743,7 +758,7 @@ def extract_proportion(content, has_preffix=True):
 
				         # logging.info('content ' + str(content))
			
 
				         match = re.search(_pattern, str(content))
			
 
				         if match:
			
 
				-            _proportion = match.groupdict().get("proportion","")
			
 
				+            _proportion = match.groupdict().get("proportion", "")
			
 
				 
			
 
				     if not _proportion:
			
 
				         return "", ""
			
@@ -820,7 +835,7 @@ def extract_has_elevator(content):
 
				         has_flag = 1
			
 
				         if judge_yeji(match.span()[0], content):
			
 
				             has_flag = 0
			
 
				-        elif re.search('公司', content[end_index:end_index+8]):
			
 
				+        elif re.search('公司', content[end_index:end_index + 8]):
			
 
				             has_flag = 0
			
 
				     return has_flag
			
 
				 
			
@@ -828,12 +843,12 @@ def extract_has_elevator(content):
 
				 def extract_project_property(content, property_pattern, property_priority_dict):
			
 
				     property_list = []
			
 
				     for m in re.finditer(property_pattern, content):
			
 
				-        for k,v in m.groupdict().items():
			
 
				+        for k, v in m.groupdict().items():
			
 
				             if v is not None:
			
 
				                 property_list.append([k, property_priority_dict.get(k)])
			
 
				 
			
 
				     _property = '新建'
			
 
				-    if len(property_list)>0:
			
 
				+    if len(property_list) > 0:
			
 
				         property_list.sort(key=lambda x: x[1])
			
 
				         _property = property_list[0][0]
			
 
				     return _property
			
@@ -850,7 +865,7 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
 
				     tables_and_divs = soup.find_all(['table', 'div'])
			
 
				     for i, sentence in enumerate(list_sentence):
			
 
				         if show and i % 100 == 0:
			
 
				-            print('extract_several_money Loop', i, len(list_sentence), time.time()-start_time1)
			
 
				+            print('extract_several_money Loop', i, len(list_sentence), time.time() - start_time1)
			
 
				             start_time1 = time.time()
			
 
				         last_text = ''
			
 
				         next_text = ''
			
@@ -858,40 +873,40 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
 
				             text = sentence.sentence_text
			
 
				             all_before_sentence += text
			
 
				             if i > 0:
			
 
				-                last_text = list_sentence[i-1].sentence_text[-30:]
			
 
				+                last_text = list_sentence[i - 1].sentence_text[-30:]
			
 
				             if i < len(list_sentence) - 1:
			
 
				-                next_text = list_sentence[i+1].sentence_text[:30]
			
 
				+                next_text = list_sentence[i + 1].sentence_text[:30]
			
 
				         else:
			
 
				             text = sentence
			
 
				             all_before_sentence += text
			
 
				             if i > 0:
			
 
				-                last_text = list_sentence[i-1][-30:]
			
 
				+                last_text = list_sentence[i - 1][-30:]
			
 
				             if i < len(list_sentence) - 1:
			
 
				-                next_text = list_sentence[i+1][:30]
			
 
				+                next_text = list_sentence[i + 1][:30]
			
 
				 
			
 
				         start_time2 = time.time()
			
 
				-        if judge_yeji(len(all_before_sentence), all_before_sentence, 300+len(text)):
			
 
				+        if judge_yeji(len(all_before_sentence), all_before_sentence, 300 + len(text)):
			
 
				             # print('sentence yeji before ' + text)
			
 
				             continue
			
 
				         if show:
			
 
				-            print('extract_several_money time0.1', time.time()-start_time2)
			
 
				+            print('extract_several_money time0.1', time.time() - start_time2)
			
 
				             start_time2 = time.time()
			
 
				         # if '项目概算总投资为' in text:
			
 
				         _list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs)
			
 
				         if show:
			
 
				-            print('extract_several_money time0.2', time.time()-start_time2)
			
 
				+            print('extract_several_money time0.2', time.time() - start_time2)
			
 
				             start_time2 = time.time()
			
 
				         # logging.info('get_several_money _list ' + str(_list))
			
 
				 
			
 
				         temp_list = []
			
 
				         for l in _list:
			
 
				             if l[-1] == '总投资':
			
 
				-                if re.search('业绩', last_text+text+next_text):
			
 
				+                if re.search('业绩', last_text + text + next_text):
			
 
				                     continue
			
 
				             temp_list.append(l)
			
 
				         _list = temp_list
			
 
				         if show:
			
 
				-            print('extract_several_money time0.3', time.time()-start_time2)
			
 
				+            print('extract_several_money time0.3', time.time() - start_time2)
			
 
				             start_time2 = time.time()
			
 
				 
			
 
				         money_list += _list
			
@@ -899,7 +914,7 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
 
				         #     break
			
 
				 
			
 
				     if show:
			
 
				-        print('extract_several_money time1', time.time()-start_time)
			
 
				+        print('extract_several_money time1', time.time() - start_time)
			
 
				         start_time = time.time()
			
 
				 
			
 
				     money_type_dict = {}
			
@@ -925,7 +940,7 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
 
				 
			
 
				     # logging.info('money_type_dict ' + str(money_type_dict))
			
 
				     if show:
			
 
				-        print('extract_several_money time2', time.time()-start_time)
			
 
				+        print('extract_several_money time2', time.time() - start_time)
			
 
				         start_time = time.time()
			
 
				 
			
 
				     result_list = []
			
@@ -944,7 +959,7 @@ def extract_several_money(list_sentence, html='', is_obj=True, show=0):
 
				             result_list.append(None)
			
 
				 
			
 
				     if show:
			
 
				-        print('extract_several_money time3', time.time()-start_time)
			
 
				+        print('extract_several_money time3', time.time() - start_time)
			
 
				         start_time = time.time()
			
 
				 
			
 
				     for i in range(len(result_list)):
			
@@ -961,9 +976,9 @@ def extract_max_floor(content, html=None):
 
				         _floor_list = []
			
 
				         for m in _match:
			
 
				             if 'reg6' in _reg:
			
 
				-                _floor1 = content[max(0, m.span('reg6')[0]-1):m.span('reg6')[1]+1]
			
 
				+                _floor1 = content[max(0, m.span('reg6')[0] - 1):m.span('reg6')[1] + 1]
			
 
				             elif 'reg4' in _reg:
			
 
				-                _floor1 = content[max(0, m.span('reg4')[0]-1):m.span('reg4')[1]+1]
			
 
				+                _floor1 = content[max(0, m.span('reg4')[0] - 1):m.span('reg4')[1] + 1]
			
 
				             else:
			
 
				                 _floor1 = content[m.span()[0]:m.span()[1]]
			
 
				             if judge_yeji(m.span()[0], _content, 300, _tables_and_divs, _floor1):
			
@@ -1003,7 +1018,7 @@ def extract_max_floor(content, html=None):
 
				                     _floor = chinese_to_arabic(_floor)
			
 
				                 _floor = int(_floor)
			
 
				             if _reg2:
			
 
				-                _floor_list2 = match_floor(_reg2, _content[m.span()[1]:m.span()[1]+35])
			
 
				+                _floor_list2 = match_floor(_reg2, _content[m.span()[1]:m.span()[1] + 35])
			
 
				                 # print('@2', _floor_list2)
			
 
				                 if _floor_list2:
			
 
				                     _floor2 = int(_floor_list2[0])
			
@@ -1087,11 +1102,11 @@ def extract_structure(content, html=None, structure_keyword_list=None):
 
				     match = re.finditer(reg, content)
			
 
				     for m in match:
			
 
				         structure = m.group()
			
 
				-        structure1 = content[max(0, m.span()[0]-1):m.span()[1]+1]
			
 
				+        structure1 = content[max(0, m.span()[0] - 1):m.span()[1] + 1]
			
 
				         if judge_yeji(m.span()[0], content, 300, tables_and_divs, structure1):
			
 
				             continue
			
 
				         if structure in ['钢结构']:
			
 
				-            if re.search('公司', content[m.span()[1]:m.span()[1]+8]):
			
 
				+            if re.search('公司', content[m.span()[1]:m.span()[1] + 8]):
			
 
				                 continue
			
 
				         structure_list.append(structure)
			
 
				 
			
@@ -1234,7 +1249,7 @@ def cut_win_bid_part(_str):
 
				         index_start = m.span()[0]
			
 
				         cut_str = re.split("[,，。；;]", _str[index_start:])[0]
			
 
				         if len(cut_str) < 25:
			
 
				-            cut_str = _str[index_start:index_start+25]
			
 
				+            cut_str = _str[index_start:index_start + 25]
			
 
				         # cut_str = _str[index_start:index_start+15]
			
 
				         # print("cut_str", cut_str)
			
 
				 
			
@@ -1307,20 +1322,21 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
				                                     # print('len(rows[index2])', len(row2.find_all('td')))
			
 
				                                     # if len(row2.find_all('td')) <= max_col_span / 2:
			
 
				                                     #     print(re.search('业绩', str(row2)), str(row2))
			
 
				-                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2.get_text())):
			
 
				+                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji,
			
 
				+                                                                                                  str(row2.get_text())):
			
 
				                                         # logging.info('is_yeji_table 2')
			
 
				                                         is_yeji = 1
			
 
				 
			
 
				                         break
			
 
				 
			
 
				                 # 前面都找不到，那么找表格上方的两行
			
 
				-                div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3-2):index3]]
			
 
				+                div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3 - 2):index3]]
			
 
				                 if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
			
 
				                     # logging.info('is_yeji_table 3')
			
 
				                     is_yeji = 1
			
 
				                 break
			
 
				         if show:
			
 
				-            print('is_yeji_table time', time.time()-start_time)
			
 
				+            print('is_yeji_table time', time.time() - start_time)
			
 
				         return is_yeji
			
 
				 
			
 
				     # 先判断表格业绩
			
@@ -1349,15 +1365,17 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				         '''
			
 
				         @summary:拿到中文对应的数字
			
 
				         '''
			
 
				-        DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
			
 
				-                     "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
			
 
				+        DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
			
 
				+                     "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
			
 
				         return DigitsDic.get(_unit)
			
 
				 
			
 
				     def getMultipleFactor(_unit):
			
 
				         '''
			
 
				         @summary:拿到单位对应的值
			
 
				         '''
			
 
				-        MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
			
 
				+        MultipleFactor = {"兆": Decimal(1000000000000), "亿": Decimal(100000000), "万": Decimal(10000), "仟": Decimal(1000),
			
 
				+                          "千": Decimal(1000), "佰": Decimal(100), "百": Decimal(100), "拾": Decimal(10), "十": Decimal(10),
			
 
				+                          "元": Decimal(1), "圆": Decimal(1), "角": round(Decimal(0.1), 1), "分": round(Decimal(0.01), 2)}
			
 
				         return MultipleFactor.get(_unit)
			
 
				 
			
 
				     def getUnifyMoney(money):
			
@@ -1370,45 +1388,45 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				 
			
 
				         MAX_MONEY = 1000000000000
			
 
				         MAX_NUM = 12
			
 
				-        #去掉逗号
			
 
				-        money = re.sub("[，,]","",money)
			
 
				-        money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
			
 
				+        # 去掉逗号
			
 
				+        money = re.sub("[，,]", "", money)
			
 
				+        money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", money)
			
 
				         result = Decimal(0)
			
 
				         chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
			
 
				         # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
			
 
				-        chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
			
 
				+        chnFactorUnits = ["圆", "元", "兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
			
 
				 
			
 
				         LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
			
 
				-        BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
			
 
				+        BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$" % ("".join(chnDigits)))
			
 
				         try:
			
 
				-            if re.search(LowMoneypattern,money) is not None:
			
 
				+            if re.search(LowMoneypattern, money) is not None:
			
 
				                 return Decimal(money)
			
 
				-            elif re.search(BigMoneypattern,money) is not None:
			
 
				-                return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
			
 
				+            elif re.search(BigMoneypattern, money) is not None:
			
 
				+                return getDigitsDic(re.search(BigMoneypattern, money).group("BigMoney"))
			
 
				             for factorUnit in chnFactorUnits:
			
 
				-                if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
			
 
				-                    subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
			
 
				-                    if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None:
			
 
				-                        if MAX_MONEY/getMultipleFactor(factorUnit)<Decimal(subMoneys[0]):
			
 
				+                if re.search(re.compile(".*%s.*" % (factorUnit)), money) is not None:
			
 
				+                    subMoneys = re.split(re.compile("%s(?!.*%s.*)" % (factorUnit, factorUnit)), money)
			
 
				+                    if re.search(re.compile("^(\d+)(\.\d+)?$"), subMoneys[0]) is not None:
			
 
				+                        if MAX_MONEY / getMultipleFactor(factorUnit) < Decimal(subMoneys[0]):
			
 
				                             return Decimal(0)
			
 
				-                        result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
			
 
				-                    elif len(subMoneys[0])==1:
			
 
				-                        if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
			
 
				-                            result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
			
 
				+                        result += Decimal(subMoneys[0]) * (getMultipleFactor(factorUnit))
			
 
				+                    elif len(subMoneys[0]) == 1:
			
 
				+                        if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[0]) is not None:
			
 
				+                            result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
			
 
				                     # subMoneys[0]中无金额单位，不可再拆分
			
 
				-                    elif subMoneys[0]=="":
			
 
				+                    elif subMoneys[0] == "":
			
 
				                         result += 0
			
 
				-                    elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
			
 
				+                    elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
			
 
				                         # print(subMoneys)
			
 
				                         # subMoneys[0] = subMoneys[0][0]
			
 
				                         result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
			
 
				                     else:
			
 
				-                        result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
			
 
				-                    if len(subMoneys)>1:
			
 
				-                        if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
			
 
				+                        result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
			
 
				+                    if len(subMoneys) > 1:
			
 
				+                        if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None:
			
 
				                             result += Decimal(subMoneys[1])
			
 
				-                        elif len(subMoneys[1])==1:
			
 
				-                            if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
			
 
				+                        elif len(subMoneys[1]) == 1:
			
 
				+                            if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[1]) is not None:
			
 
				                                 result += Decimal(getDigitsDic(subMoneys[1]))
			
 
				                         else:
			
 
				                             result += Decimal(getUnifyMoney(subMoneys[1]))
			
@@ -1456,7 +1474,7 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				         sentence_text = re.sub(re.escape(match.group()), match.group()[0] + match.group()[2:], sentence_text)
			
 
				 
			
 
				     if show:
			
 
				-        print('get_several_money time2', time.time()-start_time)
			
 
				+        print('get_several_money time2', time.time() - start_time)
			
 
				         start_time = time.time()
			
 
				 
			
 
				     if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
			
@@ -1464,14 +1482,15 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				     if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
			
 
				         all_match = []
			
 
				     else:
			
 
				-        ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?', sentence_text)  # 过滤掉收费标准里面的金额
			
 
				+        ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?',
			
 
				+                        sentence_text)  # 过滤掉收费标准里面的金额
			
 
				         if ser:
			
 
				             all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))
			
 
				         else:
			
 
				             all_match = re.finditer(pattern_money, sentence_text)
			
 
				 
			
 
				     if show:
			
 
				-        print('get_several_money time3', time.time()-start_time)
			
 
				+        print('get_several_money time3', time.time() - start_time)
			
 
				         start_time = time.time()
			
 
				 
			
 
				     for _match in all_match:
			
@@ -1486,7 +1505,8 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				             filter_unit = False
			
 
				             notSure = False
			
 
				             science = ""
			
 
				-            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
			
 
				+            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])',
			
 
				+                         sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
			
 
				                 # print('金额在业绩后面: ', _match.group(0))
			
 
				                 found_yeji += 1
			
 
				                 break
			
@@ -1529,12 +1549,14 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				             if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
			
 
				                 # print('过滤掉手机号码作为金额')
			
 
				                 continue
			
 
				-            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search('：\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额（万元）：季勇13863441880'
			
 
				+            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search('：\w{1,3}$',
			
 
				+                                                                       text_beforeMoney):  # 过滤掉类似 '13863441880', '金额（万元）：季勇13863441880'
			
 
				                 # print('过滤掉手机号码作为金额')
			
 
				                 continue
			
 
				 
			
 
				             if unit == "":  # 2021/7/21 有明显金额特征的补充单位，避免被过滤
			
 
				-                if (re.search('(￥|¥|RMB|CNY)[:：]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
			
 
				+                if (re.search('(￥|¥|RMB|CNY)[:：]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}',
			
 
				+                                                                                    entity_text)):
			
 
				                     if entity_text.endswith('万元'):
			
 
				                         unit = '万元'
			
 
				                         entity_text = entity_text[:-2]
			
@@ -1550,8 +1572,10 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				                 elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
			
 
				                     # print('两个金额连接后面的有单位，用后面单位')
			
 
				                     unit = '万元'
			
 
				-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
			
 
				-                    if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
			
 
				+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[:：为]*-?$',
			
 
				+                               text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
			
 
				+                    if re.search('^[\d，,.]+$', entity_text) and float(
			
 
				+                            re.sub('[,，]', '', entity_text)) < 500 and re.search('万元', sentence_text):
			
 
				                         unit = '万元'
			
 
				                         # print('金额较小且句子中有万元的，补充单位为万元')
			
 
				                     elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
			
@@ -1568,7 +1592,7 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				             elif unit == '万元':
			
 
				                 if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
			
 
				                     unit = '元'
			
 
				-                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元
			
 
				+                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text):  # 五百亿以上的万元改为元
			
 
				                     unit = '元'
			
 
				 
			
 
				             if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19修改为金额文本有万，不计算单位
			
@@ -1625,13 +1649,13 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False,
 
				                 continue
			
 
				             # print("金额：{0} ,单位：{1}, 前文：{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
			
 
				             #                                                                      filter, filter_unit))
			
 
				-            if re.search('[%％‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
			
 
				+            if re.search('[%％‰折]|费率|下浮率', text_beforeMoney) and float(entity_text) < 1000:  # 过滤掉可能是费率的金额
			
 
				                 # print('过滤掉可能是费率的金额')
			
 
				                 continue
			
 
				             money_list.append((entity_text, start_index, end_index, unit, notes))
			
 
				 
			
 
				     if show:
			
 
				-        print('get_several_money time4', time.time()-start_time)
			
 
				+        print('get_several_money time4', time.time() - start_time)
			
 
				         start_time = time.time()
			
 
				 
			
 
				     # 排除过小的金额
			
@@ -1727,8 +1751,8 @@ def get_stage_pattern():
 
				     }
			
 
				 
			
 
				     list_stage_v = []
			
 
				-    for k,v in stage_dict.items():
			
 
				-        list_stage_v.append("(?P<%s>%s)"%(k,v))
			
 
				+    for k, v in stage_dict.items():
			
 
				+        list_stage_v.append("(?P<%s>%s)" % (k, v))
			
 
				     stage_pattern = "|".join(list_stage_v)
			
 
				     return stage_pattern, stage_priority_dict
			
 
				 
			
@@ -1777,7 +1801,182 @@ def get_property_pattern():
 
				     }
			
 
				 
			
 
				     list_property_v = []
			
 
				-    for k,v in property_dict.items():
			
 
				-        list_property_v.append("(?P<%s>%s)"%(k,v))
			
 
				+    for k, v in property_dict.items():
			
 
				+        list_property_v.append("(?P<%s>%s)" % (k, v))
			
 
				     property_pattern = "|".join(list_property_v)
			
 
				-    return property_pattern, property_priority_dict
			
 
				+    return property_pattern, property_priority_dict
			
 
				+
			
 
				+
			
 
				+class get_service_end:
			
 
				+    def __init__(self):
			
 
				+        self.pattern1 = re.compile("\d{4}[年\-\./]\d{1,2}[月\-\./]\d{1,2}日?")
			
 
				+        self.pattern2 = re.compile("\d+(?:\.\d+)?[\(（]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
			
 
				+        self.pattern3 = re.compile("\d{4}[年\-\./]\d{1,2}月?")
			
 
				+        self.pattern4 = re.compile("(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?")
			
 
				+        self.DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
			
 
				+                          "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
			
 
				+                          "两": 2, '貮': 2}
			
 
				+
			
 
				+    def get_num(self, text):
			
 
				+        CN_UNIT = {
			
 
				+            '十': 10,
			
 
				+            '拾': 10,
			
 
				+            '百': 100,
			
 
				+            '佰': 100,
			
 
				+            '千': 1000,
			
 
				+            '仟': 1000}
			
 
				+
			
 
				+        regex = re.compile(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+')
			
 
				+        text = regex.search(text)
			
 
				+        if text:
			
 
				+            text = text.group()
			
 
				+        else:
			
 
				+            return ""
			
 
				+        result = 0
			
 
				+        result_list = []
			
 
				+        unit = 0
			
 
				+        control = 0
			
 
				+        for i, d in enumerate(text):
			
 
				+            if d in '零百佰千仟' and i == 0:
			
 
				+                return ""
			
 
				+                break
			
 
				+            if d in self.DigitsDic:
			
 
				+                result += self.DigitsDic[d]
			
 
				+            elif d in CN_UNIT:
			
 
				+                if unit == 0:
			
 
				+                    unit_1 = CN_UNIT[d]
			
 
				+                    # 这里的处理主要是考虑到类似于二十三亿五千万这种数
			
 
				+                    if result == 0:
			
 
				+                        result = CN_UNIT[d]
			
 
				+                    else:
			
 
				+                        result *= CN_UNIT[d]
			
 
				+                    unit = CN_UNIT[d]
			
 
				+                    result_1 = result
			
 
				+                elif unit > CN_UNIT[d]:
			
 
				+                    result -= self.DigitsDic[text[i - 1]]
			
 
				+                    result += self.DigitsDic[text[i - 1]] * CN_UNIT[d]
			
 
				+                    unit = CN_UNIT[d]
			
 
				+                elif unit <= CN_UNIT[d]:
			
 
				+                    if (CN_UNIT[d] < unit_1) and (len(result_list) == control):
			
 
				+                        result_list.append(result_1)
			
 
				+                        result = (result - result_1) * CN_UNIT[d]
			
 
				+                        control += 1
			
 
				+                    else:
			
 
				+                        result *= CN_UNIT[d]
			
 
				+                    unit = CN_UNIT[d]
			
 
				+                    if len(result_list) == control:
			
 
				+                        unit_1 = unit
			
 
				+                        result_1 = result
			
 
				+            else:
			
 
				+                return ""
			
 
				+                break
			
 
				+        return sum(result_list) + result
			
 
				+
			
 
				+    def process(self, page_time, service_time):
			
 
				+        try:
			
 
				+            page_time = re.search("\d{4}\-\d{1,2}\-\d{1,2}", page_time).group()
			
 
				+
			
 
				+            re_num = re.findall(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+', service_time)
			
 
				+            for _num in re_num:
			
 
				+                if not re.search("[十拾百佰千仟]", _num):
			
 
				+                    num = ""
			
 
				+                    for word in _num:
			
 
				+                        num += str(self.DigitsDic.get(word, word))
			
 
				+                    service_time = service_time.replace(_num, num, 1)
			
 
				+                else:
			
 
				+                    num = str(self.get_num(_num))
			
 
				+                    service_time = service_time.replace(_num, num, 1)
			
 
				+
			
 
				+            end_time = ""
			
 
				+            service_days = 0
			
 
				+            page_timestamp = time.mktime(time.strptime(page_time, "%Y-%m-%d"))
			
 
				+            if re.search(self.pattern1, service_time):
			
 
				+                end_time = re.findall(self.pattern1, service_time)[-1]
			
 
				+                end_time = re.sub("日", "", end_time)
			
 
				+                end_time = re.sub("[年月\./]", "-", end_time)
			
 
				+
			
 
				+                _year, _month, _day = end_time.split("-")
			
 
				+                _month = int(_month)
			
 
				+                _day = int(_day)
			
 
				+                _year = int(_year)
			
 
				+                if _year > 2050 or _year <= 2000 or _month > 12 or _month <= 0 or _day <= 0 or _day > 31:
			
 
				+                    service_days = 0
			
 
				+                else:
			
 
				+                    if _month == 2:
			
 
				+                        _day = min(28, _day)
			
 
				+                    else:
			
 
				+                        _day = min(30, _day)
			
 
				+                    end_time = str(_year) + '-' + str(_month) + '-' + str(_day)
			
 
				+                    service_days = (time.mktime(time.strptime(end_time, "%Y-%m-%d")) - page_timestamp) / (24 * 60 * 60)
			
 
				+            elif re.search(self.pattern3, service_time):
			
 
				+                # logging.info('2')
			
 
				+                end_time = re.findall(self.pattern3, service_time)[-1]
			
 
				+                end_time = re.sub("月", "", end_time)
			
 
				+                end_time = re.sub("[年\./]", "-", end_time)
			
 
				+
			
 
				+                _year, _month = end_time.split("-")
			
 
				+                _day = 0
			
 
				+                _month = int(_month)
			
 
				+                _year = int(_year)
			
 
				+                if _year > 2050 or _year <= 2000 or _month > 12 or _month <= 0:
			
 
				+                    service_days = 0
			
 
				+                else:
			
 
				+                    if _month == 2:
			
 
				+                        _day = 28
			
 
				+                    else:
			
 
				+                        _day = 30
			
 
				+                    end_time = str(_year) + '-' + str(_month) + '-' + str(_day)
			
 
				+                    service_days = (time.mktime(time.strptime(end_time, "%Y-%m-%d")) - page_timestamp) / (24 * 60 * 60)
			
 
				+            elif re.search(self.pattern2, service_time) or re.search(self.pattern4, service_time):
			
 
				+                for pattern in [self.pattern2, self.pattern4]:
			
 
				+                    unit = 1
			
 
				+                    match = re.findall(pattern, service_time)
			
 
				+                    if len(set(match)) == 1:
			
 
				+                        match_text = match[0]
			
 
				+                        # turn_service_time = match_text
			
 
				+                        if "月" in match_text:
			
 
				+                            unit = 30
			
 
				+                        elif "年" in match_text:
			
 
				+                            unit = 365
			
 
				+                        elif "周" in match_text or "星期" in match_text:
			
 
				+                            unit = 7
			
 
				+                        match_num = float(re.search("\d+", match_text).group())
			
 
				+                        # 数字能被365整除，单位更正为天
			
 
				+                        if int(match_num) % 365 == 0:
			
 
				+                            unit = 1
			
 
				+                            # turn_service_time = str(match_num)+"天"
			
 
				+                        if unit == 365:
			
 
				+                            if match_num > 10:  # 单位为'年'时，排除数字过大的
			
 
				+                                match_num = 0
			
 
				+                        elif unit == 30:
			
 
				+                            if match_num > 60:  # 单位为'月'时，排除数字过大的
			
 
				+                                match_num = 0
			
 
				+                        elif unit == 1:
			
 
				+                            if match_num > 4000:  # 单位为'日'时，排除数字过大的
			
 
				+                                match_num = 0
			
 
				+                        service_days = match_num * unit
			
 
				+                        service_days = int(service_days)
			
 
				+                    if service_days > 0:
			
 
				+                        break
			
 
				+            elif "半年" in service_time:
			
 
				+                # turn_service_time = "半年"
			
 
				+                service_days = 180
			
 
				+
			
 
				+
			
 
				+            if service_days > 4000 or service_days < 0:
			
 
				+                service_days = 0
			
 
				+            return str(service_days)
			
 
				+
			
 
				+            # # 服务天数小于90不预测
			
 
				+            # if service_days<90 or service_days>4000:
			
 
				+            #     end_time = ""
			
 
				+            # elif not end_time and service_days!=0:
			
 
				+            #     end_time = time.strftime("%Y-%m-%d",time.localtime(page_timestamp + service_days*24*60*60))
			
 
				+            # may_begin = ""
			
 
				+            # may_end = ""
			
 
				+            # if end_time:
			
 
				+            #     return end_time
			
 
				+            # else:
			
 
				+            #     return ''
			
 
				+        except Exception as e:
			
 
				+            return '0'