瀏覽代碼

serviceTime提取更新

znj 10 月之前
父節點
當前提交
8622057e26
共有 3 個文件被更改,包括 457 次插入28 次删除
  1. 1 0
      BiddingKG/dl/interface/Preprocessing.py
  2. 1 1
      BiddingKG/dl/interface/extract.py
  3. 455 27
      BiddingKG/dl/interface/getAttributes.py

+ 1 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -2998,6 +2998,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub(',最高有效报价:', ',投标报价:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
         article_processed = re.sub('例:建设银行(甲方全称)', ' ', article_processed)  # 2024/06/12 特殊表达修改 修改 481513912 金采网 附件模板导致错误提取招标人
+        article_processed = re.sub('^[,,.。;;、]+', '', article_processed)
         if web_source_no.startswith('DX002756-'):
             article_processed = re.sub('状态:(进行中|已结束)单位', ',项目单位', article_processed)  # 376225646
         if web_source_no.startswith('DX006116-') and re.search('结果公告如下:.{5,50},单位名称:', article_processed):  # 2023/11/20 特殊处理 381591924 381592533 这种提取不到情况

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -322,7 +322,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
-    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time))
+    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time,prem))
 
     '''更新单一来源招标公告中标角色为预中标'''
     getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)

+ 455 - 27
BiddingKG/dl/interface/getAttributes.py

@@ -12,6 +12,8 @@ import os
 from scipy.optimize import linear_sum_assignment
 from BiddingKG.dl.interface.Entitys import Match
 import numpy as np
+import time,calendar
+from datetime import datetime
 
 def getTheRole(entity,role_list):
     '''
@@ -3011,10 +3013,9 @@ def turnMoneySource(moneysource):
 
 my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
-import time,datetime
 def my_timeFormat(_time,page_time):
     if page_time:
-        current_year = time.strftime("%Y",time.localtime(int(datetime.datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
+        current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
     else:
         current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(my_time_format_pattern,_time)
@@ -3086,6 +3087,20 @@ def my_timeFormat(_time,page_time):
     return time_list
 
 def getTimeAttributes(list_entity,list_sentence,page_time):
+    # from BiddingKG.dl.interface.htmlparser import get_childs
+    # document_tree = parse_document.tree
+    # new_document_tree = []
+    # _data_i = -1
+    # while _data_i < len(document_tree) - 1:
+    #     _data_i += 1
+    #     _data = document_tree[_data_i]
+    #     _type = _data["type"]
+    #     if _type == "sentence":
+    #         if _data["sentence_title"] is not None:
+    #             new_document_tree.append(_data)
+    # document_tree = new_document_tree
+
+
     time_entitys = [i for i in list_entity if i.entity_type=='time']
     time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
     list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
@@ -3147,19 +3162,34 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
         'time_listingStart':"time_listingEnd",
         'time_contractStart':"time_contractEnd"
     }
-    for entity in time_entitys:
+    time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
+    time_entitys = [item for item in time_entitys if item[1]]
+    for entity_idx in range(len(time_entitys)):
+        entity = time_entitys[entity_idx][0]
+        extract_time = time_entitys[entity_idx][1]
         sentence_text = list_sentence[entity.sentence_index].sentence_text
+        previous_entity = time_entitys[entity_idx-1][0] if entity_idx!=0 else None
+        previous_extract_time = time_entitys[entity_idx-1][1] if entity_idx!=0 else None
+        next_entity = time_entitys[entity_idx+1][0] if entity_idx!=len(time_entitys)-1 else None
+        next_extract_time = time_entitys[entity_idx+1][1] if entity_idx!=len(time_entitys)-1 else None
+        # 实体有效上下文
+        entity_context_begin = previous_entity.wordOffset_end if previous_entity and previous_entity.sentence_index==entity.sentence_index else 0
+        entity_context_end = next_entity.wordOffset_begin if next_entity and next_entity.sentence_index==entity.sentence_index else len(sentence_text)
+
         if entity.sentence_index!=last_sentence_index:
             # sentence_index 不同句子重置last_time_type
             last_time_type = ""
-        entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
-        entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
-        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 25):entity.wordOffset_begin]
-        entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
+        entity_left = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 2):entity.wordOffset_begin]
+        entity_left2 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin]
+        entity_left3 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin]
+        entity_right = sentence_text[entity.wordOffset_end:min(entity.wordOffset_end + 3,entity_context_end)]
+        entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end]
+        entity_right2 = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",'',entity_right2)[:60] # 去除网址
+        # print(entity.entity_text,entity_right2)
         label_prob = entity.values[entity.label]
         entity_text = entity.entity_text
         in_attachment = entity.in_attachment
-        extract_time = my_timeFormat(entity_text,page_time)
+        # extract_time = my_timeFormat(entity_text,page_time)
         # print(entity_text,entity_left2)
         if extract_time:
             definite_time_list = []
@@ -3208,6 +3238,8 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
 
             min_len = min(len(extract_time),len(definite_time_list))
             for i in range(min_len):
+                if definite_time_list[i] == "24:00:00": # 修正不规范时间表述
+                    definite_time_list[i] = "23:59:59"
                 if definite_time_list[i] != "00:00:00":
                     extract_time[i] = extract_time[i] + " " + definite_time_list[i]
 
@@ -3228,13 +3260,13 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             # 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
             if entity.label in [2,3,9]:
                 if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
-                    dict_time['time_bidclose'].append((extract_time[0], label_prob, in_attachment))
+                    dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment))
                 if entity.label==3 and re.search("开标|(评审|比选).{,2}(?:开始)?(时间|日期)|选取.{,2}(时间|日期)",entity_left3):
-                    dict_time['time_bidopen'].append((extract_time[0], label_prob, in_attachment))
+                    dict_time['time_bidopen'].append((extract_time[0], label_prob-0.1, in_attachment))
                 if entity.label==3 and re.search("报名",entity_left3):
                     dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
                 if entity.label==9 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
-                    dict_time['time_bidclose'].append((extract_time[0], label_prob, in_attachment))
+                    dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment))
             if entity.label in [11, 3]:
                 if entity.label==11 and re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
                     dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
@@ -3254,6 +3286,73 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                     else:
                         dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
                         dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
+            # 获取文件/报名/报价 时间补充(上下文表达过长无法通过模型识别)
+            # if entity.label == 0:
+            #     if re.search("(获取|领取|售卖|出售|购买|下载).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_left3):
+            #         if len(extract_time)==2:
+            #             dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
+            #             dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment))
+            #         else:
+            #             if next_entity and next_entity.sentence_index==entity.sentence_index:
+            #                 mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
+            #                 if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1:
+            #                     dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
+            #                     dict_time['time_getFileEnd'].append((next_extract_time[0], 0.51, in_attachment))
+            #             if not dict_time['time_getFileEnd']:
+            #                 if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
+            #                     dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
+            #                 elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
+            #                     dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
+            #     if re.search("(进行|在线|线下|线上|网上).{,2}报名|报名.{,2}(开始)?(时间|日期)", entity_left3):
+            #         if len(extract_time)==2:
+            #             dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+            #             dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
+            #         else:
+            #             if next_entity and next_entity.sentence_index==entity.sentence_index:
+            #                 mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
+            #                 if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1:
+            #                     dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+            #                     dict_time['time_registrationEnd'].append((next_extract_time[0], 0.51, in_attachment))
+            #             if not dict_time['time_registrationEnd']:
+            #                 if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
+            #                     dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
+            #                 elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
+            #                     dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+            #
+            #     if re.search("(获取|售卖|出售|购买).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_right2):
+            #         if len(extract_time)==2:
+            #             dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
+            #             dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment))
+            #         else:
+            #             if previous_entity and previous_entity.sentence_index==entity.sentence_index:
+            #                 mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
+            #                 if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
+            #                     dict_time['time_getFileStart'].append((previous_extract_time[0], 0.51, in_attachment))
+            #                     dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
+            #             if not dict_time['time_getFileEnd']:
+            #                 if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
+            #                     dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
+            #                 elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
+            #                     dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
+            #     if re.search("(进行|在线|线下).{,2}报名", entity_right2):
+            #         if len(extract_time) == 2:
+            #             dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+            #             dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
+            #         else:
+            #             if previous_entity and previous_entity.sentence_index==entity.sentence_index:
+            #                 mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
+            #                 if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
+            #                     dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment))
+            #                     dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
+            #             if not dict_time['time_registrationEnd']:
+            #                 if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
+            #                     dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
+            #                 elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
+            #                     dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+            #     if re.search("(进行|开始).{,4}(报价|投标|竞价)", entity_right2):
+            #         if len(extract_time) == 2:
+            #             dict_time['time_bidstart'].append((extract_time[0], 0.51, in_attachment))
+            #             # dict_time['time_bidclose'].append((extract_time[1], 0.51, in_attachment))
 
             # 补充公告末尾处的发布时间
             if entity.label==0:
@@ -3304,6 +3403,8 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                         else:
                             dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
                             last_time_type = 'time_contractStart'
+                    last_sentence_index = entity.sentence_index
+                    continue
                 else:
                     if re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
                         # 排除开始和借宿时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
@@ -3311,6 +3412,8 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                             dict_time['time_contractStart'].append((extract_time[0], 0.6, in_attachment))
                             dict_time['time_contractEnd'].append((extract_time[1], 0.6, in_attachment))
                             last_time_type = ''
+                        last_sentence_index = entity.sentence_index
+                        continue
             # 服务期限表达补充
             if entity.label==0:
                 re_service = '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
@@ -3328,7 +3431,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                         dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
                         dict_time['time_contractEnd'].append((extract_time[1], 0.5, in_attachment))
                         last_time_type = ''
-            # 报价/投标时间补充
+            # 报价/投标时间补充(规则补充)
             if entity.label == 0:
                 if re.search("[报竞]价.{,2}(开始|起始).{,2}(时间|日期)",entity_left2):
                     entity.label = 12
@@ -3351,6 +3454,26 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                 elif re.search("(竞价|报价).?(时间|日期)",entity_left3) and re.search("参与|报价|有意",entity_left2):
                     entity.label = 12
                     label_prob = 0.501
+            # 文档结构补充
+            # if entity.label == 0:
+            #     re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|"
+            #                                  "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)")
+            #     _data_i = -1
+            #     while _data_i < len(document_tree) - 1:
+            #         _data_i += 1
+            #         _data = document_tree[_data_i]
+            #         _type = _data["type"]
+            #         _text = _data["text"].strip()
+            #         childs = get_childs([_data])
+            #         last_child = childs[-1]
+            #         if entity.sentence_index>=_data.sentence_index and entity.wordOffset_begin>=_data.wordOffset_begin and
+            #             ():
+            #             if re.search(re_registration, re.split("[::;;,]", _text)[0][:20]) is not None:
+            #
+            #                 content_text = ""
+            #                 for c in childs:
+            #                     content_text += c["text"] + ""
+            #                 print('concat_text', content_text)
 
 
             if re.search("至|到|[日\d][-—]$|[~~]", entity_left):
@@ -3496,6 +3619,88 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             last_time_type = ""
         last_sentence_index = entity.sentence_index
 
+    # Supplement some time entities using the tree structure produced by document parsing
+    def add_time_by_parseDocument(dict_time,parse_document):
+        '''
+        Try to add registration start/end times to dict_time from titled sections of the parsed document.
+
+        Only runs the section scan when 'time_registrationStart' or 'time_registrationEnd' is still empty.
+        Sections are nodes of parse_document.tree with type "sentence" and a non-None "sentence_title"
+        whose leading clause (first 15 characters) matches the registration / file-acquisition regex;
+        the text of each such node's children (via get_childs) is concatenated and scanned for dates.
+
+        NOTE(review): the only call visible below is commented out, so this helper looks unused for now.
+        NOTE(review): the body still contains debug print() calls.
+        NOTE(review): in_attachment, entity, previous_entity, previous_extract_time, sentence_text and
+        entity_text are not defined here; they resolve to the enclosing function's loop variables, i.e.
+        whatever the last iteration of the entity loop left behind (and are unbound if that loop never
+        ran). They do not describe the date matched in `text` - confirm before enabling this helper.
+
+        :param dict_time: dict mapping time-type keys to lists of (time, prob, in_attachment) tuples; mutated in place
+        :param parse_document: object exposing a `.tree` list of node dicts (presumably the htmlparser result - TODO confirm)
+        :return: the same dict_time object
+        '''
+        from BiddingKG.dl.interface.htmlparser import get_childs
+        document_tree = parse_document.tree
+        # if not dict_time['time_getFileStart'] or not dict_time['time_getFileEnd']:
+        # time_pattern = re.compile("")
+
+        concat_text_list = []
+        if not dict_time['time_registrationStart'] or not dict_time['time_registrationEnd']:
+            re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|"
+                                         "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)")
+            _data_i = -1
+            while _data_i < len(document_tree) - 1:
+                _data_i += 1
+                _data = document_tree[_data_i]
+                _type = _data["type"]
+                _text = _data["text"].strip()
+                # print(_data.keys())
+                if _type == "sentence":
+                    print('_text:',_text,_data["sentence_title"])
+                    if _data["sentence_title"] is not None:
+                        print("aptitude_pattern", _text)
+                        print(_data['sentence_index'],_data['wordOffset_begin'],_data['wordOffset_end'])
+                        # Only the first clause of the title line (at most 15 chars) is tested against the regex
+                        if re.search(re_registration, re.split("[::;;。]",_text)[0][:15]) is not None:
+                            childs = get_childs([_data])
+                            concat_text = ""
+                            for c in childs:
+                                concat_text += c["text"] + ""
+                            print('concat_text',concat_text)
+                            concat_text_list.append(concat_text)
+                            # Jump ahead by the number of collected nodes
+                            # (presumably get_childs returns the node itself plus its descendants - TODO confirm)
+                            _data_i += len(childs)-1
+                # if _type == "table":
+                #     list_table = _data["list_table"]
+                #     parent_title = _data["parent_title"]
+                #     if list_table is not None:
+                #         for line in list_table[:2]:
+                #             for cell_i in range(len(line)):
+                #                 cell = line[cell_i]
+                #                 cell_text = cell[0]
+                #                 if len(cell_text) > 120 and re.search(re_registration, cell_text) is not None:
+                #                     concat_text += cell_text + "\n"
+        print('_text',concat_text_list)
+        for text in concat_text_list:
+            time_list = re.finditer(my_time_format_pattern,text)
+            time_list = [(i,my_timeFormat(i.group(),page_time)) for i in time_list]
+            for time_idx in range(len(time_list)):
+                _time = time_list[time_idx][0]
+                extract_time = time_list[time_idx][1]
+                # Left context: text before the date, cut back to the last sentence delimiter
+                entity_left = text[:_time.start()]
+                entity_left = re.split("[。;;!??]",entity_left)[-1]
+                # entity_left2 = sentence_text[
+                #                max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin]
+                # entity_left3 = sentence_text[
+                #                max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin]
+                # Right context: text after the date, cut at the next sentence delimiter
+                entity_right = text[_time.end():]
+                entity_right = re.split("[。;;!??]",entity_right)[0]
+                # entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end]
+                entity_right2 = re.sub(r"(http[s]?://)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F])){6,}",
+                                       '', entity_right)[:60]  # remove URLs
+                print('entity_right2',entity_right2)
+                if re.search("(进行|在线|线下).{,2}报名", entity_right2):
+                    print('报名text',entity_right2)
+                    if len(extract_time) == 2:
+                        dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+                        dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
+                    else:
+                        # NOTE(review): previous_entity / entity / sentence_text below come from the enclosing scope,
+                        # not from the date matched in `text`
+                        if previous_entity and previous_entity.sentence_index==entity.sentence_index:
+                            mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
+                            if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
+                                dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment))
+                                dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
+                        if not dict_time['time_registrationEnd']:
+                            if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
+                                dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
+                            elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
+                                dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+
+
+        return dict_time
+
+    # dict_time = add_time_by_parseDocument(dict_time,parse_document)
+
     # print(dict_time)
     result_dict = dict((key,"") for key in dict_time.keys())
     for time_type,value in dict_time.items():
@@ -3504,7 +3709,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             for in_attachment in [False,True]:
                 _list_time  = [_time for _time in list_time if _time[2]==in_attachment]
                 if _list_time:
-                    _list_time.sort(key=lambda x:x[1],reverse=True)
+                    _list_time.sort(key=lambda x:(x[1],len(x[0])),reverse=True) # sort_key: label_prob,时间文本长度(优先有具体时分秒的)
                     if in_attachment==True and len(result_dict[time_type])>0:
                         break
                     result_dict[time_type] = _list_time[0][0]
@@ -3527,8 +3732,199 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
 
     return result_dict
 
+def get_days_between(day1,day2,get_abs=0):
+    '''
+    Number of calendar days from day1 to day2.
+
+    :param day1: earlier date, formatted "YYYY-MM-DD"
+    :param day2: later date, formatted "YYYY-MM-DD"
+    :param get_abs: truthy to return the absolute difference
+    :return: day2 - day1 in days (negative when day2 precedes day1 and get_abs is falsy)
+    '''
+    date_format = '%Y-%m-%d'
+    start = datetime.strptime(day1, date_format)
+    finish = datetime.strptime(day2, date_format)
+    gap = (finish - start).days
+    return abs(gap) if get_abs else gap
+
+def extract_serviceTime(service_time,page_time):
+    '''
+    Parse a free-text service/contract period into explicit dates or a day count.
+
+    Chinese numerals in the text are converted to Arabic digits first. Explicit dates are
+    tried next (year-month-day, then year-month, where the last day of the month is used);
+    a date is only accepted as service_end when it lies more than one day after page_time.
+    Otherwise a duration such as "3个月" or "半年" is converted to a number of days.
+
+    :param service_time: raw service period text
+    :param page_time: text containing the publication date as "20YY-MM-DD"; when it is None,
+                      empty or contains no such date, "2000-01-01" is used as the reference
+    :return: dict with keys "service_start" / "service_end" ("YYYY-MM-DD" or "")
+             and "service_days" ("<n>天" or "")
+    '''
+    pattern1 = re.compile(r"\d{4}[年\-\./]\d{1,2}[月\-\./]\d{1,2}日?")
+    pattern2 = re.compile(r"\d+(?:\.\d+)?[\((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
+    pattern3 = re.compile(r"\d{4}[年\-\./]\d{1,2}月?")
+    pattern4 = re.compile(r"(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?")
+    DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
+                 "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9,
+                 "两":2, '貮': 2}
+
+    def get_month_days(year, month):
+        # calendar.monthrange(year, month) returns (weekday of the first day, number of days in the month)
+        _, last_day = calendar.monthrange(year, month)
+        return last_day
+    def get_num(text):
+        # Convert the first Chinese numeral in text (digits plus 十/百/千 units) to an int; "" when unparsable
+        CN_UNIT = {'十': 10,'拾': 10,'百': 100,
+            '佰': 100,'千': 1000,'仟': 1000}
+
+        regex = re.compile(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+')
+        text = regex.search(text)
+        if text:
+            text = text.group()
+        else:
+            return ""
+        result = 0
+        result_list = []
+        unit = 0
+        control = 0
+        for i, d in enumerate(text):
+            if d in '零百佰千仟' and i == 0:
+                return ""
+            if d in DigitsDic:
+                result += DigitsDic[d]
+            elif d in CN_UNIT:
+                if unit == 0:
+                    unit_1 = CN_UNIT[d]
+                    # Handles numerals that start with or contain a bare unit, e.g. "十五" or "二十三"
+                    if result == 0:
+                        result = CN_UNIT[d]
+                    else:
+                        result *= CN_UNIT[d]
+                    unit = CN_UNIT[d]
+                    result_1 = result
+                elif unit > CN_UNIT[d]:
+                    prev_digit = DigitsDic.get(text[i - 1])
+                    if prev_digit is None:
+                        # A unit directly after another unit ("一百十"): implicit multiplier of one.
+                        # Previously this raised KeyError.
+                        result += CN_UNIT[d]
+                    else:
+                        result -= prev_digit
+                        result += prev_digit * CN_UNIT[d]
+                    unit = CN_UNIT[d]
+                elif unit <= CN_UNIT[d]:
+                    if (CN_UNIT[d] < unit_1) and (len(result_list) == control):
+                        result_list.append(result_1)
+                        result = (result - result_1) * CN_UNIT[d]
+                        control += 1
+                    else:
+                        result *= CN_UNIT[d]
+                    unit = CN_UNIT[d]
+                    if len(result_list) == control:
+                        unit_1 = unit
+                        result_1 = result
+            else:
+                return ""
+
+        return sum(result_list) + result
+
+    serviceTime_dict = {"service_start": "", "service_end": "", "service_days": ""}
+    # Replace Chinese numerals with Arabic digits so the date/duration patterns below can match
+    re_num = re.findall(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+',service_time)
+    for _num in re_num:
+        if not re.search("[十拾百佰千仟]",_num):
+            # Pure digit sequence (e.g. "二〇二四"): translate character by character
+            num = ""
+            for word in _num:
+                num += str(DigitsDic.get(word,word))
+            service_time = service_time.replace(_num,num,1)
+        else:
+            num = str(get_num(_num))
+            service_time = service_time.replace(_num,num,1)
+
+    service_days = 0
+    # page_time may be None or empty; fall back to 2000-01-01 as the reference date
+    re_page_time = re.search(r"20\d{2}-\d{2}-\d{2}", page_time or "")
+    page_time = re_page_time.group() if re_page_time else "2000-01-01"
+    if re.search(pattern1,service_time):
+        time_list = []
+        for _time in re.findall(pattern1,service_time):
+            _time = re.sub("日","",_time)
+            _time = re.sub(r"[年月\./]","-",_time)
+            _year,_month,_day = _time.split("-")
+            _month = int(_month)
+            _day = int(_day)
+            _year = int(_year)
+            if _year>2050 or _year<=2000 or _month>12 or _month<=0 or _day<=0 or _day>31:
+                continue
+            if isValidDate(_year,_month,_day):
+                time_list.append("%04d-%02d-%02d" % (_year,_month,_day))
+        if len(time_list)>=2:
+            if get_days_between(page_time,time_list[1])>1 and get_days_between(time_list[0],time_list[1])>0:
+                serviceTime_dict['service_end'] = time_list[1]
+                serviceTime_dict['service_start'] = time_list[0]
+        elif time_list:
+            # Guarded: every matched date may have been rejected above, leaving the list empty
+            if get_days_between(page_time, time_list[0]) > 1:
+                serviceTime_dict['service_end'] = time_list[0]
+    elif re.search(pattern3,service_time):
+        time_list = []
+        for _time in re.findall(pattern3,service_time):
+            _time = re.sub("月","",_time)
+            _time = re.sub(r"[年\./]","-",_time)
+            _year,_month = _time.split("-")
+            _month = int(_month)
+            _year = int(_year)
+            if _year>2050 or _year<=2000 or _month>12 or _month<=0:
+                continue
+            # Only year and month are given: use the last day of that month
+            _day = get_month_days(_year,_month)
+            if isValidDate(_year, _month, _day):
+                time_list.append("%04d-%02d-%02d" % (_year,_month,_day))
+        if len(time_list) >= 2:
+            if get_days_between(page_time, time_list[1]) > 1 and get_days_between(time_list[0], time_list[1]) > 0:
+                serviceTime_dict['service_end'] = time_list[1]
+                serviceTime_dict['service_start'] = time_list[0]
+        elif time_list:
+            if get_days_between(page_time, time_list[0]) > 1:
+                serviceTime_dict['service_end'] = time_list[0]
+    elif re.search(pattern2,service_time) or re.search(pattern4,service_time):
+        for pattern in [pattern2,pattern4]:
+            unit = 1
+            match = re.findall(pattern,service_time)
+            # Only trust the pattern when it yields a single distinct duration
+            if len(set(match))==1:
+                match_text = match[0]
+                if "月" in match_text:
+                    unit = 30
+                elif "年" in match_text:
+                    unit = 365
+                elif "周" in match_text or "星期" in match_text:
+                    unit = 7
+                # Keep the decimal part ("1.5年"); the patterns above explicitly allow it
+                match_num = float(re.search(r"\d+(?:\.\d+)?",match_text).group())
+                # A number that is a multiple of 365 is treated as a day count regardless of the unit
+                if match_num>=365 and match_num%365==0:
+                    unit = 1
+                if unit==365:
+                    if match_num>10: # unit is years: discard implausibly large numbers
+                        match_num = 0
+                elif unit==30:
+                    if match_num>60: # unit is months: discard implausibly large numbers
+                        match_num = 0
+                elif unit==1:
+                    if match_num>4000: # unit is days: discard implausibly large numbers
+                        match_num = 0
+                service_days = match_num * unit
+                # 12 months * 30 = 360: convert multiples of 360 days into whole years of 365 days
+                if int(service_days) % 360==0:
+                    service_days = service_days / 360 * 365
+                service_days = int(service_days)
+                # Out-of-range results are dropped (the original `and` could never be true)
+                if service_days <= 1 or service_days > 4000:
+                    service_days = 0
+
+                if service_days>0:
+                    service_days = str(service_days) + "天"
+                    serviceTime_dict['service_days'] = service_days
+                    break
+    elif "半年" in service_time:
+        service_days = 180
+        service_days = str(service_days) + "天"
+        serviceTime_dict['service_days'] = service_days
+
+    return serviceTime_dict
 
-def getOtherAttributes(list_entity,page_time):
+def getOtherAttributes(list_entity,page_time,prem):
     dict_other = {"moneysource":"",
                   "person_review":[],
                   "serviceTime":"",
@@ -3553,7 +3949,7 @@ def getOtherAttributes(list_entity,page_time):
             # print(entity.entity_text)
             # if list_serviceTime and entity.in_attachment:
             #     continue
-            if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
+            if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[-./]\d{1,2}", entity.entity_text):
                 list_serviceTime.append(entity)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
@@ -3562,14 +3958,25 @@ def getOtherAttributes(list_entity,page_time):
         elif entity.entity_type=='money' and entity.notes=='总投资' and float(dict_other["total_tendereeMoney"])<float(entity.entity_text):
             dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
-    if list_serviceTime:
+
+    time_contractEnd = prem[0].get("time_contractEnd","")[:10]
+    time_contractStart = prem[0].get("time_contractStart","")[:10]
+    serviceTime_dict = {"service_start":"", "service_end":"", "service_days": ""}
+    if time_contractEnd:
+        serviceTime_dict['service_end'] = time_contractEnd
+        if time_contractStart:
+            if get_days_between(time_contractStart,time_contractEnd)>0:
+                serviceTime_dict['service_start'] = time_contractStart
+    # print([i.entity_text for i in list_serviceTime])
+    if list_serviceTime and not serviceTime_dict['service_end']:
         list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1]
         list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
         # if not list_serviceTime:
         #     list_serviceTime = list_serviceTime_inAtt
         error_serviceTime = []
         for list_time in [list_serviceTime,list_serviceTime_inAtt]:
-            if not dict_other["serviceTime"]:
+            # if not dict_other["serviceTime"]:
+            if not serviceTime_dict['service_end']:
                 list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
                 for _serviceTime in list_time:
                     # 优先取具体时间(20XX年x月x日-20XX年x月x日)
@@ -3578,28 +3985,49 @@ def getOtherAttributes(list_entity,page_time):
                         if _extract_time and len(_extract_time)==2:
                             # 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
                             if _extract_time[0]!=_extract_time[1]:
-                                dict_other["serviceTime"] = _serviceTime.entity_text
+                                # dict_other["serviceTime"] = _serviceTime.entity_text
+                                # extract_time = extract_serviceTime(_serviceTime.entity_text)
+                                # if extract_time['service_end']:
+                                serviceTime_dict['service_start'] = _extract_time[0]
+                                serviceTime_dict['service_end'] = _extract_time[1]
                                 break
                             else:
                                 error_serviceTime.append(_serviceTime.entity_text)
-                if not dict_other["serviceTime"]:
+                # if not dict_other["serviceTime"]:
+                if not serviceTime_dict['service_end']:
                     for _serviceTime in list_time:
                         # 优先取具体时间(20XX年x月-20XX年x月)
                         if re.search("20\d{2}[年/.\-]\d{1,2}月?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,3}20\d{2}[年/.\-]\d{1,2}月?", _serviceTime.entity_text):
-                            dict_other["serviceTime"] = _serviceTime.entity_text
-                            break
-                if not dict_other["serviceTime"]:
+                            # dict_other["serviceTime"] = _serviceTime.entity_text
+                            extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
+                            if extract_time['service_end']:
+                                serviceTime_dict = extract_time
+                                break
+                # if not dict_other["serviceTime"]:
+                if not serviceTime_dict['service_end']:
                     for _serviceTime in list_time:
                         # 优先取具体时间(20XX年x月x日)
                         if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
                             if _serviceTime.entity_text not in error_serviceTime:
-                                dict_other["serviceTime"] = _serviceTime.entity_text
-                                break
-                if not dict_other["serviceTime"]:
+                                # dict_other["serviceTime"] = _serviceTime.entity_text
+                                extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
+                                if extract_time['service_end']:
+                                    serviceTime_dict = extract_time
+                                    break
+                # if not dict_other["serviceTime"]:
+                if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                     for _serviceTime in list_time:
                         if _serviceTime.entity_text not in error_serviceTime:
-                            dict_other["serviceTime"] = _serviceTime.entity_text
-                            break
+                            # dict_other["serviceTime"] = _serviceTime.entity_text
+                            extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
+                            if extract_time['service_end'] or extract_time['service_days']:
+                                serviceTime_dict = extract_time
+                                break
+    if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
+        service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
+        serviceTime_dict['service_days'] = str(service_days) + "天"
+    dict_other["serviceTime"] = serviceTime_dict
+
 
     if dict_other['moneysource']:
         dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])