Pārlūkot izejas kodu

Merge remote-tracking branch 'origin/master'

luojiehua 10 mēneši atpakaļ
vecāks
revīzija
75e8a0aff8

+ 3 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -952,7 +952,7 @@ def tableToText(soup, docid=None):
             count_flag = True
             for width_index in range(width):
                 if inner_table[height][width_index][1]==0:
-                    if re.search(company_pattern,inner_table[height][width_index][0])  is not None:
+                    if re.search(company_pattern,inner_table[height][width_index][0]) is not None:
                         count_set.add(inner_table[height][width_index][0])
                     else:
                         count_flag = False
@@ -1082,7 +1082,7 @@ def tableToText(soup, docid=None):
 
                                 cell = table_occurence[i][j]
                                 head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
-                                if re.search("单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人", head):
+                                if re.search("[单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人|[利费]率|负责人|工期|服务(期限?|年限|时间|日期|周期)|(履约|履行)期限|合同(期限?|(完成|截止)(日期|时间))", head):
                                     head = cell["left_head"] + head
                                 else:
                                     head += cell["left_head"]
@@ -1127,7 +1127,7 @@ def tableToText(soup, docid=None):
 
                                 cell = table_occurence[i][j]
                                 head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
-                                if re.search("单报标限总]价|金额|成交报?价|报价", head):
+                                if re.search("[单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人|[利费]率|负责人|工期|服务(期限?|年限|时间|日期|周期)|(履约|履行)期限|合同(期限?|(完成|截止)(日期|时间))", head):
                                     head = cell["top_head"] + head
                                 else:
                                     head += cell["top_head"]

+ 17 - 13
BiddingKG/dl/interface/getAttributes.py

@@ -3768,9 +3768,9 @@ def get_days_between(day1,day2,get_abs=0):
         return days_difference
 
 def extract_serviceTime(service_time,page_time):
-    pattern1 = re.compile("\d{4}[年\-\./]\d{1,2}[月\-\./]\d{1,2}日?")
-    pattern2 = re.compile("\d+(?:\.\d+)?[\((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
-    pattern3 = re.compile("\d{4}[年\-\./]\d{1,2}月?")
+    pattern1 = re.compile("\d{4}[年\-./]\d{1,2}[月\-./]\d{1,2}日?")
+    pattern2 = re.compile("\d+(?:\.\d+)?[((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
+    pattern3 = re.compile("\d{4}[年\-./]\d{1,2}月?")
     pattern4 = re.compile("(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?")
     DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
                  "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9,
@@ -3851,7 +3851,7 @@ def extract_serviceTime(service_time,page_time):
         time_list = []
         for _time in re.findall(pattern1,service_time):
             _time = re.sub("日","",_time)
-            _time = re.sub("[年月\./]","-",_time)
+            _time = re.sub("[年月./]","-",_time)
             _year,_month,_day = _time.split("-")
             _month = int(_month)
             _day = int(_day)
@@ -3867,7 +3867,7 @@ def extract_serviceTime(service_time,page_time):
             if get_days_between(page_time,time_list[1])>1 and get_days_between(time_list[0],time_list[1])>0:
                 serviceTime_dict['service_end'] = time_list[1]
                 serviceTime_dict['service_start'] = time_list[0]
-        else:
+        elif len(time_list)==1:
             if get_days_between(page_time, time_list[0]) > 1:
                 serviceTime_dict['service_end'] = time_list[0]
             # service_days = (time.mktime(time.strptime(end_time,"%Y-%m-%d"))-page_timestamp)/(24*60*60)
@@ -3876,7 +3876,7 @@ def extract_serviceTime(service_time,page_time):
         # end_time = re.findall(pattern3,service_time)[-1]
         for _time in re.findall(pattern3,service_time):
             _time = re.sub("月","",_time)
-            _time = re.sub("[年\./]","-",_time)
+            _time = re.sub("[年./]","-",_time)
             _year,_month = _time.split("-")
             _day = 0
             _month = int(_month)
@@ -3893,7 +3893,7 @@ def extract_serviceTime(service_time,page_time):
             if get_days_between(page_time, time_list[1]) > 1 and get_days_between(time_list[0], time_list[1]) > 0:
                 serviceTime_dict['service_end'] = time_list[1]
                 serviceTime_dict['service_start'] = time_list[0]
-        else:
+        elif len(time_list)==1:
             if get_days_between(page_time, time_list[0]) > 1:
                 serviceTime_dict['service_end'] = time_list[0]
                 # service_days = (time.mktime(time.strptime(end_time,"%Y-%m-%d"))-page_timestamp)/(24*60*60)
@@ -3922,14 +3922,16 @@ def extract_serviceTime(service_time,page_time):
                 elif unit==1:
                     if match_num>4000:#单位为'日'时,排除数字过大的
                         match_num = 0
-                service_days = match_num * unit
-                if int(service_days) % 360==0:
+                service_days = int(match_num * unit)
+                if service_days % 360==0:
                     service_days = service_days / 360 * 365
+                elif service_days % 180==0 and service_days % 360!=0:
+                    service_days = service_days // 360 * 365 + 180
                 service_days = int(service_days)
                 if service_days <= 1 and service_days > 4000:
                     service_days = 0
 
-                if service_days>0:
+                if service_days>3:
                     # service_days = str(service_days) + "天"
                     serviceTime_dict['service_days'] = service_days
                     break
@@ -3943,6 +3945,10 @@ def extract_serviceTime(service_time,page_time):
 
     return serviceTime_dict
 
+def getServiceTime():
+
+    pass
+
 def getOtherAttributes(list_entity,page_time,prem,channel_dic):
     dict_other = {"moneysource":"",
                   "person_review":[],
@@ -3990,12 +3996,10 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
     if list_serviceTime and not serviceTime_dict['service_end']:
         list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1]
         list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
-        # if not list_serviceTime:
-        #     list_serviceTime = list_serviceTime_inAtt
         error_serviceTime = []
         for list_time in [list_serviceTime,list_serviceTime_inAtt]:
             # if not dict_other["serviceTime"]:
-            if not serviceTime_dict['service_end']:
+            if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                 list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
                 for _serviceTime in list_time:
                     # 优先取具体时间(20XX年x月x日-20XX年x月x日)

+ 54 - 15
BiddingKG/dl/time/re_servicetime.py

@@ -30,7 +30,8 @@ before = '(?P<before>' \
          '|履约期限|合同的?约定完成时限|合同的?完成日期|承诺完成日期' \
          '|合同起始日起|合同的?履约期|履约截止日期|承包期限|合同的?完成日期|特许经营期限' \
          '|服务期间|服务履行期|委托(管理)?期限|经营期限|数量' \
-         '|(工期|服务期限?|交货期限?|服务履行期|合同期限?|履[行约]期限?)说明|存款期限?|存款年限' \
+         '|(工期|服务期限?|交货期限?|服务履行期|合同期限?|履[行约]期限?)说明|存款期限?|(存款|存放|定存)(期|年)限|服务日期' \
+         '|服务(有效期|年限)|本?合同有效期|协议有效期|项目期限' \
          ')'
 
 
@@ -42,10 +43,10 @@ before_wuye = '(?P<before>' \
 # (履约期限、地点等简要信息.{0,25}(?= [\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+([年月日]|个月)|20[21]))
 
 before2 = '(?P<before2>' \
-          '自合同签订之日起至|合同签订之日起|自合同签订之日起|签订合同后|系统开发' \
-          '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
-          '|[自从]?合同签[订定]生效之日起|自合同签订后不超过|合同签订日至' \
-          '|合同签订生效之日起' \
+          '自合同签订[次]日起至|合同签订[次]日起|自合同签订[次]日起|签订合同后|系统开发' \
+          '|合同签订[次]日起至|自合同签订[次]日|合同签定后|自签订合同[次]日起|自合同签订起' \
+          '|[自从]?合同签[订定]生效[次]日起|自合同签订后不超过|合同签订日至' \
+          '|合同签订生效[次]日起' \
           '|本项目招标有效期|招标有效期' \
           '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
           '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|验收合格|开始服务|期限|有效期|约定){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
@@ -57,7 +58,7 @@ before2 = '(?P<before2>' \
           '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
           '|(本次)?采购周期' \
           '|(项目招标)?履行期|[自从于]?(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,3}' \
-          '|服务(有效期|年限)|本?合同有效期|(正式)?入驻(之[日后]|后|起|算)+' \
+          '|服务(有效期|年限)|本?合同有效期|协议有效期|(正式)?入驻(之[日后]|后|起|算)+' \
           '|(合同|协议书|协议)生效(之[日后]|后|起|算)+' \
           '|自?(提供服务|采购人指定|合同约定)(之[日后]|后|起|算)+' \
           '|本?项目合同期(为|是)*' \
@@ -66,6 +67,29 @@ before2 = '(?P<before2>' \
         # '|[^。]{0,4}[自从于][^。;;,]{0,15}(之[日后]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}?' \
     # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
 
+# before2 用于做开头的表达,需排除一些不合理的
+before2_first = '(?P<before2>' \
+          '自合同签订之日起至|合同签订之日起|自合同签订之日起|签订合同后' \
+          '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
+          '|[自从]?合同签[订定]生效之日起|自合同签订后不超过|合同签订日至' \
+          '|合同签订生效之日起' \
+          '|本项目招标有效期|招标有效期' \
+          '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
+          '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|验收合格|开始服务|期限|有效期|约定){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
+          '|服务要求' \
+          '|签订合同起' \
+          '|项目的有效期限为|项目服务为|签订合同期为' \
+          '|(合同|协议书)签[订定署字]生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+          '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+          '|(本次)?采购周期' \
+          '|(项目招标)?履行期|[自从于]?(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,3}' \
+          '|服务(有效期|年限)|本?合同有效期|协议有效期|(正式)?入驻(之[日后]|后|起|算)+' \
+          '|(合同|协议书|协议)生效(之[日后]|后|起|算)+' \
+          '|自?(提供服务|采购人指定|合同约定)(之[日后]|后|起|算)+' \
+          '|本?项目合同期(为|是)*' \
+          '|交付使用(之[日后]|后|起|算)+|' \
+          ')'
+
 before3 = '(?P<before3>' \
           ',?([\((](日历天|施工时间|单位)[\))]|[\((]天[\))]?|[\((]年[\))]?|[\((]月[\))]?)?' \
           ')'
@@ -136,7 +160,7 @@ reg2 = re.compile(before + before3 + before7 + charac + before5 + before2 + befo
 
 reg3 = re.compile(before + before3 + before7 + charac + before5 + before2 + after2)
 
-reg4 = re.compile(before2[:-2]+before2[-1:] + before5 + center + after)
+reg4 = re.compile(before2_first[:-2]+before2_first[-1:] + before5 + center + after)
 
 reg5 = re.compile(before + before3 + before7 + charac + before5 + before2 + before4 + before6 + center2 + after)
 
@@ -229,18 +253,18 @@ def re_service_time(text):
             prob = 0.8
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_find_all_result(reg4, input_str)
+            output_list, text_index_list = re_find_all_result(reg5, input_str)
             if TEST_MODE:
-                print("output_str, text_index reg4", output_list, text_index_list)
+                print("output_str, text_index reg5", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
-            prob = 0.5
+            prob = 0.8
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_find_all_result(reg5, input_str)
+            output_list, text_index_list = re_find_all_result(reg4, input_str)
             if TEST_MODE:
-                print("output_str, text_index reg5", output_list, text_index_list)
+                print("output_str, text_index reg4", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
-            prob = 0.8
+            prob = 0.5
 
         # 添加
         all_output_list += output_list
@@ -298,7 +322,7 @@ def filter_service_time(output_list, text_index_list):
         if not re.findall(reg_right_unit, output) and not re.match('^\d{1,3}$', output):
             delete_list.append([output, text_index_list[i]])
             continue
-        if not re.findall("[^之]日|天|年|月|周|星期", output) or re.search("\d{4}[\-\./]\d{1,2}", output):
+        if not (re.findall("[^之]日|天|年|月|周|星期", output) or re.search("\d{4}[\-\./]\d{1,2}", output)):
             delete_list.append([output, text_index_list[i]])
             continue
         # 包含不要的字
@@ -362,7 +386,22 @@ def re_find_all_result(reg, input, unit="", index=0):
         if re.search("数量",i.group()) and not re.search("[年月日天周]",input[i.start()+front_len: i.end()]):
             continue
         # 前述表达有排除词的跳过
-        if re.search("公告|发布",input[i.start():i.start()+front_len]):
+        if re.search("公告|发布|公示",input[i.start():i.start()+front_len]):
+            continue
+        # ‘服务日期’只保留x年的
+        if re.search("服务日期", input[i.start():i.start() + front_len]) \
+            and (re.search('[日月]',input[i.start()+front_len: i.end()]) or not re.search('年',input[i.start()+front_len: i.end()])):
+            continue
+        # 排除某些容易错误的表达
+        if re.search('^(自合同签订[之次]日起至|合同签订[之次]日起|自合同签订[之次]日起|签订合同后' \
+              '|合同签订[之次]日起至|自合同签订[之次]日|合同签定后|自签订合同[之次]日起|自合同签订起' \
+              '|[自从]?合同签[订定]生效[之次]日起|自合同签订后不超过|合同签订日至' \
+              '|合同签订生效[之次]日起|签订合同起' \
+              '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
+              '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|验收合格|开始服务|期限|有效期|约定){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
+              '|(合同|协议书)签[订定署字]生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+              '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+              ')',input[i.start():i.start() + front_len]):
             continue
 
         text_index.append([i.start()+front_len, i.end()])