Explorar o código

截标时间提取优化;时分秒提取优化

znj hai 1 ano
pai
achega
91e57ab38c
Modificáronse 2 ficheiros con 108 adicións e 35 borrados
  1. 1 1
      BiddingKG/dl/common/Utils.py
  2. 107 34
      BiddingKG/dl/interface/getAttributes.py

+ 1 - 1
BiddingKG/dl/common/Utils.py

@@ -658,7 +658,7 @@ def timeFormat(_time):
                 if re.search("^\d+$",year):
                     if len(year)==2:
                         year = "20"+year
-                    if int(year)>int(current_year):
+                    if int(year)-int(current_year)>10:
                         legal = False
                 else:
                     _year = ""

+ 107 - 34
BiddingKG/dl/interface/getAttributes.py

@@ -2978,7 +2978,7 @@ def my_timeFormat(_time):
                 if re.search("^\d+$", year):
                     if len(year) == 2:
                         year = "20" + year
-                        if int(year) > int(current_year):
+                        if int(year) - int(current_year) > 5:
                             legal = False
                     else:
                         if int(year) - int(current_year)>10:
@@ -3053,6 +3053,30 @@ def getTimeAttributes(list_entity,list_sentence):
         'time_contractStart': [],  # 18 合同开始时间
         'time_contractEnd': []  # 19 合同结束时间
     }
+
+    dict_time2label = {
+        "time_release": 1,  # 1 发布时间
+        "time_bidopen": 2,  # 2 开标时间
+        "time_bidclose": 3,  # 3 截标时间
+        'time_bidstart': 12,  # 12 投标(开始)时间、响应文件接收(开始)时间
+
+        'time_publicityStart': 4,  # 4 公示开始时间(公示时间、公示期)
+        'time_publicityEnd': 5,  # 5 公示截止时间
+        'time_getFileStart': 6,  # 6 文件获取开始时间(文件获取时间)
+        'time_getFileEnd': 7,  # 7 文件获取截止时间
+        'time_registrationStart': 8,  # 8 报名开始时间(报名时间)
+        'time_registrationEnd': 9,  # 9 报名截止时间
+        'time_earnestMoneyStart': 10,  # 10 保证金递交开始时间(保证金递交时间)
+        'time_earnestMoneyEnd': 11,  # 11 保证金递交截止时间
+        'time_commencement': 13,  # 13 开工日期
+        'time_completion': 14,  # 14 竣工日期
+        'time_listingStart': 15,  # 15 挂牌开始日期(挂牌时间)
+        'time_listingEnd': 16,  # 16 挂牌结束日期、挂牌截止日期
+        'time_signContract': 17,  # 17 合同签订时间
+        'time_contractStart': 18,  # 18 合同开始时间
+        'time_contractEnd': 19  # 19 合同结束时间
+    }
+
     last_sentence_index = 0
     last_time_type = ""
     last_time_index = {
@@ -3067,6 +3091,9 @@ def getTimeAttributes(list_entity,list_sentence):
     }
     for entity in time_entitys:
         sentence_text = list_sentence[entity.sentence_index].sentence_text
+        if entity.sentence_index!=last_sentence_index:
+            # sentence_index 不同句子重置last_time_type
+            last_time_type = ""
         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
         entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
         entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
@@ -3080,36 +3107,41 @@ def getTimeAttributes(list_entity,list_sentence):
         if extract_time:
             definite_time_list = []
             t = re.compile("(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
-            t_in_word = re.search(t,entity_text.replace(" ",""))
-            t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,sentence_text[entity.wordOffset_end:])
-            if t_in_word:
-                # print('t_in_word',entity_text,t_in_word.groupdict())
-                day = t_in_word.groupdict().get('day',"")
-                hour = t_in_word.groupdict().get('hour',"")
-                half_hour = t_in_word.groupdict().get('half_hour',"")
-                minute = t_in_word.groupdict().get('minute',"")
-                second = t_in_word.groupdict().get('second',"")
-                if hour:
-                    if day=='下午' and int(hour)<12:
-                        hour = str(int(hour)+12)
-                    if int(hour)>24:
-                        continue
-                else:
-                    hour = "00"
-                if not minute:
-                    if half_hour:
-                        minute = "30"
+            _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
+            t_in_word_num = len(re.findall(t,_entity_text))
+            t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
+            begin_index = 0
+            for _num in range(t_in_word_num):
+                t_in_word = re.search(t, _entity_text[begin_index:])
+                if t_in_word:
+                    begin_index = t_in_word.end()
+                    # print('t_in_word',entity_text,t_in_word.groupdict())
+                    day = t_in_word.groupdict().get('day',"")
+                    hour = t_in_word.groupdict().get('hour',"")
+                    half_hour = t_in_word.groupdict().get('half_hour',"")
+                    minute = t_in_word.groupdict().get('minute',"")
+                    second = t_in_word.groupdict().get('second',"")
+                    if hour:
+                        if day=='下午' and int(hour)<12:
+                            hour = str(int(hour)+12)
+                        if int(hour)>24:
+                            continue
                     else:
-                        minute = "00"
-                if int(minute)>60:
-                    continue
-                if not second:
-                    second = "00"
-                if int(second)>60:
-                    continue
-                definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
-                # print(definite_time)
-                definite_time_list.append(definite_time)
+                        hour = "00"
+                    if not minute:
+                        if half_hour:
+                            minute = "30"
+                        else:
+                            minute = "00"
+                    if int(minute)>60:
+                        continue
+                    if not second:
+                        second = "00"
+                    if int(second)>60:
+                        continue
+                    definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
+                    # print(definite_time)
+                    definite_time_list.append(definite_time)
 
             if t_out_of_word:
                 # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
@@ -3154,16 +3186,28 @@ def getTimeAttributes(list_entity,list_sentence):
                     last_index = item.start() + 1
                 label_prob = label_prob - 0.2 * last_index / len(entity_left2)
                 # print('prob优化',label_prob,extract_time)
+            elif re.search("改正|更正|修正|更改|延期",entity_left2):
+                new_label = dict_time2label.get(last_time_type,None)
+                if new_label and entity.label==0:
+                    entity.label = new_label
+                    label_prob = 1
+
             # 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
             if entity.label in [2,3,9]:
-                if entity.label==2 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                if entity.label==2 and re.search("截标|投标.{,2}截止|([提]|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
                     dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
                 if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
                     dict_time['time_bidopen'].append((extract_time[0], 0.5, in_attachment))
                 if entity.label==3 and re.search("报名",entity_left3):
                     dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
-                if entity.label==9 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                if entity.label==9 and re.search("截标|投标.{,2}截止|([提]|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
                     dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+            if entity.label in [11, 3]:
+                if entity.label==11 and re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+                if entity.label==3 and re.search("保证金.{,2}(接受|收取)|(接受|收取).{,2}保证金",entity_left3):
+                    dict_time['time_earnestMoneyEnd'].append((extract_time[0], 0.5, in_attachment))
+
             # 补充公告末尾处的发布时间
             if entity.label==0:
                 if entity.is_tail:
@@ -3237,6 +3281,24 @@ def getTimeAttributes(list_entity,list_sentence):
                         dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
                         dict_time['time_contractEnd'].append((extract_time[1], 0.5, in_attachment))
                         last_time_type = ''
+            # 报价/投标时间补充
+            if entity.label == 0:
+                if re.search("[报竞]价.{,2}(开始|起始).{,2}时间",entity_left2):
+                    entity.label = 12
+                    label_prob = 0.8
+                elif re.search("[报竞]价.{,2}起止.{,2}时间",entity_left2):
+                    entity.label = 12
+                    label_prob = 0.6
+                elif re.search("响应.{,2}文件([递提]交|接收).{,2}时间[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}时间[::]",entity_left2):
+                    entity.label = 3
+                    label_prob = 0.501
+                elif re.search("响应.{,2}文件([递提]交|接收).{,2}时间|([递提]交|接收).{,2}响应.{,2}文件.{,2}时间",entity_left2) and not re.search("截[止至]",entity_left2):
+                    entity.label = 12
+                    label_prob = 0.51
+                elif re.search("[报竞]价.{,2}截[止至].{,2}时间",entity_left2):
+                    entity.label = 3
+                    label_prob = 0.8
+
 
             if re.search("至|到|[日\d][-—]$|[~~]", entity_left):
                 if entity.sentence_index == last_sentence_index:
@@ -3254,8 +3316,13 @@ def getTimeAttributes(list_entity,list_sentence):
                     dict_time['time_bidopen'].append((extract_time[0],label_prob,in_attachment))
                     last_time_type = 'time_bidopen'
                 elif entity.label==3 and label_prob>0.5:
-                    dict_time['time_bidclose'].append((extract_time[0],label_prob,in_attachment))
-                    last_time_type = 'time_bidclose'
+                    if len(extract_time)==1:
+                        dict_time['time_bidclose'].append((extract_time[0],label_prob,in_attachment))
+                        last_time_type = 'time_bidclose'
+                    elif len(extract_time)==2:
+                        dict_time['time_bidstart'].append((extract_time[0], 0.5, in_attachment))
+                        dict_time['time_bidclose'].append((extract_time[1], label_prob, in_attachment))
+                        last_time_type = 'time_bidclose'
                 elif entity.label==12 and label_prob>0.5:
                     if len(extract_time)==1:
                         if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
@@ -3388,6 +3455,12 @@ def getTimeAttributes(list_entity,list_sentence):
                     if in_attachment==True and len(result_dict[time_type])>0:
                         break
                     result_dict[time_type] = _list_time[0][0]
+    # result_dict 纠错
+    if result_dict['time_bidstart'] and not result_dict['time_bidclose']:
+        if result_dict['time_bidstart']==result_dict['time_bidopen']:
+            result_dict['time_bidstart'] = ""
+            result_dict['time_bidclose'] = result_dict['time_bidopen']
+
     return result_dict