Переглянути джерело

时间分类解析规则优化

znj 3 роки тому
батько
коміт
2070545bba

+ 2 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -1086,8 +1086,8 @@ def segment(soup,final=True):
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
     # 感叹号替换为中文句号
     text = re.sub("(?<=[\u4e00-\u9fa5])[!!]|[!!](?=[\u4e00-\u9fa5])","。",text)
-    #替换"?"为 " " ,update:2021/7/20
-    text = re.sub("?{1,}"," ",text)
+    #替换格式未识别的问号为" " ,update:2021/7/20
+    text = re.sub("[?\?]{2,}"," ",text)
 
 
     #替换"""为"“",否则导入deepdive出错

+ 72 - 16
BiddingKG/dl/interface/getAttributes.py

@@ -2237,7 +2237,8 @@ def my_timeFormat(_time):
     return time_list
 
 def getTimeAttributes(list_entity,list_sentence):
-    # list_entity = [i for i in list_entity if i.entity_type=='time']
+    time_entitys = [i for i in list_entity if i.entity_type=='time']
+    time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
     list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
     dict_time = {
         "time_release": [],
@@ -2256,105 +2257,160 @@ def getTimeAttributes(list_entity,list_sentence):
         'time_commencement':[] , #13 开工日期
         'time_completion': []  # 14 竣工日期
     }
-    for entity in list_entity:
-        if entity.label!=0:
-            entity_text = entity.entity_text
-            extract_time = my_timeFormat(entity_text)
-            if extract_time:
-                sentence_text = list_sentence[entity.sentence_index].sentence_text
-                entity_left = sentence_text[max(0,entity.wordOffset_begin-2):entity.wordOffset_begin]
-                entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end+3]
-                label_prob = entity.values[entity.label]
+    last_sentence_index = 0
+    last_time_type = ""
+    last_time_index = {
+        'time_bidstart':"time_bidclose",
+        'time_publicityStart':"time_publicityEnd",
+        'time_getFileStart':"time_getFileEnd",
+        'time_registrationStart':"time_registrationEnd",
+        'time_earnestMoneyStart':"time_earnestMoneyEnd",
+        'time_commencement':"time_completion",
+    }
+    for entity in time_entitys:
+        sentence_text = list_sentence[entity.sentence_index].sentence_text
+        entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
+        entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
+        label_prob = entity.values[entity.label]
+        entity_text = entity.entity_text
+        extract_time = my_timeFormat(entity_text)
+        if extract_time:
+            if re.search("至|到", entity_left):
+                if entity.sentence_index == last_sentence_index:
+                    time_type = last_time_index.get(last_time_type)
+                    if time_type:
+                        dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10))
+                        last_time_type = ""
+                        continue
+            if entity.label!=0:
                 if entity.label==1 and label_prob>0.5:
                     dict_time['time_release'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_release'
                 elif entity.label==2 and label_prob>0.5:
                     dict_time['time_bidopen'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_bidopen'
                 elif entity.label==3 and label_prob>0.5:
                     dict_time['time_bidclose'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_bidclose'
                 elif entity.label==12 and label_prob>0.5:
                     if len(extract_time)==1:
-                        if re.search("前|止|截止",entity_right) or re.search("至|止",entity_left) or re.search("前",entity_text[-2:]):
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
                             dict_time['time_bidclose'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_bidclose'
                         else:
                             dict_time['time_bidstart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_bidstart'
                     else:
                         dict_time['time_bidstart'].append((extract_time[0],label_prob))
                         dict_time['time_bidclose'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==4 and label_prob>0.5:
                     if len(extract_time)==1:
-                        if re.search("前|止|截止",entity_right) or re.search("至|止",entity_left) or re.search("前",entity_text[-2:]):
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
                             dict_time['time_publicityEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_publicityEnd'
                         else:
                             dict_time['time_publicityStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_publicityStart'
                     else:
                         dict_time['time_publicityStart'].append((extract_time[0],label_prob))
                         dict_time['time_publicityEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==5 and label_prob>0.5:
                     if len(extract_time)==1:
                         dict_time['time_publicityEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_publicityEnd'
                     else:
                         dict_time['time_publicityStart'].append((extract_time[0],label_prob))
                         dict_time['time_publicityEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==6 and label_prob>0.5:
                     if len(extract_time)==1:
-                        if re.search("前|止|截止",entity_right) or re.search("至|止",entity_left) or re.search("前",entity_text[-2:]):
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
                             dict_time['time_getFileEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_getFileEnd'
                         else:
                             dict_time['time_getFileStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_getFileStart'
                     else:
                         dict_time['time_getFileStart'].append((extract_time[0],label_prob))
                         dict_time['time_getFileEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==7 and label_prob>0.5:
                     if len(extract_time)==1:
                         dict_time['time_getFileEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_getFileEnd'
                     else:
                         dict_time['time_getFileStart'].append((extract_time[0],label_prob))
                         dict_time['time_getFileEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==8 and label_prob>0.5:
                     if len(extract_time)==1:
-                        if re.search("前|止|截止",entity_right) or re.search("至|止",entity_left) or re.search("前",entity_text[-2:]):
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
                             dict_time['time_registrationEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_registrationEnd'
                         else:
                             dict_time['time_registrationStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_registrationStart'
                     else:
                         dict_time['time_registrationStart'].append((extract_time[0],label_prob))
                         dict_time['time_registrationEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==9 and label_prob>0.5:
                     if len(extract_time)==1:
                         dict_time['time_registrationEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_registrationEnd'
                     else:
                         dict_time['time_registrationStart'].append((extract_time[0],label_prob))
                         dict_time['time_registrationEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==10 and label_prob>0.5:
                     if len(extract_time)==1:
-                        if re.search("前|止|截止",entity_right) or re.search("至|止",entity_left) or re.search("前",entity_text[-2:]):
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
                             dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_earnestMoneyEnd'
                         else:
                             dict_time['time_earnestMoneyStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_earnestMoneyStart'
                     else:
                         dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob))
                         dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==11 and label_prob>0.5:
                     if len(extract_time)==1:
                         dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_earnestMoneyEnd'
                     else:
                         dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob))
                         dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==13 and label_prob>0.5:
                     if len(extract_time)==1:
-                        if re.search("前|止|截止",entity_right) or re.search("至|止",entity_left) or re.search("前",entity_text[-2:]):
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
                             dict_time['time_completion'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_completion'
                         else:
                             dict_time['time_commencement'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_commencement'
                     else:
                         dict_time['time_commencement'].append((extract_time[0],label_prob))
                         dict_time['time_completion'].append((extract_time[1],label_prob))
+                        last_time_type = ''
                 elif entity.label==14 and label_prob>0.5:
                     if len(extract_time)==1:
                         dict_time['time_completion'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_completion'
                     else:
                         dict_time['time_commencement'].append((extract_time[0],label_prob))
                         dict_time['time_completion'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                else:
+                    last_time_type = ""
+            else:
+                last_time_type = ""
+        else:
+            last_time_type = ""
+        last_sentence_index = entity.sentence_index
 
 
     result_dict = dict((key,"") for key in dict_time.keys())