Parcourir la source

Merge remote-tracking branch 'origin/master'

lsm il y a 6 mois
Parent
commit
dca68ab72f
2 fichiers modifiés avec 30 ajouts et 10 suppressions
  1. +28 -8
      BiddingKG/dl/interface/getAttributes.py
  2. +2 -2
      BiddingKG/dl/interface/predictor.py

+ 28 - 8
BiddingKG/dl/interface/getAttributes.py

@@ -3294,6 +3294,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
     }
     time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
     time_entitys = [item for item in time_entitys if item[1]]
+    # print(time_entitys)
     for entity_idx in range(len(time_entitys)):
         entity = time_entitys[entity_idx][0]
         extract_time = time_entitys[entity_idx][1]
@@ -3330,14 +3331,16 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             t_in_word_num = len(re.findall(t,_entity_text))
             # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
             begin_index = 0
+            definite_time_idx_list = []
             for _num in range(t_in_word_num):
                 if begin_index> _entity_text_len + 8:
                     break
                 t_in_word = re.search(t, _entity_text[begin_index:])
+                # print(_entity_text[begin_index:])
                 if t_in_word:
                     if _num==0 and t_in_word.start() > _entity_text_len + 8:
                         break
-                    begin_index = t_in_word.end()
+                    begin_index += t_in_word.end()
                     # print('t_in_word',entity_text,t_in_word.groupdict())
                     day = t_in_word.groupdict().get('day',"")
                     hour = t_in_word.groupdict().get('hour',"")
@@ -3365,13 +3368,30 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                     definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
                     # print(definite_time)
                     definite_time_list.append(definite_time)
-
-            min_len = min(len(extract_time),len(definite_time_list))
-            for i in range(min_len):
-                if definite_time_list[i] == "24:00:00": # 修正不规范时间表述
-                    definite_time_list[i] = "23:59:59"
-                if definite_time_list[i] != "00:00:00":
-                    extract_time[i] = extract_time[i] + " " + definite_time_list[i]
+                    definite_time_idx_list.append([begin_index-len(t_in_word.group()),begin_index])
+
+            if len(extract_time)==1 and len(definite_time_list)>=2: # 实体只包含一个时间,"2024-12-09 09:00~16:00" 考虑单个时间对应两个详细时间段的识别
+                # 前两个详细时间的间隔
+                distance = definite_time_idx_list[1][0] - definite_time_idx_list[0][1]
+                if distance<=8 and int(definite_time_list[1][:2])>=int(definite_time_list[0][:2]): # 判断详细时间都‘小时’顺序从小到大
+                    new_extract_time = []
+                    for d_time in definite_time_list[:2]:
+                        if d_time == "24:00:00":  # 修正不规范时间表述
+                            d_time = "23:59:59"
+                        new_extract_time.append(extract_time[0] + " " + d_time)
+                    extract_time = new_extract_time
+                else:
+                    if definite_time_list[0] == "24:00:00":  # 修正不规范时间表述
+                        definite_time_list[0] = "23:59:59"
+                    if definite_time_list[0] != "00:00:00":
+                        extract_time[0] = extract_time[0] + " " + definite_time_list[0]
+            else:
+                min_len = min(len(extract_time),len(definite_time_list))
+                for i in range(min_len):
+                    if definite_time_list[i] == "24:00:00": # 修正不规范时间表述
+                        definite_time_list[i] = "23:59:59"
+                    if definite_time_list[i] != "00:00:00":
+                        extract_time[i] = extract_time[i] + " " + definite_time_list[i]
 
         if extract_time:
             # 时间变更prob优化

+ 2 - 2
BiddingKG/dl/interface/predictor.py

@@ -3304,7 +3304,7 @@ class ProductAttributesPredictor():
                         if re.search('项目名称', col0_l[i]):
                             header_list2.append(col0_l[i])
                             product = col1_l[i]
-                        elif re.search('采购需求|需求概况', col0_l[i]):
+                        elif re.search('采购需求|需求概况|招标内容|项目概况', col0_l[i]):
                             header_list2.append(col0_l[i])
                             demand = col1_l[i]
                         elif re.search('采购预算|预算金额|控制金额', col0_l[i]):
@@ -3327,7 +3327,7 @@ class ProductAttributesPredictor():
                             tenderee = re.sub("\s","",col1_l[i])
                             if len(tenderee) > 20:
                                 tenderee = ""
-                        elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
+                        elif re.search('采购时间|采购实施月份|采购月份|采购日期|预计招标时间', col0_l[i]):
                             header_list2.append(col0_l[i])
                             order_time = col1_l[i].strip()
                             order_begin, order_end = self.fix_time(order_time, html, page_time)