Procházet zdrojové kódy

截标时间提取优化

znj před 1 rokem
rodič
revize
1961c4732a

+ 1 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -1428,6 +1428,7 @@ def segment(soup,final=True):
     #替换"""为"“",否则导入deepdive出错
     # text = text.replace('"',"“").replace("\r","").replace("\n",",")
     text = text.replace('"',"“").replace("\r","").replace("\n","").replace("\\n","") #2022/1/4修复 非分段\n 替换为逗号造成 公司拆分 span \n南航\n上海\n分公司
+    text = re.sub("(&nbsp)+"," ",text) # 空白符替换
     # print('==1',text)
     # text = re.sub("\s{4,}",",",text)
     # 解决公告中的" "空格替换问题

+ 22 - 5
BiddingKG/dl/interface/getAttributes.py

@@ -3106,7 +3106,7 @@ def getTimeAttributes(list_entity,list_sentence):
         # definite_time = "00:00:00"
         if extract_time:
             definite_time_list = []
-            t = re.compile("(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
+            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
             _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
             t_in_word_num = len(re.findall(t,_entity_text))
             t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
@@ -3207,6 +3207,12 @@ def getTimeAttributes(list_entity,list_sentence):
                     dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
                 if entity.label==3 and re.search("保证金.{,2}(接受|收取)|(接受|收取).{,2}保证金",entity_left3):
                     dict_time['time_earnestMoneyEnd'].append((extract_time[0], 0.5, in_attachment))
+            if entity.label in [6, 7]:
+                if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+            if entity.label==0:
+                if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
+                    dict_time['time_bidclose'].append((extract_time[0], 0.45, in_attachment))
 
             # 补充公告末尾处的发布时间
             if entity.label==0:
@@ -3456,10 +3462,21 @@ def getTimeAttributes(list_entity,list_sentence):
                         break
                     result_dict[time_type] = _list_time[0][0]
     # result_dict 纠错
-    if result_dict['time_bidstart'] and not result_dict['time_bidclose']:
-        if result_dict['time_bidstart']==result_dict['time_bidopen']:
-            result_dict['time_bidstart'] = ""
-            result_dict['time_bidclose'] = result_dict['time_bidopen']
+    if not result_dict['time_bidclose']:
+        if result_dict['time_bidstart']: # 无截标时间,投标开始和开标时间一样
+            if result_dict['time_bidstart'][:10] in result_dict['time_bidopen']:
+                result_dict['time_bidstart'] = ""
+                result_dict['time_bidclose'] = result_dict['time_bidopen']
+        if not result_dict['time_bidclose']:
+            if result_dict['time_getFileEnd']:  # 无截标时间,获取文件截止时间和开标时间一样
+                if result_dict['time_getFileEnd'][:10] in result_dict['time_bidopen']:
+                    result_dict['time_bidclose'] = result_dict['time_bidopen']
+    else:
+        if result_dict['time_bidopen']: # 截标时间 和 开标时间 时分秒互补
+            if len(result_dict['time_bidclose'])<len(result_dict['time_bidopen']) and result_dict['time_bidclose'] in result_dict['time_bidopen']:
+                result_dict['time_bidclose'] = result_dict['time_bidopen']
+            elif len(result_dict['time_bidclose'])>len(result_dict['time_bidopen']) and result_dict['time_bidopen'] in result_dict['time_bidclose']:
+                result_dict['time_bidopen'] = result_dict['time_bidclose']
 
     return result_dict