Просмотр исходного кода

预处理和时间分类优化

znj 11 месяцев назад
Родитель
Commit
904ad2f68d
2 изменённых файла: 28 добавлений и 42 удаления
  1. +4 -2
      BiddingKG/dl/interface/Preprocessing.py
  2. +24 -40
      BiddingKG/dl/interface/getAttributes.py

+ 4 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -2150,7 +2150,7 @@ def segment(soup,final=True):
                 else:
                     text = re.sub(punc_del,punc_del.strip()[0],text)   #2021/12/09 修正由于某些标签后插入符号把原来符号替换
             else:
-                text = re.sub(punc_del,"",text)
+                text = re.sub(punc_del," ",text) # 多个空字符替换为一个空格(防止时间类连接),后面还有对空格处理
 
     #将连续的中文句号替换为一个
     text_split = text.split("。")
@@ -2177,7 +2177,7 @@ def segment(soup,final=True):
 
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s(?!\d{2}:\d{2})","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s(?!\d{1,2}[::]\d{2}|\d{1,2}[点时])","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
         text = _text
     # 附件标识前修改为句号,避免正文和附件内容混合在一起
@@ -2976,7 +2976,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = get_preprocessed_outline(article_processed)
         # print('article_processed')
         article_processed = tableToText(article_processed)
+        # print(article_processed)
         article_processed = segment(article_processed)
+        # print(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
         # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号

+ 24 - 40
BiddingKG/dl/interface/getAttributes.py

@@ -3206,36 +3206,6 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                     # print(definite_time)
                     definite_time_list.append(definite_time)
 
-            # if t_out_of_word:
-            #     # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
-            #     day = t_out_of_word.groupdict().get('day', "")
-            #     hour = t_out_of_word.groupdict().get('hour', "")
-            #     half_hour = t_out_of_word.groupdict().get('half_hour', "")
-            #     minute = t_out_of_word.groupdict().get('minute', "")
-            #     second = t_out_of_word.groupdict().get('second', "")
-            #     if hour:
-            #         if day == '下午' and int(hour) < 12:
-            #             hour = str(int(hour) + 12)
-            #         if int(hour) > 24:
-            #             continue
-            #     else:
-            #         hour = "00"
-            #     if not minute:
-            #         if half_hour:
-            #             minute = "30"
-            #         else:
-            #             minute = "00"
-            #     if int(minute) > 60:
-            #         continue
-            #     if not second:
-            #         second = "00"
-            #     if int(second) > 60:
-            #         continue
-            #     definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
-            #     # print(definite_time)
-            #     definite_time_list.append(definite_time)
-            #
-
             min_len = min(len(extract_time),len(definite_time_list))
             for i in range(min_len):
                 if definite_time_list[i] != "00:00:00":
@@ -3259,7 +3229,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             if entity.label in [2,3,9]:
                 if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
                     dict_time['time_bidclose'].append((extract_time[0], label_prob, in_attachment))
-                if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
+                if entity.label==3 and re.search("开标|(评审|比选).{,2}(?:开始)?(时间|日期)|选取.{,2}(时间|日期)",entity_left3):
                     dict_time['time_bidopen'].append((extract_time[0], label_prob, in_attachment))
                 if entity.label==3 and re.search("报名",entity_left3):
                     dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
@@ -3276,6 +3246,14 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             if entity.label==0:
                 if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
                     dict_time['time_bidclose'].append((extract_time[0], 0.45, in_attachment))
+            if entity.label==6:
+                # "文件获取时间"和"报名时间"并列
+                if re.search("报名",entity_left3):
+                    if len(extract_time)==1:
+                        dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+                    else:
+                        dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+                        dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
 
             # 补充公告末尾处的发布时间
             if entity.label==0:
@@ -3337,11 +3315,11 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             if entity.label==0:
                 re_service = '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
                     '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
-                    '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
-                    '|交货时间|工期承诺|(服务|合同|施工|实施|工程|设计)的?(年限|期限|周期|期:)' \
+                    '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)(时间|日期)|交付\(服务、完工\)(时间|日期)' \
+                    '|交货(时间|日期)|工期承诺|(服务|合同|施工|实施|工程|设计)的?(年限|期限|周期|期:)' \
                     '|服务期限为|计划工期|工期要求|服务期限|服务期' \
-                    '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
-                    '|完成时间|服务期限|中标工期|项目周期|期限要求|供货期|合同履行日期|计划的?周期' \
+                    '|投标工期|设计工期|合格服务周期|总工期|服务(时间|日期)(范围)?|流转期限|维护期限|服务时限|交货期' \
+                    '|完成(时间|日期)|服务期限|中标工期|项目周期|期限要求|供货期|合同履行日期|计划的?周期' \
                     '|履约期限|合同约定完成时限|合同完成日期|承诺完成日期' \
                     '|合同起始日起|合同履约期|履约截止日期|承包期限|合同完成日期' \
                     '|服务期间|服务履行期|委托(管理)?期限'
@@ -3352,21 +3330,27 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                         last_time_type = ''
             # 报价/投标时间补充
             if entity.label == 0:
-                if re.search("[报竞]价.{,2}(开始|起始).{,2}时间",entity_left2):
+                if re.search("[报竞]价.{,2}(开始|起始).{,2}(时间|日期)",entity_left2):
                     entity.label = 12
                     label_prob = 0.8
-                elif re.search("[报竞]价.{,2}起止.{,2}时间",entity_left2):
+                elif re.search("[报竞]价.{,2}起止.{,2}(时间|日期)",entity_left2):
                     entity.label = 12
                     label_prob = 0.6
-                elif re.search("响应.{,2}文件([递提]交|接收).{,2}时间[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}时间[::]",entity_left2):
+                elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)[::]",entity_left2):
                     entity.label = 3
                     label_prob = 0.501
-                elif re.search("响应.{,2}文件([递提]交|接收).{,2}时间|([递提]交|接收).{,2}响应.{,2}文件.{,2}时间",entity_left2) and not re.search("截[止至]",entity_left2):
+                elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)",entity_left2) and not re.search("截[止至]",entity_left2):
                     entity.label = 12
                     label_prob = 0.51
-                elif re.search("[报竞]价.{,2}截[止至].{,2}时间",entity_left2):
+                elif re.search("[报竞]价.{,2}截[止至].{,2}(时间|日期)",entity_left2):
                     entity.label = 3
                     label_prob = 0.8
+                elif re.search("(竞价|报价).?(时间|日期)",entity_left2):
+                    entity.label = 12
+                    label_prob = 0.51
+                elif re.search("(竞价|报价).?(时间|日期)",entity_left3) and re.search("参与|报价|有意",entity_left2):
+                    entity.label = 12
+                    label_prob = 0.501
 
 
             if re.search("至|到|[日\d][-—]$|[~~]", entity_left):