浏览代码

Merge remote-tracking branch 'origin/master'

lsm 11 月之前
父节点
当前提交
58181d0f58
共有 3 个文件被更改,包括 55 次插入和 58 次删除
  1. 5 3
      BiddingKG/dl/interface/Preprocessing.py
  2. 24 40
      BiddingKG/dl/interface/getAttributes.py
  3. 26 15
      BiddingKG/dl/interface/predictor.py

+ 5 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -1739,7 +1739,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
         head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。', ]
         # 开头匹配关键词,直接不做表头
         head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑', '低档', '高档',
-                         '更换配置']
+                         '更换配置', '各种数据']
         # 文本匹配关键词且前一列为表头,直接作为表头
         head_keyword4 = ['综合排名', '工期(交货期)', '检测批', '检测范围', '混凝土设计强检测批的容度等级',
                          '量(个)']
@@ -2150,7 +2150,7 @@ def segment(soup,final=True):
                 else:
                     text = re.sub(punc_del,punc_del.strip()[0],text)   #2021/12/09 修正由于某些标签后插入符号把原来符号替换
             else:
-                text = re.sub(punc_del,"",text)
+                text = re.sub(punc_del," ",text) # 多个空字符替换为一个空格(防止时间类连接),后面还有对空格处理
 
     #将连续的中文句号替换为一个
     text_split = text.split("。")
@@ -2177,7 +2177,7 @@ def segment(soup,final=True):
 
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s(?!\d{2}:\d{2})","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s(?!\d{1,2}[::]\d{2}|\d{1,2}[点时])","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
         text = _text
     # 附件标识前修改为句号,避免正文和附件内容混合在一起
@@ -2978,7 +2978,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = get_preprocessed_outline(article_processed)
         # print('article_processed')
         article_processed = tableToText(article_processed)
+        # print(article_processed)
         article_processed = segment(article_processed)
+        # print(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
         # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号

+ 24 - 40
BiddingKG/dl/interface/getAttributes.py

@@ -3207,36 +3207,6 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                     # print(definite_time)
                     definite_time_list.append(definite_time)
 
-            # if t_out_of_word:
-            #     # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
-            #     day = t_out_of_word.groupdict().get('day', "")
-            #     hour = t_out_of_word.groupdict().get('hour', "")
-            #     half_hour = t_out_of_word.groupdict().get('half_hour', "")
-            #     minute = t_out_of_word.groupdict().get('minute', "")
-            #     second = t_out_of_word.groupdict().get('second', "")
-            #     if hour:
-            #         if day == '下午' and int(hour) < 12:
-            #             hour = str(int(hour) + 12)
-            #         if int(hour) > 24:
-            #             continue
-            #     else:
-            #         hour = "00"
-            #     if not minute:
-            #         if half_hour:
-            #             minute = "30"
-            #         else:
-            #             minute = "00"
-            #     if int(minute) > 60:
-            #         continue
-            #     if not second:
-            #         second = "00"
-            #     if int(second) > 60:
-            #         continue
-            #     definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
-            #     # print(definite_time)
-            #     definite_time_list.append(definite_time)
-            #
-
             min_len = min(len(extract_time),len(definite_time_list))
             for i in range(min_len):
                 if definite_time_list[i] != "00:00:00":
@@ -3260,7 +3230,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             if entity.label in [2,3,9]:
                 if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
                     dict_time['time_bidclose'].append((extract_time[0], label_prob, in_attachment))
-                if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
+                if entity.label==3 and re.search("开标|(评审|比选).{,2}(?:开始)?(时间|日期)|选取.{,2}(时间|日期)",entity_left3):
                     dict_time['time_bidopen'].append((extract_time[0], label_prob, in_attachment))
                 if entity.label==3 and re.search("报名",entity_left3):
                     dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
@@ -3277,6 +3247,14 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             if entity.label==0:
                 if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
                     dict_time['time_bidclose'].append((extract_time[0], 0.45, in_attachment))
+            if entity.label==6:
+                # "文件获取时间"和"报名时间"并列
+                if re.search("报名",entity_left3):
+                    if len(extract_time)==1:
+                        dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+                    else:
+                        dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
+                        dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
 
             # 补充公告末尾处的发布时间
             if entity.label==0:
@@ -3338,11 +3316,11 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             if entity.label==0:
                 re_service = '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
                     '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
-                    '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
-                    '|交货时间|工期承诺|(服务|合同|施工|实施|工程|设计)的?(年限|期限|周期|期:)' \
+                    '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)(时间|日期)|交付\(服务、完工\)(时间|日期)' \
+                    '|交货(时间|日期)|工期承诺|(服务|合同|施工|实施|工程|设计)的?(年限|期限|周期|期:)' \
                     '|服务期限为|计划工期|工期要求|服务期限|服务期' \
-                    '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
-                    '|完成时间|服务期限|中标工期|项目周期|期限要求|供货期|合同履行日期|计划的?周期' \
+                    '|投标工期|设计工期|合格服务周期|总工期|服务(时间|日期)(范围)?|流转期限|维护期限|服务时限|交货期' \
+                    '|完成(时间|日期)|服务期限|中标工期|项目周期|期限要求|供货期|合同履行日期|计划的?周期' \
                     '|履约期限|合同约定完成时限|合同完成日期|承诺完成日期' \
                     '|合同起始日起|合同履约期|履约截止日期|承包期限|合同完成日期' \
                     '|服务期间|服务履行期|委托(管理)?期限'
@@ -3353,21 +3331,27 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                         last_time_type = ''
             # 报价/投标时间补充
             if entity.label == 0:
-                if re.search("[报竞]价.{,2}(开始|起始).{,2}时间",entity_left2):
+                if re.search("[报竞]价.{,2}(开始|起始).{,2}(时间|日期)",entity_left2):
                     entity.label = 12
                     label_prob = 0.8
-                elif re.search("[报竞]价.{,2}起止.{,2}时间",entity_left2):
+                elif re.search("[报竞]价.{,2}起止.{,2}(时间|日期)",entity_left2):
                     entity.label = 12
                     label_prob = 0.6
-                elif re.search("响应.{,2}文件([递提]交|接收).{,2}时间[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}时间[::]",entity_left2):
+                elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)[::]",entity_left2):
                     entity.label = 3
                     label_prob = 0.501
-                elif re.search("响应.{,2}文件([递提]交|接收).{,2}时间|([递提]交|接收).{,2}响应.{,2}文件.{,2}时间",entity_left2) and not re.search("截[止至]",entity_left2):
+                elif re.search("响应.{,2}文件([递提]交|接收).{,2}(时间|日期)|([递提]交|接收).{,2}响应.{,2}文件.{,2}(时间|日期)",entity_left2) and not re.search("截[止至]",entity_left2):
                     entity.label = 12
                     label_prob = 0.51
-                elif re.search("[报竞]价.{,2}截[止至].{,2}时间",entity_left2):
+                elif re.search("[报竞]价.{,2}截[止至].{,2}(时间|日期)",entity_left2):
                     entity.label = 3
                     label_prob = 0.8
+                elif re.search("(竞价|报价).?(时间|日期)",entity_left2):
+                    entity.label = 12
+                    label_prob = 0.51
+                elif re.search("(竞价|报价).?(时间|日期)",entity_left3) and re.search("参与|报价|有意",entity_left2):
+                    entity.label = 12
+                    label_prob = 0.501
 
 
             if re.search("至|到|[日\d][-—]$|[~~]", entity_left):

+ 26 - 15
BiddingKG/dl/interface/predictor.py

@@ -3156,10 +3156,12 @@ class ProductAttributesPredictor():
                             _budget = col1_l[i]
                             re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
                             if re_price:
-                                _budget = re_price[0]
-                                if '万元' in col0_l[i] and '万' not in _budget:
-                                    _budget += '万元'
-                                budget = str(getUnifyMoney(_budget))
+                                # _budget = re_price[0]
+                                # if '万元' in col0_l[i] and '万' not in _budget:
+                                #     _budget += '万元'
+                                # budget = str(getUnifyMoney(_budget))
+                                _budget, _money_unit = money_process(_budget, col0_l[i])
+                                budget = str(_budget)
                                 if '.' in budget:
                                     budget = budget.rstrip('0').rstrip('.')
                                 if float(budget)>= 500*100000000:
@@ -3702,13 +3704,17 @@ class ProductAttributesPredictor():
                                         _unitPrice = deal_list[id3]
                                         re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
                                         if re_price:
-                                            _unitPrice = re_price[0]
-                                            if '万元' in header_list[3] and '万' not in _unitPrice:
-                                                _unitPrice += '万元'
-                                            unitPrice = getUnifyMoney(_unitPrice)
-                                            if unitPrice>=10000*10000:
-                                                unitPrice = ""
-                                            unitPrice = str(unitPrice)
+                                            # _unitPrice = re_price[0]
+                                            # if '万元' in header_list[3] and '万' not in _unitPrice:
+                                            #     _unitPrice += '万元'
+                                            # unitPrice = getUnifyMoney(_unitPrice)
+                                            # if unitPrice>=10000*10000:
+                                            #     unitPrice = ""
+                                            # unitPrice = str(unitPrice)
+                                            _unitPrice, _money_unit = money_process(_unitPrice, header_list[3])
+                                            if _unitPrice >= 10000 * 10000:
+                                                _unitPrice = ""
+                                            unitPrice = str(_unitPrice)
                                             if '.' in unitPrice:
                                                 unitPrice = unitPrice.rstrip('0').rstrip('.')
                                 if id4 != "":
@@ -3735,10 +3741,12 @@ class ProductAttributesPredictor():
                                         _budget = deal_list[id7]
                                         re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget)
                                         if re_price:
-                                            _budget = re_price[0]
-                                            if '万元' in header_list2[2] and '万' not in _budget:
-                                                _budget += '万元'
-                                            budget = str(getUnifyMoney(_budget))
+                                            # _budget = re_price[0]
+                                            # if '万元' in header_list2[2] and '万' not in _budget:
+                                            #     _budget += '万元'
+                                            # budget = str(getUnifyMoney(_budget))
+                                            _budget, _money_unit = money_process(_budget, header_list2[2])
+                                            budget = str(_budget)
                                             if '.' in budget:
                                                 budget = budget.rstrip('0').rstrip('.')
                                             if float(budget)>= 100000*10000:
@@ -3873,10 +3881,13 @@ class ProductAttributesPredictor():
 
 
     def add_product_attrs(self,channel_dic, product_attrs,  list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time):
+        # print(1,product_attrs[1]['demand_info']['data'])
         if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
             product_attrs = self.predict_without_table(product_attrs, list_sentences,list_entitys,codeName,prem,text,page_time)
+        # print(2,product_attrs[1]['demand_info']['data'])
         if len(product_attrs[0]['product_attrs']['data']) == 0:
             product_attrs = self.predict_by_text(product_attrs,text,list_outlines,product_list,page_time)
+        # print(3,product_attrs[1]['demand_info']['data'])
         if len(product_attrs[1]['demand_info']['data'])>0:
             for d in product_attrs[1]['demand_info']['data']:
                 for product in set(prem[0]['product']):