浏览代码

分类对比校验方法

znj 5 月之前
父节点
当前提交
b052692f43
共有 2 个文件被更改,包括 57 次插入和 23 次删除
  1. +52 -18
      BiddingKG/dl/channel/channel_bert.py
  2. +5 -5
      BiddingKG/dl/interface/predictor.py

+ 52 - 18
BiddingKG/dl/channel/channel_bert.py

@@ -341,6 +341,7 @@ def text_process(text):
 
     # 优化部分未识别表达
     text = re.sub("中止", "终止", text)
+    text = re.sub("遴选", "招标", text)
 
     return text
 
@@ -416,12 +417,18 @@ def channel_predict(title,text):
     # process text
     if title in text:
         text = text.replace(title, '', 1)
-    text = text_process(text)
+    if "##attachment##" in text:
+        main_text,attachment_text = text.split("##attachment##",maxsplit=1)
+        # print('main_text',main_text)
+        if len(main_text)>=500: # 正文有足够的内容时不需要使用附件预测
+            text = main_text
     text = re.sub("##attachment##。?","",text)
+    text = text_process(text)
+
     if len(text)<=100:
         # 正文内容过短时,不预测
         return
-    elif len(text)<=200:
+    elif len(text)<=150:
         # 正文内容过短时,重复正文
         text = text * 2
     text = text[:2000]
@@ -429,6 +436,7 @@ def channel_predict(title,text):
     title = title[:100]
     text = "公告标题:" + title + "。" + "公告内容:" + text
     text = text[:2000]
+    # print('predict text:',text)
 
     # to torch data
     text = [text]
@@ -455,9 +463,15 @@ def channel_predict(title,text):
             pred_class = label2class_dict[pred_label]
         else:
             return
-    # check
+    # print('check rule before',pred_class)
+    # check rule
     if pred_class==101 and re.search("((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示",title): # 纠正部分‘资审结果’模型错误识别为中标
         pred_class = 105
+    elif pred_class==122 and re.search("验收服务",title):
+        pred_class = None
+    # elif pred_class==118 and re.search("重新招标",title): #重新招标类公告,因之前公告的废标原因而错识别为废标公告
+    #     pred_class = 52
+
 
     return pred_class
 
@@ -488,9 +502,12 @@ class_dict = {51: '公告变更',
        122: '验收合同'
               }
 
+tenderee_type = ['公告变更','招标公告','招标预告','招标答疑','资审结果','采购意向']
+win_type = ['中标信息','废标公告','候选人公示','合同公告','开标记录','验收合同']
+
 def merge_channel(list_articles,channel_dic,original_docchannel):
 
-    def merge_rule(title,text,docchannel,pred_channel,channel_dic):
+    def merge_rule(title,text,docchannel,pred_channel,channel_dic,original_docchannel):
         front_text_len = len(text)//3 if len(text)>300 else 100
         front_text = text[:front_text_len]
         pred_channel = class_dict[pred_channel]
@@ -512,10 +529,18 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
                 channel_dic['docchannel']['use_original_docchannel'] = 0
 
             else:
-                channel_dic = {'docchannel': {'doctype': '采招数据',
-                                              'docchannel': class_dict.get(original_docchannel, '原始类别'),
-                                              'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
-                channel_dic['docchannel']['use_original_docchannel'] = 1
+                original_type = class_dict.get(original_docchannel, '原始类别')
+                if pred_channel in tenderee_type and docchannel in tenderee_type and original_type not in tenderee_type:
+                    # pred_channel和docchannel都是同一(招标/中标)类型时,original_docchannel不一致时不使用原网类型
+                    channel_dic['docchannel']['use_original_docchannel'] = 0
+                elif pred_channel in win_type and docchannel in win_type and original_type not in win_type:
+                    # pred_channel和docchannel都是同一(招标/中标)类型时,original_docchannel不一致时不使用原网类型
+                    channel_dic['docchannel']['use_original_docchannel'] = 0
+                else:
+                    channel_dic = {'docchannel': {'doctype': '采招数据',
+                                                  'docchannel': original_type,
+                                                  'life_docchannel': original_type}}
+                    channel_dic['docchannel']['use_original_docchannel'] = 1
 
         return channel_dic
 
@@ -526,7 +551,7 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
 
     doctype = channel_dic['docchannel']['doctype']
     docchannel = channel_dic['docchannel']['docchannel']
-    # print('doctype',doctype,'docchannel',docchannel)
+    # print('doctype',doctype,'docchannel',docchannel,'original_docchannel',original_docchannel)
     compare_type = ['公告变更','招标公告','中标信息','招标预告','招标答疑','资审结果','采购意向','废标公告','候选人公示',
                       '合同公告','开标记录','验收合同']
     # 仅比较部分数据
@@ -535,15 +560,8 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
             pred = channel_predict(title, text)
             # print('pred_res', pred)
             if pred is not None and original_docchannel: # 无original_docchannel时不进行对比校正
-                # if class_dict[pred] == docchannel:
-                #     channel_dic['docchannel']['use_original_docchannel'] = 0
-                # else:
-                #     channel_dic = {'docchannel': {'docchannel': '采招数据',
-                #                                   'doctype': class_dict.get(original_docchannel, '原始类别'),
-                #                                   'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
-                #     channel_dic['docchannel']['use_original_docchannel'] = 1
-
-                channel_dic = merge_rule(title,text,docchannel,pred,channel_dic)
+                channel_dic = merge_rule(title,text,docchannel,pred,channel_dic,original_docchannel)
+
     elif doctype=='采招数据' and docchannel=="":
         pred = channel_predict(title, text)
         if pred is not None:
@@ -551,6 +569,22 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
             channel_dic['docchannel']['docchannel'] = pred
             channel_dic['docchannel']['use_original_docchannel'] = 0
 
+    # '招标预告'类 规则纠正
+    if channel_dic['docchannel']['doctype']=='采招数据' and channel_dic['docchannel']['docchannel']=="招标公告":
+        if "##attachment##" in text:
+            main_text, attachment_text = text.split("##attachment##", maxsplit=1)
+        else:
+            main_text = text
+        main_text = text_process(main_text)
+        if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text[:len(main_text)//2]):
+            front_text_len = len(main_text) // 3 if len(main_text) > 300 else 100
+            front_text = main_text[:front_text_len]
+            if re.search("意向|意愿",title) or re.search("意向|意愿",front_text):
+                channel_dic['docchannel']['docchannel'] = "采购意向"
+            else:
+                channel_dic['docchannel']['docchannel'] = "招标预告"
+            channel_dic['docchannel']['use_original_docchannel'] = 0
+
     return channel_dic
 
 

+ 5 - 5
BiddingKG/dl/interface/predictor.py

@@ -3251,7 +3251,7 @@ class ProductAttributesPredictor():
                 elif re.search('预算|控制金额', items[j]) and not re.search('预算单位',items[j]):
                     header_dic['预算'] = j
                     budget = items[j]
-                elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
+                elif re.search('时间|采购实施月份|采购月份|采购日期|预计(招标|采购|发标|发包)(时间|月份)', items[j]):
                     header_dic['时间'] = j
                     order_time = items[j]
                 elif re.search('总价|(成交|中标|验收|合同|预算|控制|总|合计))?([金总]额|价格?)|最高限价|价格|金额', items[j]) and re.search('数量|规格|型号|品牌|供应商', items[j])==None:
@@ -3368,7 +3368,7 @@ class ProductAttributesPredictor():
                             tenderee = re.sub("\s","",col1_l[i])
                             if len(tenderee) > 20:
                                 tenderee = ""
-                        elif re.search('采购时间|采购实施月份|采购月份|采购日期|预计招标时间', col0_l[i]):
+                        elif re.search('采购时间|采购实施月份|采购月份|采购日期|预计(招标|采购|发标|发包)(时间|月份)', col0_l[i]):
                             header_list2.append(col0_l[i])
                             order_time = col1_l[i].strip()
                             order_begin, order_end = self.fix_time(order_time, html, page_time)
@@ -3740,7 +3740,7 @@ class ProductAttributesPredictor():
             list_sentence = list_sentences[0]
             list_entity = list_entitys[0]
             _data = product_attrs[1]['demand_info']['data']
-            re_bidding_time = re.compile("(采购|采购实施|预计招标)(时间|月份|日期)[::,].{0,2}$")
+            re_bidding_time = re.compile("(采购|采购实施|预计(招标|采购|发标|发包))(时间|月份|日期)[::,].{0,2}$")
             order_times = []
             for entity in list_entity:
                 if entity.entity_type=='time':
@@ -3961,7 +3961,7 @@ class ProductAttributesPredictor():
                                             if float(budget)>= 100000*10000:
                                                 budget = ""
                                 if id8 != "":
-                                    if re.search('\w', deal_list[id8]) and re.search("(采购|采购实施|预计招标)(时间|月份|日期)",header_list2[3]):
+                                    if re.search('\w', deal_list[id8]) and re.search("(采购|采购实施|预计(招标|采购|发标|发包))(时间|月份|日期)",header_list2[3]):
                                         order_time = deal_list[id8].strip()
                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
                                 if id9 != "":
@@ -4769,7 +4769,7 @@ class DocChannel():
           result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
           msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
       elif result['docchannel']['docchannel'] in ['中标信息'] and is_contain_winner(prem_json) == False \
-              and re.search("监督(抽查|检查)结果", title):
+              and re.search("监督(抽查|检查)结果|抽查结果", title):
           result['docchannel']['doctype'] = "新闻资讯"
           result['docchannel']['docchannel'] = ""
           msc += '最终规则修改:中标公告无中标人且包含新闻资讯关键词,返回新闻资讯类型'