Browse Source

Merge remote-tracking branch 'origin/master'

luojiehua 5 tháng trước
mục cha
commit
da7a6718d8

+ 59 - 9
BiddingKG/dl/channel/channel_bert.py

@@ -339,6 +339,9 @@ def text_process(text):
     # text = re.sub("\s+", "", text)
     text = re.sub("\s+", " ", text)
 
+    # 优化部分未识别表达
+    text = re.sub("中止", "终止", text)
+
     return text
 
 label2class_dict = {
@@ -445,8 +448,16 @@ def channel_predict(title,text):
     with torch.no_grad():
         outputs = model(None, text)
         predic = torch.max(outputs.data, 1)[1].cpu().numpy()
-        pred_label = predic[0]
-        pred_class = label2class_dict[pred_label]
+        pred_prob = torch.max(outputs.data, 1)[0].cpu().numpy()
+        # print('pred_prob',pred_prob)
+        if pred_prob>0.5:
+            pred_label = predic[0]
+            pred_class = label2class_dict[pred_label]
+        else:
+            return
+    # check
+    if pred_class==101 and re.search("((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示",title): # 纠正部分‘资审结果’模型错误识别为中标
+        pred_class = 105
 
     return pred_class
 
@@ -478,6 +489,37 @@ class_dict = {51: '公告变更',
               }
 
 def merge_channel(list_articles,channel_dic,original_docchannel):
+
+    def merge_rule(title,text,docchannel,pred_channel,channel_dic):  # Reconcile the model-predicted channel with the rule-derived docchannel; updates and returns channel_dic.
+        front_text_len = len(text)//3 if len(text)>300 else 100  # leading window: first third of the text when it is longer than 300 chars, otherwise the first 100 chars
+        front_text = text[:front_text_len]
+        pred_channel = class_dict[pred_channel]  # look the prediction up in class_dict (raises KeyError if the key is absent); compared against docchannel names below
+        if pred_channel == docchannel:  # prediction agrees with the current docchannel: keep it and clear the fallback flag
+            channel_dic['docchannel']['use_original_docchannel'] = 0
+        else:
+            if pred_channel in ['采购意向','招标预告'] and docchannel in ['采购意向','招标预告']:  # both labels fall in the same two-member group
+                merge_res = '采购意向' if re.search("意向|意愿",title) or re.search("意向|意愿",front_text) else "招标预告"  # keyword hit in the title or leading window selects '采购意向', otherwise '招标预告'
+                channel_dic['docchannel']['docchannel'] = merge_res
+                channel_dic['docchannel']['use_original_docchannel'] = 0
+            elif pred_channel in ['公告变更','招标答疑'] and docchannel in ['公告变更','招标答疑']:  # both labels in this group: the existing docchannel wins
+                channel_dic['docchannel']['docchannel'] = docchannel
+                channel_dic['docchannel']['use_original_docchannel'] = 0
+            elif pred_channel=='公告变更' and docchannel in ['中标信息','废标公告','候选人公示','合同公告']: # a change to an award-type notice is still treated as an award-type notice
+                channel_dic['docchannel']['docchannel'] = docchannel
+                channel_dic['docchannel']['use_original_docchannel'] = 0
+            elif docchannel=='公告变更' and pred_channel in ['中标信息','废标公告','候选人公示','合同公告']:  # mirrored case: the predicted award-type label replaces the '公告变更' docchannel
+                channel_dic['docchannel']['docchannel'] = pred_channel
+                channel_dic['docchannel']['use_original_docchannel'] = 0
+            # no reconciliation rule matched: fall through to the original-channel fallback below
+            else:
+                channel_dic = {'docchannel': {'doctype': '采招数据',  # replace the whole dict; doctype is fixed to '采招数据'
+                                              'docchannel': class_dict.get(original_docchannel, '原始类别'),  # original_docchannel comes from the enclosing merge_channel scope
+                                              'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}  # same lookup, with '原始类别' as the default when the code is not in class_dict
+                channel_dic['docchannel']['use_original_docchannel'] = 1  # flag that the original channel was used
+
+        return channel_dic
+
+
     article = list_articles[0]
     title = article.title
     text = article.content
@@ -493,13 +535,21 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
             pred = channel_predict(title, text)
             # print('pred_res', pred)
             if pred is not None and original_docchannel: # 无original_docchannel时不进行对比校正
-                if class_dict[pred] == docchannel:
-                    channel_dic['docchannel']['use_original_docchannel'] = 0
-                else:
-                    channel_dic = {'docchannel': {'docchannel': '采招数据',
-                                                  'doctype': class_dict.get(original_docchannel, '原始类别'),
-                                                  'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
-                    channel_dic['docchannel']['use_original_docchannel'] = 1
+                # if class_dict[pred] == docchannel:
+                #     channel_dic['docchannel']['use_original_docchannel'] = 0
+                # else:
+                #     channel_dic = {'docchannel': {'docchannel': '采招数据',
+                #                                   'doctype': class_dict.get(original_docchannel, '原始类别'),
+                #                                   'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
+                #     channel_dic['docchannel']['use_original_docchannel'] = 1
+
+                channel_dic = merge_rule(title,text,docchannel,pred,channel_dic)
+    elif doctype=='采招数据' and docchannel=="":
+        pred = channel_predict(title, text)
+        if pred is not None:
+            pred = class_dict[pred]
+            channel_dic['docchannel']['docchannel'] = pred
+            channel_dic['docchannel']['use_original_docchannel'] = 0
 
     return channel_dic
 

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -474,7 +474,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-12-12'}
+    version_date = {'version_date': '2024-12-18'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 1 - 1
BiddingKG/dl/interface/outline_extractor.py

@@ -60,7 +60,7 @@ requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|
 aptitude_pattern = "((资格|资质)[的及]?(要求|条件)|竞买资格及要求|供应商报价须知)([::,]|$)|(竞买|竞买人|竞投人)?资格(条件)?:|按以下要求参与竞买|(报名|竞买)(条件|资格)"
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
-pinmu_name_pattern = "采购品目名称([::,]|$)"
+pinmu_name_pattern = "采购品目(名称)?([::,]|$)"
 out_lines = []
 policy_pattern = "《.+?(通知|办法|条例|规定|规程|规范|须知|规则|标准|细则|意见|协议|条件|要求|手册|法典|方案|指南|指引|法)》"
 not_policy_pattern = "(表|函|书|证|\d页|公告|合同|文件|清单)》$|采购合同|响应方须知|响应文件格式|营业执照|开标一览|采购需求"

+ 8 - 3
BiddingKG/dl/interface/predictor.py

@@ -1474,7 +1474,7 @@ class RoleRulePredictor():
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
                "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$|竞争性选择申请人名称:$" \
                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$" \
-               "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$)"  # 承办单位:不作为中标 83914772  |施工 单位不作为中标人 例:386692187
+               "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$|第\d+(包件?|标段?)(中标|中选|成交)候选人:$)"  # 承办单位:不作为中标 83914772  |施工 单位不作为中标人 例:386692187
         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
                                            "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
                                            "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$|第[一1]名,?投标(人|单位|银行|公司):$)"  # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
@@ -4128,14 +4128,14 @@ class DocChannel():
           '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
           '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
           '产权交易': '(产权|资产|权证)的?(类型|类别|用途|性质|状态|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)|看[样货](时间|地[点址]|方式|仓库|验货)|最小加价|加价[幅梯]度|交易模式[::\s]*延时竞价销售|挂牌(开始|结束)时间',
-          '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;|采购需求清单|最低价排序|竞争性采购方式|采购进行公开竞价|竞价模式[::\s]*一次报价|预算金额'  # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
+          '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;|采购需求清单|最低价排序|竞争性采购方式|采购进行公开竞价|竞价模式[::\s]*一次报价|预算金额|代理银行资格选定'  # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
       }
 
       self.title_type_dic = {
           '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
           '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
           '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
-          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|工程|拦标价|控制价',
+          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|工程|拦标价|控制价|银行|资格选定|资金|公款|存款|存放|现金管理|招募|入围|入库',
           # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
           '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)|行政审批结果'
       }
@@ -4768,6 +4768,11 @@ class DocChannel():
           prem_json) == False and re.search(self.title_life_dic['中标信息'], title) == None:
           result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
           msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
+      elif result['docchannel']['docchannel'] in ['中标信息'] and is_contain_winner(prem_json) == False \
+              and re.search("监督(抽查|检查)结果", title):
+          result['docchannel']['doctype'] = "新闻资讯"
+          result['docchannel']['docchannel'] = ""
+          msc += '最终规则修改:中标公告无中标人且包含新闻资讯关键词,返回新闻资讯类型'
       elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
               self.title_life_dic['废标公告'], title) == None:
           result['docchannel']['docchannel'] = '中标信息'