
Merge remote-tracking branch 'origin/master'

lsm, 3 months ago
parent commit 9aa6785256

+ 1 - 1
BiddingKG/dl/channel/channel_bert.py

@@ -596,7 +596,7 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
             main_text = text
         main_text = text_process(main_text)
         # if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text[:max(500,len(main_text)//2)]):
-        if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text):
+        if re.search("采购实施月份|采购月份|(计划|预计|预期)(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text):
             front_text_len = len(main_text) // 3 if len(main_text) > 300 else 100
             front_text = main_text[:front_text_len]
             if re.search("意向|意愿",title) or re.search("意向|意愿",front_text):

+ 16 - 10
BiddingKG/dl/interface/extract.py

File diff suppressed because it is too large

+ 8 - 8
BiddingKG/dl/interface/getAttributes.py

@@ -2043,7 +2043,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if entity.label in [2, 3, 4] and distance>=20:
                                     break
                                 # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
-                                if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|(采购|招标)人?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                                     break
                                 # 角色为招标/代理人,排除"纪检|监察"相关的联系人
                                 if entity.label in [0,1] and re.search("纪检|监察",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
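The alternative "(采购|招标)人?联系" added above widens the check on the 10-character window before the person entity, so contacts introduced as the purchaser's or tenderer's contact are no longer linked to winner-candidate roles (labels 2/3/4). Illustrative check on an invented window:

import re

window = "采购人联系人:"   # text right before the person entity (made-up example)
old_pat = "纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系"
new_pat = "纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|(采购|招标)人?联系"

print(bool(re.search(old_pat, window)))  # False -> the person could be linked to the candidate
print(bool(re.search(new_pat, window)))  # True  -> the loop now breaks before linking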
@@ -2953,7 +2953,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                         if t_person.person_phone:
                                             _phone = [p.entity_text for p in t_person.person_phone]
                                             for _p in _phone:
-                                                if t_person.entity_text not in exist_person and _p not in exist_phone:
+                                                if t_person.entity_text not in exist_person and _p not in ",".join(exist_phone):
                                                     tenderee_agency_role[0].linklist.append((t_person.entity_text, _p))
                                                     get_contacts = True
                                             break
@@ -2963,7 +2963,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if not get_contacts:
                                     sentence_phone = phone.findall(outline.outline_text)
                                     if sentence_phone:
-                                        if sentence_phone[0] not in exist_phone:
+                                        if sentence_phone[0] not in ",".join(exist_phone):
                                             tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
                                             get_contacts = True
                                             break
@@ -2974,14 +2974,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if _entity.person_phone:
                                     _phone = [p.entity_text for p in _entity.person_phone]
                                     for _p in _phone:
-                                        if _entity.entity_text not in exist_person and _p not in exist_phone:
+                                        if _entity.entity_text not in exist_person and _p not in ",".join(exist_phone):
                                             tenderee_agency_role[0].linklist.append((_entity.entity_text, _p))
                                             get_contacts = True
                                     break
                     if not get_contacts:
                         # 如果文中只有一个“phone”实体,则直接取为联系人电话
                         if len(phone_entitys) == 1:
-                            if phone_entitys[0].entity_text not in exist_phone:
+                            if phone_entitys[0].entity_text not in ",".join(exist_phone):
                                 tenderee_agency_role[0].linklist.append(("", phone_entitys[0].entity_text))
                                 get_contacts = True
                     if not get_contacts:
@@ -2993,7 +2993,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
                                     sentence_phone = phone.findall(temp_sentence)
                                     if sentence_phone:
-                                        if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in exist_phone:
+                                        if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in ",".join(exist_phone):
                                             tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
                                             get_contacts = True
                                             break
@@ -3008,11 +3008,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         for _pattern in contact_pattern_list:
                             get_tenderee_contacts = False
                             for regular_match in re.finditer(_pattern, _content):
-                                match_text = _content[regular_match.end():regular_match.end() + 40]
+                                match_text = _content[regular_match.end():regular_match.end() + 50]
                                 match_text = match_text.split("。")[0]
                                 sentence_phone = phone.findall(match_text)
                                 if sentence_phone:
-                                    if sentence_phone[0] not in exist_phone:
+                                    if sentence_phone[0] not in ",".join(exist_phone):
                                         tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
                                         get_tenderee_contacts = True
                                         break
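The recurring change in the hunks above replaces the exact list-membership test against exist_phone with a substring test against the comma-joined string, so a number that already appears inside a recorded phone entry (for example with an area code or extension) is treated as present and not linked a second time. A small illustration with invented values:

exist_phone = ["0571-88888888"]
p = "88888888"

print(p not in exist_phone)            # True  -> old check would append the number again
print(p not in ",".join(exist_phone))  # False -> new check treats it as already linked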

+ 13 - 4
BiddingKG/dl/interface/predictor.py

@@ -4151,14 +4151,14 @@ class DocChannel():
           '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让|废[旧弃]?(物资|设备|资源|金属|钢筋|料)处[置理]',
           '产权交易2': '使用权|租赁权|股权|债权|排污权|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让|废[旧弃]?(物资|设备|资源|金属|钢筋|料)处[置理]',
           # '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|工程|拦标价|控制价|银行|资格选定|资金|公款|存款|存放|现金管理|招募|入围|入库',
-          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|拦标价|控制价|资格选定|资格认定|资金|公款|存款|现金管理|招募|入库',
+          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|拦标价|控制价|资格选定|资格认定|资金|公款|存款|现金管理|招募|入库|遴选.{,25}(服务|事务所|机构)',
           # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
           '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)|行政审批结果'
       }
       self.life_dic = {
           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
           '采购意向neg': '发布政府采购意向|采购意向公告已于',
-          '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
+          '招标预告': '(预计|计划)(招标|采购|发标|发包)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
           '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格(要求|条件)|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)|评选方式:?\s*价格最低',
           '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
           '招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示',  # |异议的回复
@@ -4783,11 +4783,14 @@ class DocChannel():
               return False
 
       tenderee = ""
+      agency = ""
       try:
           for k, v in prem['prem'].items():
               for link in v['roleList']:
                   if link['role_name'] == 'tenderee' and tenderee == "":
                       tenderee = link['role_text']
+                  if link['role_name'] == 'agency' and agency == "":
+                      agency = link['role_text']
       except Exception as e:
           # print('解析prem 获取招标人、代理人出错')
           pass
@@ -4798,6 +4801,9 @@ class DocChannel():
       if tenderee:
           title = title.replace(tenderee, " ")
           text = text.replace(tenderee, " ")
+      if agency:
+          title = title.replace(agency, " ")
+          text = text.replace(agency, " ")
       prem_json = json.dumps(prem, ensure_ascii=False)
       if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
               original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
@@ -4811,7 +4817,10 @@ class DocChannel():
           msc += '最终规则修改:中标公告无中标人且包含新闻资讯关键词,返回新闻资讯类型'
       elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
               self.title_life_dic['废标公告'], title) == None:
-          result['docchannel']['docchannel'] = '中标信息'
+          if re.search(self.title_life_dic['合同公告'], title):
+            result['docchannel']['docchannel'] = '合同公告'
+          else:
+            result['docchannel']['docchannel'] = '中标信息'
           msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
       elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search(
               self.title_life_dic['招标答疑'], title) == None and origin_dic.get(
@@ -8583,7 +8592,7 @@ class EntityTypeRulePredictor():
         self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?(地[点址区]?|区域)[:为]'
         self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|[^\w]所[属在](区域|地区?):|存放地[点址]?[:为]' # 银行所属区域:北京市西城区 不作项目地址
         self.pattern_addr_contact = '(联系|收件人?|邮寄)地[点址区][:为]|行政区:'
-        self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
+        self.pattern_time_planned = '(计划|预计|预期)(招标|采购|发标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
         self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
         self.pattern_addr_dic = {'addr_bidopen': self.pattern_addr_bidopen,
                                  'addr_bidsend': self.pattern_addr_bidsend,
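In the DocChannel rules above, the agency name extracted from prem is now blanked out of the title and body alongside the tenderee before the keyword checks run, and a 废标公告 prediction that actually contains a winner is re-labelled 合同公告 when the title carries contract keywords (otherwise 中标信息, as before). A minimal sketch of the masking step only; the role names and title are invented:

tenderee = "某某医院"
agency = "某某招标代理有限公司"
title = "某某医院设备采购项目(某某招标代理有限公司代理)结果公告"

for name in (tenderee, agency):
    if name:
        title = title.replace(name, " ")
print(title)  # " 设备采购项目( 代理)结果公告"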

BIN
BiddingKG/dl/table_head/model_40_2_0.959.pth


+ 94 - 0
BiddingKG/dl/table_head/models/model_torch.py

@@ -73,6 +73,100 @@ class TableHeadModel(nn.Module):
         cnn3d_x = torch.permute(cnn3d_x, [2, 3, 1, 0])
         cnn3d_x = cnn3d_x.contiguous().view(row, col, char_num * self.char_embed_expand)
 
+        # dnn
+        x = self.dense3(cnn3d_x)
+        x = self.ln_dnn_2(x)
+        x = self.relu(x)
+        x = self.dense4(x)
+        x = self.sigmoid(x)
+        x = torch.squeeze(x, -1)
+        return x
+
+
+class TableHeadModel2(nn.Module):
+    def __init__(self):
+        super(TableHeadModel2, self).__init__()
+        self.char_num = 20
+        self.char_embed = 60
+        self.char_embed_expand = 128
+
+        self.dense0 = nn.Linear(self.char_embed, self.char_embed_expand)
+
+        self.dense3 = nn.Linear(self.char_num * self.char_embed_expand, 64)
+        self.dense4 = nn.Linear(64, 1)
+
+        self.sigmoid = nn.Sigmoid()
+
+        self.ln_dnn_2 = nn.LayerNorm([64])
+
+        self.device = torch.device("cpu")
+
+        self.relu = nn.LeakyReLU()
+        self.dropout = nn.Dropout(0.6)
+
+        # self.cnn1d_0 = nn.Conv1d(self.char_embed_expand,
+        #                          self.char_embed_expand,
+        #                          (3,), padding=self.get_padding(3))
+        # self.cnn1d_1 = nn.Conv1d(self.char_embed_expand,
+        #                          self.char_embed_expand,
+        #                          (3,), padding=self.get_padding(3))
+
+        encoder_layer1 = nn.TransformerEncoderLayer(d_model=self.char_embed_expand, nhead=2,
+                                                    dim_feedforward=128, batch_first=True)
+        self.transformer1 = nn.TransformerEncoder(encoder_layer1, 2)
+        self.ln_encoder_0 = nn.LayerNorm([self.char_embed_expand])
+
+        self.cnn3d_0 = nn.Conv3d(self.char_embed_expand, self.char_embed_expand,
+                                 (3, 3, 3), padding=self.get_padding(3))
+        self.cnn3d_1 = nn.Conv3d(self.char_embed_expand, self.char_embed_expand,
+                                 (3, 3, 3), padding=self.get_padding(3))
+        # self.cnn3d_2 = nn.Conv3d(self.char_embed, self.char_embed,
+        #                          (3, 3, 3), padding=self.get_padding(3))
+
+    def get_padding(self, kernel_size, stride=1):
+        return (kernel_size - 1) // 2 * stride
+
+    def forward(self, x):
+        batch, row, col, char_num, char_embed = x.shape
+
+        # Embedding
+        x = torch.squeeze(x, 0)
+        x = x.view([row*col, char_num, char_embed])
+        x = self.dense0(x)
+
+        # transformer
+        box_attention = self.transformer1(x)
+        box_attention = self.ln_encoder_0(box_attention)
+        box_attention = torch.permute(box_attention, [0, 2, 1])
+        box_attention = box_attention.contiguous().view(row, col, char_num, self.char_embed_expand)
+        box_attention = torch.unsqueeze(box_attention, 0)
+
+        # cnn1d_x = torch.permute(cnn1d_x, [0, 2, 1])
+        # cnn1d_x = self.cnn1d_0(cnn1d_x)
+        # cnn1d_x = self.relu(cnn1d_x)
+        # cnn1d_x = self.dropout(cnn1d_x)
+        # cnn1d_x = self.cnn1d_1(cnn1d_x)
+        # cnn1d_x = self.relu(cnn1d_x)
+        # cnn1d_x = self.dropout(cnn1d_x)
+        #
+        # cnn1d_x = torch.permute(cnn1d_x, [0, 2, 1])
+        # cnn1d_x = cnn1d_x.contiguous().view(row, col, char_num, self.char_embed_expand)
+        # cnn1d_x = torch.unsqueeze(cnn1d_x, 0)
+        # print(cnn1d_x.shape)
+
+        # cnn 3d
+        cnn3d_x = torch.permute(box_attention, [0, 4, 3, 1, 2])
+        cnn3d_x = self.cnn3d_0(cnn3d_x)
+        cnn3d_x = self.relu(cnn3d_x)
+        cnn3d_x = self.dropout(cnn3d_x)
+        cnn3d_x = self.cnn3d_1(cnn3d_x)
+        cnn3d_x = self.relu(cnn3d_x)
+        cnn3d_x = self.dropout(cnn3d_x)
+
+        cnn3d_x = torch.squeeze(cnn3d_x, 0)
+        cnn3d_x = torch.permute(cnn3d_x, [2, 3, 1, 0])
+        cnn3d_x = cnn3d_x.contiguous().view(row, col, char_num * self.char_embed_expand)
+
         # dnn
         x = self.dense3(cnn3d_x)
         x = self.ln_dnn_2(x)
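A minimal shape smoke test for the newly added TableHeadModel2 (the 3x4 table size and random inputs are arbitrary; the import path follows the one commented out in predict_torch.py below):

import torch
from BiddingKG.dl.table_head.models.model_torch import TableHeadModel2

# dummy batch: 1 table, 3 rows x 4 cols, 20 chars per cell, 60-dim char embeddings
dummy = torch.rand(1, 3, 4, 20, 60)

model = TableHeadModel2()
model.eval()                      # disables the 0.6 dropout
with torch.no_grad():
    scores = model(dummy)

print(scores.shape)               # torch.Size([3, 4]) -- one head probability per cell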

+ 4 - 1
BiddingKG/dl/table_head/predict_torch.py

@@ -6,10 +6,12 @@ from torch.utils.data import DataLoader
 
 sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../../../"))
 from BiddingKG.dl.table_head.models.model_torch import TableHeadModel
+# from BiddingKG.dl.table_head.models.model_torch import TableHeadModel2
 from BiddingKG.dl.table_head.pre_process_torch import CustomDatasetTiny40, set_same_table_head, set_label
 
 device = torch.device("cpu")
-model_path = os.path.abspath(os.path.dirname(__file__)) + '/model_40_0.951.pth'
+model_path = os.path.abspath(os.path.dirname(__file__)) + '/model_40_0.959.pth'
+# model_path = os.path.abspath(os.path.dirname(__file__)) + '/model_40_2_0.959.pth'
 batch_size = 1
 
 
@@ -18,6 +20,7 @@ def predict(table_text_list):
         print("="*15, "init table_head model", "="*15)
         # 实例化模型
         model = TableHeadModel()
+        # model = TableHeadModel2()
         model.to(device)
         model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))
         # 将模型设置为评估模式

Some files were not shown in this diff because too many files have changed