فهرست منبع

服务期限规则优化

znj 2 ماه پیش
والد
کامیت
a387e37667
3 فایلهای تغییر یافته به همراه 24 افزوده شده و 12 حذف شده
  1. 1 1
      BiddingKG/dl/channel/channel_bert.py
  2. 18 7
      BiddingKG/dl/interface/getAttributes.py
  3. 5 4
      BiddingKG/dl/interface/predictor.py

+ 1 - 1
BiddingKG/dl/channel/channel_bert.py

@@ -577,7 +577,7 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
     if doctype=='采招数据' and docchannel in compare_type:
         if not re.search("单一来源",title) and not re.search("单一来源",text[:100]):
             pred = channel_predict(title, text)
-            # print('pred_res', pred)
+            # print(text, '\n pred_res', pred)
             if pred is not None and original_docchannel: # 无original_docchannel时不进行对比校正
                 channel_dic = merge_rule(title,text,docchannel,pred,channel_dic,original_docchannel)
 

+ 18 - 7
BiddingKG/dl/interface/getAttributes.py

@@ -1303,12 +1303,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                    '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' \
                    '[2-9]\d{6,7})'
     re_tenderee_phone = re.compile(
-        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
+        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
         # 电话号码
         + phone_pattern)
     # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
     re_tenderee_phone2 = re.compile(
-        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
+        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
         # 电话号码
         + phone_pattern)
     re_agent_phone = re.compile(
@@ -1586,6 +1586,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         # 去重结果
         relation_list = list(set(relation_list))
     # print([(rel[0].entity_text,rel[2].entity_text) for rel in relation_list])
+    # relation_list = [] # 放弃原来的模型连接,结果不好控制
     right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
     linked_company = set()
     linked_person = set()
@@ -1604,6 +1605,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
                             tokens_num_dict[_subject.sentence_index] + _subject.end_index)
                     if predicate=="rel_person":
+                        # print(predicate, _subject.entity_text, _object.entity_text)
                         if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
                             continue
                         # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
@@ -1611,6 +1613,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             continue
                         # 角色为招标/代理人,排除"纪检|监察"相关的联系人
                         if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
+                        # if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[_subject.end_index:_object.wordOffset_begin]):
                             continue
                         if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
                             if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
@@ -4086,7 +4089,8 @@ def extract_serviceTime(service_time,page_time):
                 if service_days <= 1 and service_days > 4000:
                     service_days = 0
 
-                if service_days>3:
+                # if service_days>3:
+                if service_days>0:
                     # service_days = str(service_days) + "天"
                     serviceTime_dict['service_days'] = service_days
                     break
@@ -4153,7 +4157,6 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
         list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
         error_serviceTime = []
         for list_time in [list_serviceTime,list_serviceTime_inAtt]:
-            # if not dict_other["serviceTime"]:
             if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                 list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
                 for _serviceTime in list_time:
@@ -4171,7 +4174,6 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                                 break
                             else:
                                 error_serviceTime.append(_serviceTime.entity_text)
-                # if not dict_other["serviceTime"]:
                 if not serviceTime_dict['service_end']:
                     for _serviceTime in list_time:
                         # 优先取具体时间(20XX年x月-20XX年x月)
@@ -4181,7 +4183,6 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                             if extract_time['service_end']:
                                 serviceTime_dict = extract_time
                                 break
-                # if not dict_other["serviceTime"]:
                 if not serviceTime_dict['service_end']:
                     for _serviceTime in list_time:
                         # 优先取具体时间(20XX年x月x日)
@@ -4192,7 +4193,16 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                                 if extract_time['service_end']:
                                     serviceTime_dict = extract_time
                                     break
-                # if not dict_other["serviceTime"]:
+                if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
+                    for _serviceTime in list_time:
+                        if _serviceTime.entity_text not in error_serviceTime:
+                            # dict_other["serviceTime"] = _serviceTime.entity_text
+                            extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
+                            # service_days > 3
+                            if extract_time['service_end'] or extract_time['service_days']>3:
+                                serviceTime_dict = extract_time
+                                break
+                # 若上一步仍无结果,取消service_days > 3 的条件
                 if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                     for _serviceTime in list_time:
                         if _serviceTime.entity_text not in error_serviceTime:
@@ -4201,6 +4211,7 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                             if extract_time['service_end'] or extract_time['service_days']:
                                 serviceTime_dict = extract_time
                                 break
+
     if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
         service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
         serviceTime_dict['service_days'] = service_days

+ 5 - 4
BiddingKG/dl/interface/predictor.py

@@ -1501,7 +1501,7 @@ class RoleRulePredictor():
         self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
 
-        self.condadate_left = "(?P<candidate_left>(((中[标选商]|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为]+$)"
+        self.candidate_left = "(?P<candidate_left>(((中[标选商]|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为]+$)"
 
         self.pattern_left = [
             self.pattern_tenderee_left_60,
@@ -1799,7 +1799,7 @@ class RoleRulePredictor():
                                     p_entity.values[_label] = _prob + p_entity.values[int(_label)] / 10
                                     # log('正则召回实体: %s, %s, %d, %.4f, %s'%(kw, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], before+"  "+after))
                                     break
-                                if re.search(self.condadate_left, before) and re.search('尊敬的|各', before[-10:])==None:
+                                if re.search(self.candidate_left, before) and re.search('尊敬的|各', before[-10:])==None:
                                     candidates.append(p_entity)
                                 elif channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告'] and re.search(':$', before) and re.search('^[,。]', after) and re.search('候选人', before): # 补充 577756336 候选人,三期A160、A166地块:中国建设银行成都第九支行,
                                     candidates.append(p_entity)
@@ -1860,7 +1860,7 @@ class RoleRulePredictor():
                                     #     p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
                                     #     # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                     #     break
-                                    # if _i_span == 0 and  re.search(self.condadate_left, list_spans[_i_span]):
+                                    # if _i_span == 0 and  re.search(self.candidate_left, list_spans[_i_span]):
                                     #     candidates.append(p_entity)
 
                     elif str(p_entity.label) in ['2', '3', '4']:
@@ -4157,7 +4157,7 @@ class DocChannel():
           '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让|废[旧弃]?(物资|设备|资源|金属|钢筋|料)处[置理]',
           '产权交易2': '使用权|租赁权|股权|债权|排污权|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让|废[旧弃]?(物资|设备|资源|金属|钢筋|料)处[置理]',
           # '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|工程|拦标价|控制价|银行|资格选定|资金|公款|存款|存放|现金管理|招募|入围|入库',
-          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|拦标价|控制价|资格选定|资格认定|资金|公款|存款|现金管理|招募|入库|遴选.{,25}(服务|事务所|机构)',
+          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|拦标价|控制价|资格选定|资格认定|资金|公款|存款|现金管理|招募|入库|遴选.{,25}(服务|事务所|机构)',
           # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
           '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)|行政审批结果'
       }
@@ -4853,6 +4853,7 @@ class DocChannel():
       elif result['docchannel']['doctype'] == '采招数据' and origin_dic.get(
               original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价', text):
           result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '')
+          # print(re.findall('产权|转让|受让|招租|出租|承租|竞价', text))
           msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
       elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get(
               original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search(