소스 검색

新增工商实体纠正实体及角色;优化大学数据源唯一招标人;优化最后角色召回

lsm 1 년 전
부모
커밋
3e22c74352
3개의 변경된 파일: 139줄 추가, 49줄 삭제
  1. 117 20
      BiddingKG/dl/entityLink/entityLink.py
  2. 1 1
      BiddingKG/dl/interface/extract.py
  3. 21 28
      BiddingKG/dl/interface/predictor.py

+ 117 - 20
BiddingKG/dl/entityLink/entityLink.py

@@ -88,22 +88,108 @@ def is_short(shorter_cut, longer):
     else:
         return 0
 
+def get_business_data(enterprise_name):
+    '''
+    Look up whether a company name has business-registration data.
+
+    :param enterprise_name: company name, used as the lookup key
+    :return: a ``(found, data)`` tuple.
+        When ENTERPRISE_HUGE is truthy the name is fetched through a
+        POOL_REDIS connector: ``(True, record)`` if the stored value parses
+        as JSON whose ``have_business`` field equals 1, else ``(False, {})``.
+        When ENTERPRISE_HUGE is falsy the result is ``(True, {})`` if the
+        name is in SET_ENTERPRISE, else ``(False, {})``.
+        Any exception during the lookup is printed via traceback and
+        reported as ``(False, {})`` rather than propagated.
+    '''
+    global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
+    if not ENTERPRISE_HUGE:
+        # In-memory mode only knows membership; there is no record to return.
+        return enterprise_name in SET_ENTERPRISE, {}
+
+    if POOL_REDIS is None:
+        init_redis_pool()
+    _db = POOL_REDIS.getConnector()
+    try:
+        try:
+            _v = _db.get(enterprise_name)
+        finally:
+            # Hand the connector back even when get() raises; otherwise a
+            # failed lookup never returns its connector to the pool.
+            # NOTE(review): assumes the pool accepts a connector back after a
+            # failed command - confirm against the pool implementation.
+            POOL_REDIS.putConnector(_db)
+
+        if _v is None:
+            return False, {}
+        _v = str(_v, 'utf-8')
+        # Substring pre-check: values without the flag are never JSON-parsed.
+        if 'have_business' not in _v:
+            return False, {}
+        d = json.loads(_v)
+        if d.get('have_business', '') == 1:
+            return True, d
+    except Exception:
+        # Best-effort lookup: report the failure and fall through to "not found".
+        traceback.print_exc()
+    return False, {}
+
+
+def get_role(dic):
+    '''
+    Pick the dominant announcement category from a company's bidding counts.
+
+    :param dic: dict that may carry the counters ``zhao_biao_number``,
+        ``dai_li_number`` and ``zhong_biao_number``
+    :return: ``(label, ratio)`` where label is 0 for ``zhao_biao_number``,
+        1 for ``dai_li_number`` or 2 for ``zhong_biao_number`` (whichever
+        counter wins) and ratio is that counter divided by the sum of all
+        three. Returns ``(5, 0)`` when ``zhao_biao_number`` is absent or the
+        sum is not greater than 100.
+    '''
+    if 'zhao_biao_number' not in dic:
+        return 5, 0
+
+    tenderee_cnt = dic.get('zhao_biao_number', 0)
+    agency_cnt = dic.get('dai_li_number', 0)
+    winner_cnt = dic.get('zhong_biao_number', 0)
+    total = tenderee_cnt + agency_cnt + winner_cnt
+
+    # Statistics are only trusted once more than 100 announcements exist.
+    if not total > 100:
+        return 5, 0
+
+    # Ties resolve in the order 0, then 1, then 2 (all comparisons are >=).
+    if tenderee_cnt >= agency_cnt:
+        if tenderee_cnt >= winner_cnt:
+            label, top = 0, tenderee_cnt
+        else:
+            label, top = 2, winner_cnt
+    elif agency_cnt >= winner_cnt:
+        label, top = 1, agency_cnt
+    else:
+        label, top = 2, winner_cnt
+    return label, top / total
+
+
 def link_entitys(list_entitys,on_value=1):#on_value=0.81
     for list_entity in list_entitys:
         range_entity = []
-        short_entity = []
-        long_entity = []
+        short_entity = []  # 不包含工商数据实体
+        long_entity = []  # 包含工商数据实体
         n = 0
+        bus_dic = {} # 保存已查询包含工商数据实体 属于招标、代理、中标 何种类别及对应概率
+        find_tenderee = False
+        bus_tenderee = []
         for _entity in list_entity:
             if _entity.entity_type in ["org","company"]:
                 range_entity.append(_entity)
-                if len(_entity.entity_text) in [4, 5, 6]:
-                    short_entity.append(_entity)
-                if len(_entity.entity_text)>6:
+                if _entity.entity_text in bus_dic:
+                    have_bus = True
+                else:
+                    have_bus, dic = get_business_data(_entity.entity_text)
+                    lb, prob = get_role(dic)
+                    bus_dic[_entity.entity_text] = (lb, prob)
+                    if lb == 0 and prob > 0.9:
+                        bus_tenderee.append(_entity)
+                if have_bus: # 20231115 改为只判断是否有工商数据,没有就考虑替换
                     long_entity.append(_entity)
+                    lb, prob = bus_dic[_entity.entity_text]
+                    if lb in [0,1] and prob>0.9 and _entity.label in [0, 1] and _entity.values[_entity.label]<0.55: # 如果工商统计概率较高,文中概率较低,换为统计类别,主要为标题及发布人等招标、代理划分不明确情况
+                        if _entity.label != lb:
+                            _entity.label = lb
+                            _entity.values[_entity.label] = 0.55
+                        else:
+                            _entity.values[_entity.label] += 0.05
+                else:
+                    short_entity.append(_entity)
+                if _entity.label == 0:  # 找到招标人
+                    find_tenderee = True
                 n += 1
                 if n > 1000:
                     break
+
+        if find_tenderee == False and len(bus_tenderee)==1 and bus_tenderee[0].label==5:  # 如果整篇都没招标人,工商统计只有一个高概率招标人把它作为招标人
+            bus_tenderee[0].label = 0
+            bus_tenderee[0].values[0] = 0.55
+
         range_entity = range_entity[:1000]
         #替换公司的逻辑有问题,先取消
         # for first_i in range(len(range_entity)):
@@ -126,24 +212,35 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
         #                 if len(_ent.entity_text)>len(_entity.entity_text):
         #                     _entity.entity_text = _ent.entity_text
 
-        if short_entity and long_entity:
+        if short_entity and long_entity:  #
             for first_i in range(len(short_entity)):
                 _entity = short_entity[first_i]
-                if is_enterprise_exist(_entity.entity_text): # 实体表存在的不替换
-                    continue
-                if _entity.label == 0 and re.search('(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校)', _entity.entity_text)==None:
-                    ree_l = []
-                    other_l = []
+                if _entity.label == 0:
                     for second_i in range(len(long_entity)):
                         _ent = long_entity[second_i]
-                        if _ent.label in [0,1,5] and is_short(_entity.entity_text, _ent.entity_text):
-                            if _ent.label in [0 ,1]:
-                                ree_l.append(_ent)
-                            elif _ent.label in [5]:
-                                other_l.append(_ent)
-                    for _ent in ree_l + other_l:
-                        if is_enterprise_exist(_ent.entity_text) or re.search('有限(责任)?公司', _ent.entity_text):
-                            _entity.entity_text = _ent.entity_text
+                        if _ent.label in [0,1,5]:
+                            if len(_entity.entity_text)<len(_ent.entity_text) and is_short(_entity.entity_text, _ent.entity_text):  # 简称顺序包含在工商名称内的替换
+                                _entity.entity_text = _ent.entity_text
+                                lb, prob = bus_dic[_entity.entity_text]
+                                if lb in [0, 1] and prob > 0.9 and _entity.values[
+                                    _entity.label] < 0.55:  # 如果工商统计概率较高,文中概率较低,换为统计类别,主要为标题及发布人等招标、代理划分不明确情况
+                                    if _entity.label != lb:
+                                        _entity.label = lb
+                                        _entity.values[_entity.label] = 0.55
+                                    else:
+                                        _entity.values[_entity.label] += 0.05
+                                break
+                            elif len(_entity.entity_text)>len(_ent.entity_text) and _ent.entity_text in _entity.entity_text:  # 不包含工商数据实体完全包含工商数据实体名称的替换
+                                _entity.entity_text = _ent.entity_text
+                                lb, prob = bus_dic[_entity.entity_text]
+                                if lb in [0, 1] and prob > 0.9 and _entity.values[
+                                    _entity.label] < 0.55:  # 如果工商统计概率较高,文中概率较低,换为统计类别,主要为标题及发布人等招标、代理划分不明确情况
+                                    if _entity.label != lb:
+                                        _entity.label = lb
+                                        _entity.values[_entity.label] = 0.55
+                                    else:
+                                        _entity.values[_entity.label] += 0.05
+                                break
 
         # 2021/12/21 替换通过字典识别到的取长度最大的相似实体
         for _entity in range_entity:
@@ -359,7 +456,7 @@ def is_enterprise_exist(enterprise_name):
                 return False
             else:
                 if _v:
-                    log("redis take %.5f of '%s' exists"%(time.time()-_time,enterprise_name))
+                    # log("redis take %.5f of '%s' exists"%(time.time()-_time,enterprise_name))
                     return True
                 else:
                     return False

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -342,7 +342,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-11-09'}
+    version_date = {'version_date': '2023-11-17'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''

+ 21 - 28
BiddingKG/dl/interface/predictor.py

@@ -1528,10 +1528,16 @@ class RoleRulePredictor():
                                     find_flag = True
                                     break
 
+                                if re.search('(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题):$', _span[0]):
+                                    find_flag = True
+                                    p_entity.values[0] = on_value  # 项目名称里面实体修改为最低概率
+                                    break
+
                                 for _name in name_entitys:
                                     if _name.sentence_index == p_entity.sentence_index and p_entity.wordOffset_begin >=_name.wordOffset_begin and p_entity.wordOffset_end < _name.wordOffset_end:
                                         find_flag = True
                                         p_entity.values[0] = on_value # 项目名称里面实体修改为最低概率
+                                        break
                                         # if p_entity.values[0] > on_value:
                                         #     p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
                                         # else:
@@ -1807,8 +1813,8 @@ class RoleRuleFinalAdd():
             end_tokens.extend(sentence.tokens)
         # text_end = "".join(end_tokens[-30:])
         text_end = "".join(end_tokens)
-        text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d+:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d+:)', '', text_end)[-200:]  # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
-        # print('text_end: ', text_end)
+        text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
+        text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
         sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
         sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
@@ -1877,26 +1883,6 @@ class RoleRuleFinalAdd():
                     if not tenderee_notfound:
                         break
 
-        elif list_codenames[0]['name'] != "":  #把标题包含的公司实体作为招标人
-            # tenderee_notfound = True
-            # ents = []
-            # for ent in list_entitys[0]:
-            #     if ent.entity_type in ['org', 'company']:
-            #         if ent.label == 0:
-            #             tenderee_notfound = False
-            #         elif ent.label == 1:
-            #             agency_notfound = False
-            #         elif ent.label == 5:
-            #             ents.append(ent)
-            if tenderee_notfound == True:
-                # print('list_codenames',list_codenames[0]['name'])
-                for ent in ents:
-                    if ent.entity_text in list_codenames[0]['name']:
-                        ent.label = 0
-                        ent.values[0] = 0.5
-                        tenderee_notfound == False
-                        # log('正则召回标题中包含的实体:%s'%ent.entity_text)
-                        break
 
 # 招标人角色召回规则
 class TendereeRuleRecall():
@@ -2183,14 +2169,15 @@ class RoleGrade():
         self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))"
         self.tenderee_center_8 = "(?P<tenderee_center_8>受.{5,20}委托)"
         self.tenderee_left_8 = "(?P<tenderee_left_8>(尊敬的供应商|(需求|最终|发包|征集|甲|转让|出租|处置)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
-        self.tenderee_left_6 = "(?P<tenderee_left_6>(发布|业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方|发布机构)"
+        self.tenderee_left_6 = "(?P<tenderee_left_6>(业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方)"
+        self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
         self.agency_left_9 = "(?P<agency_left_9>代理)"
         self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]:1|名次:1)"
         self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方))"
         self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
         self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
         self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
-        self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.agency_left_9, self.winTenderer_left_9,
+        self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9, self.winTenderer_left_9,
                              self.winTenderer_left_8,self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9]
     def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
         '''
@@ -3780,7 +3767,7 @@ class DocChannel():
           '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
           '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
           '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
-          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标',
+          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务',
           # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
           '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
       }
@@ -3817,7 +3804,7 @@ class DocChannel():
           '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
           '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果',
-          '验收合同': '验收公告|验收单公示|验收结果公告|验收报告公示|验收意见报告|履约公告|履约结果公告'
+          '验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)'
       }
 
   def load_life(self,life_model,config):
@@ -4426,8 +4413,12 @@ class DocChannel():
           if doc_type =="":
               type_id, type_prob = type_model_predict()
               type_model = self.id2type[type_id]
-              result['docchannel']['doctype'] = type_model
-              msc += type_model + ' 概率:%.4f;'%type_prob
+              if type_model == '新闻资讯' and doc_life!='': # 修复bug 78584245 "docchannel": "合同公告", "doctype": "新闻资讯",
+                  result['docchannel']['doctype'] = '采招数据'
+                  msc += '模型结果为新闻资讯,生命周期不为空,改为采招数据;'
+              else:
+                  result['docchannel']['doctype'] = type_model
+                  msc += type_model + ' 概率:%.4f;'%type_prob
               # print('公告类别:', self.id2type[id], '概率:',prob)
               # if id == 0:
           if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
@@ -6098,6 +6089,8 @@ class WebsourceTenderee():
                         find_tenderee = True
                         if d['role_text'] == "":
                             d['role_text'] = web_ree
+                        elif re.search('大学$', web_ree) and re.search('学院$', d['role_text']) and web_ree not in d['role_text']:
+                            d['role_text'] = web_ree
                         # elif re.search(p, web_ree) and (re.search(p, d['role_text'])==None and len(d['role_text'])<6): # 数据源唯一招标人以医院等结尾,角色中无相关关键词的,替换为数据源招标人
                         #     d['role_text'] = web_ree
                         # elif re.search('有限(责任)?公司', web_ree) and (re.search('有限(责任)?公司', d['role_text'])==None and len(d['role_text'])<6):