Ver Fonte

新增短实体替换逻辑、数据源唯一招标人召回逻辑;

lsm há 1 ano atrás
pai
commit
f72486d83e

+ 53 - 6
BiddingKG/dl/entityLink/entityLink.py

@@ -69,13 +69,41 @@ def get_place_list():
 place_list = get_place_list()
 place_pattern = "|".join(place_list)
 
def is_short(shorter_cut, longer):
    """
    Decide whether ``shorter_cut`` is an abbreviation of ``longer``.

    The pieces of ``shorter_cut`` must all occur in ``longer`` in the same
    order and without overlapping: after a piece is found, the search for the
    next piece resumes right after the end of that match.

    :param shorter_cut: the candidate short name. It is iterated piece by
        piece, so a plain string is matched character by character and a
        list of strings is matched segment by segment.
    :param longer: the candidate full name (a string).
    :return: 1 if every piece was found in order, otherwise 0. An empty
        ``shorter_cut`` yields 1 because there is nothing left to match.
    """
    remaining = longer
    for piece in shorter_cut:
        pos = remaining.find(piece)
        if pos < 0:
            # This piece does not occur after the previous match.
            return 0
        # Keep only the text after the match so the order is enforced.
        remaining = remaining[pos + len(piece):]
    return 1
 
 def link_entitys(list_entitys,on_value=1):#on_value=0.81
     for list_entity in list_entitys:
         range_entity = []
+        short_entity = []
+        long_entity = []
+        n = 0
         for _entity in list_entity:
             if _entity.entity_type in ["org","company"]:
                 range_entity.append(_entity)
+                if len(_entity.entity_text) in [4, 5, 6]:
+                    short_entity.append(_entity)
+                if len(_entity.entity_text)>6:
+                    long_entity.append(_entity)
+                n += 1
+                if n > 1000:
+                    break
         range_entity = range_entity[:1000]
         #替换公司的逻辑有问题,先取消
         # for first_i in range(len(range_entity)):
@@ -90,12 +118,31 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
         #             _entity.linked_entitys.append(_ent)
         #             _ent.linked_entitys.append(_entity)
         #             print("=-===",_entity.entity_text,_ent.entity_text,_score)
-        #替换公司名称
-        for _entity in range_entity:
-            if re.search("公司",_entity.entity_text) is None:
-                for _ent in _entity.linked_entitys:
-                    if re.search("公司$",_ent.entity_text) is not None:
-                        if len(_ent.entity_text)>len(_entity.entity_text):
+        # #替换公司名称
+        # for _entity in range_entity:
+        #     if re.search("公司",_entity.entity_text) is None:
+        #         for _ent in _entity.linked_entitys:
+        #             if re.search("公司$",_ent.entity_text) is not None:
+        #                 if len(_ent.entity_text)>len(_entity.entity_text):
+        #                     _entity.entity_text = _ent.entity_text
+
+        if short_entity and long_entity:
+            for first_i in range(len(short_entity)):
+                _entity = short_entity[first_i]
+                if is_enterprise_exist(_entity.entity_text): # 实体表存在的不替换
+                    continue
+                if _entity.label == 0 and re.search('(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校)', _entity.entity_text)==None:
+                    ree_l = []
+                    other_l = []
+                    for second_i in range(len(long_entity)):
+                        _ent = long_entity[second_i]
+                        if _ent.label in [0,1,5] and is_short(_entity.entity_text, _ent.entity_text):
+                            if _ent.label in [0 ,1]:
+                                ree_l.append(_ent)
+                            elif _ent.label in [5]:
+                                other_l.append(_ent)
+                    for _ent in ree_l + other_l:
+                        if is_enterprise_exist(_ent.entity_text) or re.search('有限(责任)?公司', _ent.entity_text):
                             _entity.entity_text = _ent.entity_text
 
         # 2021/12/21 替换通过字典识别到的取长度最大的相似实体

+ 2 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -24,8 +24,6 @@ from BiddingKG.dl.bidway.re_bidway import extract_bidway,bidway_integrate
 from BiddingKG.dl.fingerprint.documentFingerprint import getFingerprint
 from BiddingKG.dl.entityLink.entityLink import *
 
-
-#
 def tableToText(soup):
     '''
     @param:
@@ -2293,6 +2291,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub(',最高有效报价:', ',投标报价:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
+        if web_source_no.startswith('DX002756-'):
+            article_processed = re.sub('状态:(进行中|已结束)单位', ',项目单位', article_processed)  # 376225646
         ser = re.search('(采购|招标|比选)人(名称)?/(采购|招标|比选)?代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
         if ser:
             article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))

+ 4 - 2
BiddingKG/dl/interface/extract.py

@@ -305,7 +305,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # content = list_articles[0].content
     # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
     channel_dic, msc = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
-    print('msc', msc)
+    # print('msc', msc)
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
@@ -337,10 +337,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # '''限制行业最高金额'''
     # getAttributes.limit_maximum_amount(prem, industry) # 20230703取消,改为整合所有要素后面纠正
 
+    '''根据数据源最后召回招标人角色'''
+    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, prem)
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-11-07'}
+    version_date = {'version_date': '2023-11-09'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''

+ 63 - 2
BiddingKG/dl/interface/predictor.py

@@ -60,6 +60,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "district": {"predictor": None, "Lock": RLock()},
                   'tableprem': {"predictor": None, "Lock": RLock()},
                   'candidate': {"predictor": None, "Lock": RLock()},
+                  'websource_tenderee': {"predictor": None, "Lock": RLock()},
                   }
 
 
@@ -107,6 +108,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = TablePremExtractor()
                 if _type == 'candidate':
                     dict_predictor[_type]["predictor"] = CandidateExtractor()
+                if _type == 'websource_tenderee':
+                    dict_predictor[_type]['predictor'] = WebsourceTenderee()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -1564,7 +1567,7 @@ class RoleRulePredictor():
                                             break
                                         if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
                                                 _name) >= 0:
-                                            if p_entity.entity_text in agency_set: # 在代理人集合的作为代理人
+                                            if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
                                                 find_flag = True
                                                 _label = 1
                                                 p_entity.label = _label
@@ -1575,6 +1578,8 @@ class RoleRulePredictor():
                                                 _label = 0
                                                 p_entity.label = _label
                                                 p_entity.values[int(_label)] = on_value
+                                                if 6<len(p_entity.entity_text) < 20: # 标题中角色长度在一定范围内的加分 优化类似367720967 标题中两个实体选择错误问题
+                                                    p_entity.values[int(_label)] += 0.005
                                                 break
                                     if p_entity.sentence_index >= 4:
                                         break
@@ -5322,7 +5327,10 @@ class TableTag2List():
                     try:
                         if text_process != None:
                             # text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
-                            td_text = re.sub('\xa0', '', text_process(cell, final=False))
+                            # td_text = re.sub('\xa0', '', text_process(cell, final=False))
+                            td_text = re.sub('\s|\xa0', '', str(cell.get_text())) # 修复 370835008 td 内公司被p标签拆分为两半情况
+                            if len(td_text)>30:
+                                td_text = re.sub('\xa0', '', text_process(cell, final=False))
                             if td_text == "":
                                 td_text = ' '
                             text = [td_text,0]
@@ -5472,6 +5480,7 @@ class TablePremExtractor(object):
         '''
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
+        text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
         if text in nlp_enterprise:
             return text
         if len(text) > 50 or len(text)<4:
@@ -5794,6 +5803,7 @@ class CandidateExtractor(object):
         '''
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
+        text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
         if text in nlp_enterprise:
             return text
         if len(text) > 50 or len(text)<4:
@@ -6056,6 +6066,57 @@ def role_special_predictor(web_source_name, content, nlp_enterprise):
         if ser and ser.group(1) in nlp_enterprise:
             return ser.group(1)
 
class WebsourceTenderee():
    """Fill in a missing tenderee role from a per-data-source lookup table.

    The table maps a ``web_source_no`` to a tenderee name and is loaded once
    when the object is constructed.
    """

    def __init__(self):
        # NOTE: despite the ``.pkl`` extension the file is read as UTF-8 text
        # and parsed with ``json.load``, i.e. it is expected to hold JSON.
        # os.path.join keeps the path relative when ``__file__`` has no
        # directory part; plain concatenation would yield
        # '/websource_tenderee.pkl' (filesystem root) in that case.
        table_path = os.path.join(os.path.dirname(__file__), 'websource_tenderee.pkl')
        with open(table_path, 'r', encoding='utf-8') as f:
            self.webno2ree = json.load(f)

    @staticmethod
    def _new_tenderee_role(web_ree):
        """Build a fresh tenderee role entry whose text is ``web_ree``."""
        return {'role_name': 'tenderee',
                'role_text': '%s' % web_ree,
                'role_money': {'money': 0, 'money_unit': '',
                               'floating_ratio': '',
                               'downward_floating_ratio': '',
                               'discount_ratio': ''},
                'linklist': [],
                'serviceTime': '',
                'address': ''}

    def get_websource_tenderee(self, web_source_no, prem):
        """
        Adjust the tenderee in ``prem`` using the tenderee tied to the data source.

        Only ``prem[0]['prem']`` is touched, and it is modified in place:

        * If ``web_source_no`` is not in the lookup table (or maps to an empty
          string) nothing is changed.
        * If there is no ``'Project'`` entry, one is created holding a single
          tenderee role.
        * If ``'Project'`` exists, the first role named ``'tenderee'`` gets its
          ``role_text`` filled in when that text is empty; a non-empty text is
          left untouched. When no tenderee role exists, one is appended.

        :param web_source_no: identifier of the data source.
        :param prem: list whose first element is a dict with a ``'prem'`` dict.
        :return: the same ``prem`` object that was passed in.
        """
        web_ree = self.webno2ree.get(web_source_no, '')
        if web_ree == '':
            return prem

        packages = prem[0]['prem']
        if 'Project' not in packages:
            packages['Project'] = {'code': '',
                                   'tendereeMoney': 0,
                                   'roleList': [self._new_tenderee_role(web_ree)]}
            return prem

        role_list = packages['Project']['roleList']
        for role in role_list:
            if role['role_name'] == 'tenderee':
                # Only the first tenderee role is considered.
                if role['role_text'] == "":
                    role['role_text'] = web_ree
                return prem

        # No tenderee role at all: add one.
        role_list.append(self._new_tenderee_role(web_ree))
        return prem
+
 
 def getSavedModel():
     #predictor = FormPredictor()

Diff do ficheiro suprimido por ser muito extenso
+ 0 - 0
BiddingKG/dl/interface/websource_tenderee.pkl


Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff