Quellcode durchsuchen

补充站源招标人;去除包含“采购人:某单位”等招标人

lsm vor 20 Stunden
Ursprung
Commit
7250ec15af

+ 3 - 3
BiddingKG/dl/interface/extract.py

@@ -499,7 +499,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     repair_entity(prem,district,list_articles)
 
     '''根据数据源最后召回招标人角色'''
-    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, web_source_name, prem)
+    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(doc_id, web_source_no, web_source_name, prem)
 
     '''根据关键词表生成项目标签'''
     project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
@@ -511,7 +511,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)
 
     '''最终验证prem'''
-    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'])
+    getAttributes.confirm_prem(doc_id, prem[0]['prem'], channel_dic, list_articles[0].content, deposit_project, prem[0]['total_tendereeMoney'])
 
     '''规则补充招标无招标人中标无中标人角色'''
     getAttributes.rule_add_role(doc_id,prem[0]['prem'], channel_dic, list_articles[0].content, web_source_no, nlp_enterprise)
@@ -533,7 +533,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2025-06-06'}
+    version_date = {'version_date': '2025-06-10'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 9 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -5089,13 +5089,13 @@ def rule_add_role(docid, prem, channel, content, web_source_no, nlp_enterprise):
         match = re.search('((中标|中选|成交))?(人|方|供应商|服务商|单位|部门)|(拟定|[,。])供应商)(信息[,:]?)?(名称)?((乙方))?:(?P<name>[\w()—-]{4,35})([,。]|$)',content)
         if match:
             ent_name = match.group('name')
-            if re.search('测试|演示|某|\d号|\*|XX', ent_name)==None and re.search('^\w{1,5}[省市县区][\w()]{2,25}[厂店铺市场行部城室馆中心站处社会狱所园关局署段厅院队小学]((个体工商户)?|(普通合伙)?)?$',
+            if re.search('测试|演示|某|\d号|\*|XX', ent_name)==None and re.search('^\w{1,5}[省市县区][\w()]{2,25}[厂店铺市场行部城室馆中心站处社会狱所园关局署段厅院队小学]((个体工商户)?|(普通合伙)?)?$',
                          ent_name):  #  or is_enterprise_exist(ent_name)
                 log('规则补充中标人角色:%s,docid:%s'%(ent_name, docid))
                 add_role(ent_name, "win_tenderer", prem)
 
 
-def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
+def confirm_prem(docid, prem, channel_dic, content, is_deposit_project=False, total_tendereeMoney=0):
     '''
     规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包;
     :param prem: prem 字段字典
@@ -5152,6 +5152,13 @@ def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMone
             if float(prem[k]['tendereeMoney'])==0:
                 prem[k]['tendereeMoney'] = total_tendereeMoney
 
+    # Drop low-confidence tenderees when the text only names a placeholder buyer such as "采购人:某单位".
+    if "Project" in prem and re.search('(招标|采购|招商)(人|商|单位|部门)(信息[,:]?)?(名称)?((甲方))?:某(单位|部门)', content):
+        for d in list(prem['Project']['roleList']):  # iterate a copy: removing from the live list would skip the next entry
+            if d['role_name'] == 'tenderee' and d.get('role_prob', 0)<0.8:
+                log('规则去除文中包含“采购人:某单位”等概率小于0.8的招标人:%s,docid:%s'%(d.get('role_text', ''), docid))
+                prem['Project']['roleList'].remove(d)
+
 def add_package_name(prem, list_entity, product_list, name):
     '''
     通过产品、项目名称,补充各标段包名,如果标段无包名,标段后紧接产品,把产品作为包名;如果标段数少于等于2且包名为空,补充项目名称为包名

+ 11 - 1
BiddingKG/dl/interface/predictor.py

@@ -7841,7 +7841,7 @@ class WebsourceTenderee():
         with open(os.path.dirname(__file__)+'/websource_tenderee.pkl', 'r', encoding='utf-8') as f:
             self.webno2ree = json.load(f)
 
-    def get_websource_tenderee(self, web_source_no, web_source_name, prem):
+    def get_websource_tenderee(self, docid, web_source_no, web_source_name, prem):
         '''
         通过数据源唯一招标人召回调整prem中的招标人,
         :param web_source_no:
@@ -7854,6 +7854,8 @@ class WebsourceTenderee():
             web_ree = '中国人民解放军总医院'
         elif web_source_no.startswith('Y00484-') and web_ree == "":
             web_ree = '航空总医院'
+        elif web_source_no.startswith('DX015142-'):
+            web_ree = '海尔集团'
         if web_ree == "" and re.search('\w{2,8}(大学|医院|妇幼保健院)$', web_source_name): # 20240524 大学、医院类站源没唯一招标人默认为站源名称
             web_ree = web_source_name
         if web_source_no in ['DX013230-2', 'DX008427-1', 'DX001298', 'DX013230-3', 'DX003960-1', 'DX002797-1', 'DX010532',
@@ -7862,6 +7864,7 @@ class WebsourceTenderee():
                              'DX000980', 'DX013748-1', 'DX002488', '07056-4', 'XX2102']: # 统计分析可能有问题的唯一招标人
             web_ree = ''
         if web_ree != '':
+            flag = 0 # 是否被替换为站源招标人
             if 'Project' in prem[0]['prem']:
                 find_tenderee = False
                 for d in prem[0]['prem']['Project']['roleList']:
@@ -7869,10 +7872,13 @@ class WebsourceTenderee():
                         find_tenderee = True
                         if d['role_text'] == "":
                             d['role_text'] = web_ree
+                            flag = 1
                         elif re.search('大学$', web_ree) and re.search('学院$', d['role_text']) and web_ree not in d['role_text']:
                             d['role_text'] = web_ree
+                            flag = 1
                         elif d.get('role_prob', 0) < 0.8 and get_business_data(d['role_text'])[0] == False: # 20240201 概率低于0.8且没有工商数据的替换为站源招标人
                             d['role_text'] = web_ree
+                            flag = 1
                         # elif re.search(p, web_ree) and (re.search(p, d['role_text'])==None and len(d['role_text'])<6): # 数据源唯一招标人以医院等结尾,角色中无相关关键词的,替换为数据源招标人
                         #     d['role_text'] = web_ree
                         # elif re.search('有限(责任)?公司', web_ree) and (re.search('有限(责任)?公司', d['role_text'])==None and len(d['role_text'])<6):
@@ -7888,6 +7894,7 @@ class WebsourceTenderee():
                                                                    'linklist': [],
                                                                    'serviceTime': '',
                                                                    'address': ''})
+                    flag = 1
 
             else:
                 prem[0]['prem']['Project'] = {'code': '',
@@ -7901,6 +7908,7 @@ class WebsourceTenderee():
                                                    'serviceTime': '',
                                                    'address': ''}
                                               ]}
+                flag = 1
             tenderee_l = [d2['role_text'] for v in prem[0]['prem'].values() for d2 in v['roleList'] if
                           d2['role_name'] == 'tenderee']
             winner_l = [d2['role_text'] for v in prem[0]['prem'].values() for d2 in v['roleList'] if
@@ -7909,6 +7917,8 @@ class WebsourceTenderee():
                 for k in prem[0]['prem']:
                     prem[0]['prem'][k]['roleList'] = [d for d in prem[0]['prem'][k]['roleList'] if
                                                not (d['role_name'] == 'win_tenderer' and d['role_text'] in tenderee_l)]
+            if flag == 1:
+                log('规则补充站源招标人:%s,docid:%s'%(web_ree, docid))
         return prem
 
 def get_header_line(list_item):