Browse Source

修复某些公告业绩删除报错;优化角色、地区匹配

lsm 11 hours ago
parent
commit
033cb3b1a0

+ 1 - 1
BiddingKG/dl/common/Utils.py

@@ -1074,7 +1074,7 @@ def del_tabel_achievement(soup):
             del_tag = tag.extract()
             # print('删除表格业绩内容', del_tag.text)
         #     print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text))
-        if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
+        elif re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
             _count = 0
             for td in tag.find('tr').find_all('td'):
                 td_text = td.text.strip()

+ 2 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -3451,6 +3451,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed = article_processed.replace(match.group(0), match.group(0)[:-2])
         if web_source_no.startswith('27763-') and re.search(',招标方(名称)?,', article_processed):
             article_processed = re.sub(',招标方(名称)?,', ',招标方名称:', article_processed)
+        if web_source_no.startswith('00678-') and re.search('采购方需求信息,', article_processed): # 修复 654313319 站源特殊表达
+            article_processed = re.sub('采购方需求信息,', '采购方需求信息:', article_processed)
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -524,7 +524,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2025-08-01'}
+    version_date = {'version_date': '2025-08-04'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 13 - 13
BiddingKG/dl/interface/predictor.py

@@ -881,7 +881,7 @@ class PREMPredict():
             elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
                 label = 5
             elif label == 0:
-                if re.search('拟邀请$|受邀谈判方|流入方名称:$|拟(选用|采用|选取)(单位|公司|企业)(名称)?:$|选择(建设|\w{,2})?服务单位:$', front): # 修复 626700009 二、拟选用单位:海南和泰消防技术服务有限公司。 632486555 选择建设服务单位:四川富吉兴工程管理有限公司,  642115802 拟采用公司:山东久木影视传媒有限公司
+                if re.search('拟邀请$|受邀谈判方|流入方名称:$|拟(选用|采用|选取)(单位|公司|企业)(名称)?:$|选择(建设|\w{,2})?服务单位:$|单一来源采购单位:$', front): # 修复 626700009 二、拟选用单位:海南和泰消防技术服务有限公司。 632486555 选择建设服务单位:四川富吉兴工程管理有限公司,  642115802 拟采用公司:山东久木影视传媒有限公司 654427839 单一来源采购单位:长沙新天地金融服务科技有限公司
                     label = 2
                     values[label] = 0.501
                 elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and is_agency(entity.entity_text):
@@ -890,7 +890,7 @@ class PREMPredict():
                 elif re.search('受托人((盖章))?:$', front):
                     label = 1
                     values[label] = 0.501
-                elif re.search('采用$|异议受理部门|本次招标有:$|直购企业:$|主报名人:$|采购候选人:$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-  标公告,本次招标有:内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。  438880541 直购企业可能为多个,其中一个中标
+                elif re.search('采用$|异议受理部门|本次招标有:$|直购企业:$|主报名人:$|采购候选人:$|申报企业:$|生产企业:$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-  标公告,本次招标有:内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。  438880541 直购企业可能为多个,其中一个中标 # 654390120 申报企业:
                     label = 5
                 elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
                     label = 2
@@ -6494,8 +6494,8 @@ class DistrictPredictor():
 
         province_l, city_l, district_l = self.find_whole_areas('%s %s'%(title, addr_project), self.pettern, self.area_variance_dic, self.full_dic)
         pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
-        big_area_1, pred_pro_1, pred_city_1, pred_dis_1, prob, max_score, code_dic_1 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
-        big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_1, pred_pro_1, pred_city_1, pred_dis_1, code_dic_1
+        big_area_1, pred_pro_1, pred_city_1, pred_dis_1, prob_1, max_score, code_dic_1 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+        big_area, pred_pro, pred_city, pred_dis, prob, code_dic = big_area_1, pred_pro_1, pred_city_1, pred_dis_1, prob_1, code_dic_1
         # print('关键词1:', province_l, city_l, district_l)
         # print('输入:', '标题:%s; 项目地址:%s'%(title, addr_project))
         # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
@@ -6515,12 +6515,12 @@ class DistrictPredictor():
             city_l.extend(city_l2)
             district_l.extend(district_l2)
             pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
-            big_area_2, pred_pro_2, pred_city_2, pred_dis_2, prob, max_score, code_dic_2 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
-            big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_2, pred_pro_2, pred_city_2, pred_dis_2, code_dic_2
+            big_area_2, pred_pro_2, pred_city_2, pred_dis_2, prob_2, max_score, code_dic_2 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+            big_area, pred_pro, pred_city, pred_dis, prob, code_dic = big_area_2, pred_pro_2, pred_city_2, pred_dis_2, prob_2, code_dic_2
             # print('关键词2:', province_l, city_l, district_l)
             # print('输入:', '招标人:%s; 招标人地址:%s; 收货地址:%s' % (ree, addr, addr_delivery))
             # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
-            if re.search('省|市|县|自治', addr_project) and pred_pro_1 != '' and pred_pro_1 != pred_pro_2: # 如果有项目地址使用项目地址  要有省市县等 275127622 工程地点为狮山镇颜峰综合区岐山至人和段道路, 提错 岐山
+            if re.search('省|市|县|自治', addr_project) and prob_1 !=0.5 and pred_pro_1 != '' and pred_pro_1 != pred_pro_2: # 如果有项目地址使用项目地址  要有省市县等 275127622 工程地点为狮山镇颜峰综合区岐山至人和段道路, 提错 岐山
                 not_sure = False
                 big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_1, pred_pro_1, pred_city_1, pred_dis_1, code_dic_1
             if not_sure and (pred_city_2 == "" or prob < 0.7 or max_score<2):
@@ -6530,12 +6530,12 @@ class DistrictPredictor():
                 city_l.extend(city_l3)
                 district_l.extend(district_l3)
                 pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
-                big_area_3, pred_pro_3, pred_city_3, pred_dis_3, prob, max_score, code_dic_3 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
-                big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_3, pred_pro_3, pred_city_3, pred_dis_3, code_dic_3
+                big_area_3, pred_pro_3, pred_city_3, pred_dis_3, prob_3, max_score, code_dic_3 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
+                big_area, pred_pro, pred_city, pred_dis, prob, code_dic = big_area_3, pred_pro_3, pred_city_3, pred_dis_3, prob_3, code_dic_3
                 # print('关键词3:', province_l, city_l, district_l)
                 # print('输入:', '联系:%s, 开标:%s, 邮寄:%s'%(addr_contact, addr_bidopen, addr_bidsend))
                 # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
-                if pred_city_2 != "" and pred_city_2 != pred_city_3:
+                if pred_city_2 != "" and prob_2 !=0.5 and pred_city_2 != pred_city_3:
                     not_sure = False
                     big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_2, pred_pro_2, pred_city_2, pred_dis_2, code_dic_2 # 如果招标人、招标人地址、收货地址与开标地址、联系地址等不一致,取招标人地址
                 if not_sure and (pred_city_3 == "" or prob < 0.6 or max_score < 2):
@@ -6545,9 +6545,9 @@ class DistrictPredictor():
                     city_l.extend(city_l4)
                     district_l.extend(district_l4)
                     pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
-                    big_area_4, pred_pro_4, pred_city_4, pred_dis_4, prob, max_score, code_dic_4 = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
-                    big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_4, pred_pro_4, pred_city_4, pred_dis_4, code_dic_4
-                    if pred_city_3 != "" and pred_city_3 != pred_city_4:
+                    big_area_4, pred_pro_4, pred_city_4, pred_dis_4, prob_4, max_score, code_dic_4 = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
+                    big_area, pred_pro, pred_city, pred_dis, prob, code_dic = big_area_4, pred_pro_4, pred_city_4, pred_dis_4, prob_4, code_dic_4
+                    if pred_city_3 != "" and prob_3 !=0.5 and pred_city_3 != pred_city_4:
                         not_sure = False
                         big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_3, pred_pro_3, pred_city_3, pred_dis_3, code_dic_3  # 如果开标地址等提取的城市与所有地址提取的城市不一致,取开标地址等
                     if pred_pro_3 != pred_pro_4 and (prob < 0.6 or max_score < 2):