Bladeren bron

需求与标段关联;处理发现的bug;

lsm 5 maanden geleden
bovenliggende
commit
e8fcf367fc

+ 1 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -2261,7 +2261,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
     def repair_by_summation(inner_table):
         # 修复合计在中间的特殊情况
         if len(inner_table) >= 3 and len(inner_table[1]) == 2 \
-                and inner_table[1][0][0] == '合计' and inner_table[1][1][0][-1] == '%':
+                and inner_table[1][0][0] == '合计' and inner_table[1][1][0].endswith('%'):
             inner_table[1][0][1] = 0
             inner_table[1][1][1] = 0
         return inner_table

+ 6 - 1
BiddingKG/dl/interface/extract.py

@@ -434,6 +434,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''地区获取'''
     start_time = time.time()
     district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
+    # district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic)
     cost_time["district"] = round(time.time() - start_time, 2)
 
     '''根据district提取结果修复实体'''
@@ -471,7 +472,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-12-18'}
+    version_date = {'version_date': '2024-12-24'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -501,6 +502,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''最终检查修正招标、中标金额'''
     getAttributes.limit_maximum_amount(data_res, list_entitys[0])
 
+    '''利用采购意向需求信息补充项目'''
+    if channel_dic['docchannel']['docchannel'] == '采购意向':
+        getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
+
     data_res["project_label"] = project_label
     data_res["property_label"] = property_label
     data_res["doctitle_refine"] = doctitle_refine

+ 26 - 0
BiddingKG/dl/interface/getAttributes.py

@@ -4895,6 +4895,32 @@ def fix_single_source(prem, channel_dic, original_docchannel):
                 if d['role_name'] == "win_tenderer":
                     d['role_name'] = 'pre_win_tenderer'
 
+def demand_to_prem(demand, prem):
+    """Number the items in demand['data'] and add each one to prem in place.
+
+    Only runs when there are more demand items than prem entries; items with
+    an empty project_name or budget get a demand_id but no prem entry.
+    """
+    demand_items = demand.get('data', [])
+    if len(demand_items) <= len(prem):
+        return
+    for seq, item in enumerate(demand_items, start=1):
+        item['demand_id'] = seq
+        title = item.get('project_name', '')
+        budget = item.get('budget', '')
+        if title == '' or budget == '':
+            continue
+        # A name already used as a prem key gets the 1-based index appended.
+        key = title + '_%d' % seq if title in prem else title
+        prem[key] = {
+            'demand_id': seq,
+            'code': '',
+            'name': title,
+            'roleList': [],
+            'tendereeMoney': budget,
+            'tendereeMoneyUnit': ""
+        }
+
 if __name__=="__main__":
     '''
     conn = getConnection()

+ 36 - 45
BiddingKG/dl/interface/predictor.py

@@ -3609,6 +3609,8 @@ class ProductAttributesPredictor():
                                 # print('产品拆分:', len(products),len(quantitys) , len(unitPrices),len(brands),len(specses))
                                 if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(specses):
                                     for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(products,quantitys,unitPrices, brands, specses, total_prices, parameters):
+                                        if product.strip() == '': # 20241219修复 572876124 最后一个符号分割产品所有要素为空问题
+                                            continue
                                         if quantity != "":
                                             quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
                                             quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
@@ -5811,7 +5813,7 @@ class DistrictPredictor():
         with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
             self.area_variance_dic = pickle.load(f)
 
-    def predict_area(self, title, ree, addr, web_source_name):
+    def predict_area(self, title, content, web_source_name, prem={}, addr_dic={}):
         p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
 
         def find_whole_areas(text, weight=1):
@@ -5823,8 +5825,9 @@ class DistrictPredictor():
             '''
             province_l, city_l, district_l = [], [], []
 
-            text = str(text)
-            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县',
+            text = str(text).replace('(', '(').replace(')', ')')
+            text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城',
                           ' ', text)  # 544151395 赤壁市老城区燃气管道老化更新改造
             text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
             text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
@@ -5836,6 +5839,7 @@ class DistrictPredictor():
                 text = text.replace(ser.group(0), ser.group(0) + '黎族')
             for k, v in self.area_variance_dic.items():  # 20241113 根据地区变更信息替换文本
                 text = text.replace(k, v)
+            text = re.sub('\s+', '', text)
 
             if re.search('[\u4e00-\u9fa5]', text) == None:
                 return province_l, city_l, district_l
@@ -5858,8 +5862,8 @@ class DistrictPredictor():
                                 score = 2
                             else:
                                 score = 1
-                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
-                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
+                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                        , text[it.end(k):]) or re.search('^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
                                     score += 1
                             score += it.end(k) / len(text) / 10
                             province_l.append((v, score * weight))
@@ -5868,8 +5872,8 @@ class DistrictPredictor():
                                 score = 2
                             else:
                                 score = 1
-                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
-                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
+                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                        , text[it.end(k):]) or re.search('^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
                                     score += 1
                             score += it.end(k) / len(text) / 10
                             city_l.append((v, score * weight))
@@ -5880,11 +5884,11 @@ class DistrictPredictor():
                                 score = 2
                             else:
                                 score = 0.5
-                                if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
-                                        , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
-                                    score += 1
-                                    # print('县区加分:', v, text)
-                            score += it.end(k) / len(text) / 10
+                                if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
+                                        , text[it.end(k):]) or (re.match('\s*%s'%v, text) and it.start(k)<2) or re.search(
+                                    '^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
+                                    score += 0.5
+                            # score += it.end(k) / len(text) / 10
                             if v == '昌江' and '景德镇' not in it.group(0):
                                 district_l.append(('昌江黎族', score * weight))
                             else:
@@ -5964,7 +5968,7 @@ class DistrictPredictor():
                             dis_ids[idx] = 0
                         dis_ids[idx] += score
                         pro_idx = idx_dic[idx]['省']
-                        if filter_short_dist and pro_idx not in pro_ids:
+                        if filter_short_dist and score < 1: # pro_idx not in pro_ids
                             continue
                         if pro_idx in tmp_pro:
                             tmp_pro[pro_idx] += score
@@ -6042,7 +6046,7 @@ class DistrictPredictor():
             tenderee = ""
             tenderee_address = ""
             try:
-                for v in prem[0]['prem'].values():
+                for v in prem.values():
                     for link in v['roleList']:
                         if link['role_name'] == 'tenderee' and tenderee == "":
                             tenderee = link['role_text']
@@ -6068,23 +6072,6 @@ class DistrictPredictor():
             else:
                 return ''
 
-        def get_project_addr(text):
-            p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w()]{,20}[,。])?|\w{2,15}[,。])'
-            p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'
-            if re.search(p1, text):
-                return re.search(p1, text).group('addr')
-            elif re.search(p2, text):
-                return re.search(p2, text).group('addr')
-            else:
-                return ''
-
-        def get_bid_addr(text):
-            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
-            if re.search(p2, text):
-                return re.search(p2, text).group('addr')
-            else:
-                return ''
-
         def get_all_addr(list_entitys):
             tenderee_l = []
             addr_l = []
@@ -6096,20 +6083,24 @@ class DistrictPredictor():
                         tenderee_l.append(ent.entity_text)
             return ' '.join(addr_l), ' '.join(tenderee_l)
 
-        def get_title_addr(text):
-            p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
-            if re.search(p1, text):
-                return re.search(p1, text).group('addr')
-            else:
-                return ''
-
         area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
-        province_l, city_l, district_l = find_whole_areas(title)
+        addr_project = addr_dic.get('addr_project', '')
+        addr_delivery = addr_dic.get('addr_delivery', '')
+        addr_bidopen = addr_dic.get('addr_bidopen', '')
+        addr_bidsend = addr_dic.get('addr_bidsend', '')
+        province_l, city_l, district_l = find_whole_areas('%s %s %s'%(title, addr_delivery, addr_project))
         pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
         big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
         # print('关键词1:', province_l, city_l, district_l)
         # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
         if pred_city == "" or prob < 0.7 or max_score<2:
+            ree, addr = get_ree_addr(prem)
+            rule_ree_addr = get_role_address(content)
+            if rule_ree_addr:
+                addr = rule_ree_addr
+
+            # addr = content
+            # ree = ''
             province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
             province_l.extend(province_l2)
             city_l.extend(city_l2)
@@ -6119,7 +6110,7 @@ class DistrictPredictor():
             # print('关键词2:', province_l, city_l, district_l)
             # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
             if pred_city == "" or prob < 0.7 or max_score<2:
-                province_l3, city_l3, district_l3 = find_whole_areas(web_source_name, weight=0.6)
+                province_l3, city_l3, district_l3 = find_whole_areas('%s %s %s'%(web_source_name, addr_bidopen, addr_bidsend), weight=0.6)
                 province_l.extend(province_l3)
                 city_l.extend(city_l3)
                 district_l.extend(district_l3)
@@ -8240,7 +8231,7 @@ class EntityTypeRulePredictor():
     def __init__(self):
         self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址]([((]网址[))])?[:为]'
         self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址]([((]网址[))])?[:为]'
-        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?)?地[点址][:为]'
+        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?)?地[点址]?[:为]'
         self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(实施|服务)?(地址|地点|位置|所在地区?)(位于)?[:为]|项目位于'
         self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
         self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
@@ -8279,13 +8270,13 @@ class EntityTypeRulePredictor():
         ser3 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_delivery, list_articles[0].content)
         ser4 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_project, list_articles[0].content)
         ser5 = re.search('(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]'%self.pattern_code_investment, list_articles[0].content)
-        if ser1 and re.search('\w{2,5}[省市区]|\d号|采购网|http', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
+        if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
             addr_dic['addr_bidopen'] = ser1.group('addr')
-        if ser2 and re.search('\w{2,5}[省市区]|\d号|采购网|http', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
+        if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
             addr_dic['addr_bidsend'] = ser2.group('addr')
-        if ser3 and re.search('\w{2,5}[省市区]|\d号', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
+        if ser3 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
             addr_dic['addr_delivery'] = ser3.group('addr')
-        if ser4 and re.search('\w{2,5}[省市区]|\d号', ser4.group('addr')) and addr_dic.get('addr_project', '') in ser4.group('addr'):
+        if ser4 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser4.group('addr')) and addr_dic.get('addr_project', '') in ser4.group('addr'):
             addr_dic['addr_project'] = ser4.group('addr')
         if ser5 and code_investment == '':
             code_investment = ser5.group('code')