6 bulan lalu · 2435fb0e06
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -267,10 +267,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     '''大纲提取及大纲内容相关提取'''
			
 
				     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
			
 
				     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
			
 
				-    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope = extract_parameters(parse_document)
			
 
				+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name = extract_parameters(parse_document)
			
 
				     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
			
 
				         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
			
 
				-        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope = extract_parameters(parse_document)
			
 
				+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name = extract_parameters(parse_document)
			
 
				     if addr_bidopen_text == '':
			
 
				         addr_bidopen_text = extract_addr(list_articles[0].content)
			
 
				 
			
@@ -455,7 +455,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-11-08'}
			
 
				+    version_date = {'version_date': '2024-11-21'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
@@ -526,6 +526,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
			
 
				     # 是否为存款项目
			
 
				     data_res['is_deposit_project'] = deposit_project
			
 
				+    data_res['pinmu_name'] = pinmu_name
			
 
				 
			
 
				     # for _article in list_articles:
			
 
				     #         log(_article.content)
			
--- a/BiddingKG/dl/interface/get_label_dic.py
+++ b/BiddingKG/dl/interface/get_label_dic.py
@@ -219,6 +219,24 @@ def get_all_label(title, content):
 
				     #         return 3
			
 
				     #     return 0
			
 
				 
			
 
				+    def need_government_policy():
			
 
				+        # 有落实政府采购政策需满足的资格要求：1；否则 0
			
 
				+        if re.search('落实政府采购政策需满足的资格要求', content):
			
 
				+            if re.search('落实政府采购政策需满足的资格要求：无，', content) == None:
			
 
				+                return '否'
			
 
				+            return '是'
			
 
				+        return 0
			
 
				+
			
 
				+    def consortium_permit():
			
 
				+        # 允许联合体：是；不允许：否；无关键词：0
			
 
				+        if re.search('(接受|允许|欢迎|同意)）?联合体(参与)?投标|联合体投标是合法的|联合体投标的，应|联合体各方应具备承担|投标人可以组成联合体', content):
			
 
				+            if re.search('不(接受|允许|欢迎|同意)）?联合体(参与)?投标|禁止联合体(参与)?投标|投标人不得为联合体|仅接受独立法人投标|投标人必须为独立法人|不得组成联合体|只有独立法人单位可以参与', content):
			
 
				+                return '否'
			
 
				+            return '是'
			
 
				+        elif re.search('联合(体|方|投标人)：|联合体(成员|单位)[12345一二三四五]?：|(联合体)?成员单位[12345一二三四五]?：|特殊普通合伙：|（联合(体|投标人)）|（联合体(成员|单位)方?[12345一二三四五]?）|（(联合体)?成员单位[12345一二三四五]?）|（特殊普通合伙|成员?）|[，；]成：|（成[），]|与[^，。]{6,100}联合体', content):
			
 
				+            return '是'
			
 
				+        return 0
			
 
				+
			
 
				     label_dic = {}
			
 
				     is_direct_procurement = is_direct_procurement() # 是否直接采购
			
 
				     is_target_small = is_target_small() # 是否面向中小企业
			
@@ -229,6 +247,8 @@ def get_all_label(title, content):
 
				     registered_capital = registered_capital() # 注册资本
			
 
				     registered_years = registered_years() # 注册年限
			
 
				     # suitable_small = suitable_small() # 适合小微企业
			
 
				+    government_policy = need_government_policy() # 落实政府采购政策需满足的资格要求
			
 
				+    consortium_permit = consortium_permit()
			
 
				 
			
 
				     label_dic['is_direct_procurement'] = is_direct_procurement
			
 
				     label_dic['is_target_small'] = is_target_small
			
@@ -239,6 +259,8 @@ def get_all_label(title, content):
 
				     label_dic['registered_capital'] = registered_capital
			
 
				     label_dic['registered_years'] = registered_years
			
 
				     # label_dic['suitable_small'] = suitable_small
			
 
				+    label_dic['government_policy'] = government_policy
			
 
				+    label_dic['consortium_permit'] = consortium_permit
			
 
				 
			
 
				     label_dic = {k: v for k, v in label_dic.items() if v!=0}
			
 
				 
			
--- a/BiddingKG/dl/interface/outline_extractor.py
+++ b/BiddingKG/dl/interface/outline_extractor.py
@@ -60,6 +60,7 @@ requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|
 
				 aptitude_pattern = "(资格要求|资质要求)([:：，]|$)"
			
 
				 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[)）]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([:：，]|$)|开启([:：，]|$)"
			
 
				 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([:：，]|$)"
			
 
				+pinmu_name_pattern = "采购品目名称：(\w{2,50})[，。]"
			
 
				 out_lines = []
			
 
				 
			
 
				 def extract_parameters(parse_document):
			
@@ -74,6 +75,7 @@ def extract_parameters(parse_document):
 
				     addr_bidopen_text = '' # 开标地址
			
 
				     addr_bidsend_text = '' # 投标地址
			
 
				     requirement_scope = [] # 采购内容始末位置
			
 
				+    pinmu_name = '' # 品目名称
			
 
				 
			
 
				     _find_count = 0
			
 
				     _data_i = -1
			
@@ -158,6 +160,8 @@ def extract_parameters(parse_document):
 
				                         addr_bidsend_text += c["text"]
			
 
				                     _data_i += len(childs)
			
 
				                     _data_i -= 1
			
 
				+                elif re.search(pinmu_name_pattern, _text):
			
 
				+                    pinmu_name += re.search(pinmu_name_pattern, _text).group(1)
			
 
				     if re.search('时间：', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidopen_text):
			
 
				         for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidopen_text):
			
 
				             b, e = ser.span()
			
@@ -168,7 +172,7 @@ def extract_parameters(parse_document):
 
				         for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidsend_text):
			
 
				             b, e = ser.span()
			
 
				         addr_bidsend_text = addr_bidsend_text[b:e]
			
 
				-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope
			
 
				+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name
			
 
				 
			
 
				 def extract_addr(content):
			
 
				     '''
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -3116,7 +3116,7 @@ class ProductAttributesPredictor():
 
				         '''
			
 
				         items = [re.sub('\s', '', it) for it in items]
			
 
				         flag = False
			
 
				-        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':''}
			
 
				+        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':'', '品目号':'', '品目名':''}
			
 
				         product = ""  # 产品
			
 
				         quantity = ""  # 数量
			
 
				         quantity_unit = "" # 数量单位
			
@@ -3132,6 +3132,8 @@ class ProductAttributesPredictor():
 
				         tenderee = "" # 采购人
			
 
				         notes = "" # 备注  2024/3/27 达仁 需求
			
 
				         issue_date = ""  # 发布日期 2024/3/27 达仁 需求
			
 
				+        pinmu_no = "" # 品目号
			
 
				+        pinmu_name = "" # 品目名称
			
 
				 
			
 
				         # for i in range(min(6, len(items))):
			
 
				         for i in range(len(items)):
			
@@ -3168,6 +3170,12 @@ class ProductAttributesPredictor():
 
				         if flag:
			
 
				             # for j in range(i + 1, len(items)):
			
 
				             for j in range(len(items)):
			
 
				+                if header_dic['品目号'] == "" and re.search('(品目|品类)(编?号|编码|序号)', items[j]):
			
 
				+                    header_dic['品目号'] = j
			
 
				+                    pinmu_no = items[j]
			
 
				+                elif header_dic['品目名'] == "" and re.search('(品目|品类)名称|采购(品目|品类)$', items[j]):
			
 
				+                    header_dic['品目名'] = j
			
 
				+                    pinmu_name = items[j]
			
 
				                 if items[j] in [product, category]:
			
 
				                     continue
			
 
				                 if len(items[j]) > 20 and len(re.sub('[\(（].*[）\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
			
@@ -3220,9 +3228,9 @@ class ProductAttributesPredictor():
 
				                 # if num >=2:
			
 
				                 #     return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
			
 
				                 if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
			
 
				-                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee, notes,issue_date)
			
 
				+                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter, pinmu_no, pinmu_name), (product, demand, budget, order_time,tenderee, notes,issue_date)
			
 
				         flag = False
			
 
				-        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee,notes,issue_date)
			
 
				+        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter, pinmu_no, pinmu_name), (product, demand, budget, order_time,tenderee,notes,issue_date)
			
 
				 
			
 
				     def predict(self, docid='', html='', page_time=""):
			
 
				         '''
			
@@ -3390,12 +3398,13 @@ class ProductAttributesPredictor():
 
				                 tenderee = "" # 采购人
			
 
				                 notes = '' # 备注
			
 
				                 issue_date = '' # 发布日期
			
 
				+                pinmu_no = '' # 品目号
			
 
				+                pinmu_name = '' # 品目名称
			
 
				                 if len(set([re.sub('[:：\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
			
 
				                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
			
 
				                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
			
 
				                     if found_header:
			
 
				                         header_colnum = len(tds) # 保存表头所在行列数
			
 
				-                        # print('发现表头：', header_colnum, header_dic)
			
 
				                     if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
			
 
				                             quantity_header = header_list[1].replace('单位：', '')
			
 
				                             if re.search('（([\w/]{,5})）', quantity_header):
			
@@ -3433,6 +3442,8 @@ class ProductAttributesPredictor():
 
				 
			
 
				                     id12 = header_dic.get('备注', "")
			
 
				                     id13 = header_dic.get('发布日期', "")
			
 
				+                    id14 = header_dic.get('品目号', "")
			
 
				+                    id15 = header_dic.get('品目名', "")
			
 
				 
			
 
				                     not_attr = 0
			
 
				                     for k, v in header_dic.items():
			
@@ -3528,6 +3539,10 @@ class ProductAttributesPredictor():
 
				                             notes = tds[id12].strip()
			
 
				                         if id13 != "":
			
 
				                             issue_date = self.fix_time(tds[id13].strip(), '', '')[0]
			
 
				+                        if id14 != "":
			
 
				+                            pinmu_no = tds[id14].strip()
			
 
				+                        if id15 != "":
			
 
				+                            pinmu_name = tds[id15].strip()
			
 
				                         # print('数量：{0}, 单价：{1}, 品牌：{2}， 规格：{3}，总价：{4}'.format(quantity ,unitPrice, brand, specs, total_price))
			
 
				                         if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
			
 
				                             if id1!="" and id2 != "" and id3 != "" and len(re.split('[;；、，\n]+', tds[id2])) > 1 and len(re.split('[;；、，\n]+', tds[id1])) == len(re.split('[;；、，\n]+', tds[id2])): # 处理一个空格包含多个产品，逗号或空格分割情况 例子 292846806 292650743
			
@@ -3608,7 +3623,8 @@ class ProductAttributesPredictor():
 
				                                     total_price_list.append(total_price)
			
 
				                                     total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
			
 
				                                 link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
			
 
				-                                                          'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter}
			
 
				+                                                          'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter,
			
 
				+                                                            'pinmu_no': pinmu_no, 'pinmu_name': pinmu_name}
			
 
				 
			
 
				                                 # if link not in product_link:
			
 
				                                 #     product_link.append(link)
			
@@ -5744,6 +5760,9 @@ class DistrictPredictor():
 
				             district_tuple = pickle.load(f)
			
 
				             self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
			
 
				 
			
 
				+        with open(os.path.dirname(__file__) + "area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
			
 
				+            self.area_variance_dic = pickle.load(f)
			
 
				+
			
 
				     def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
			
 
				         '''
			
 
				         先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
			
@@ -6000,8 +6019,12 @@ class DistrictPredictor():
 
				 
			
 
				             final_pro = ""
			
 
				             final_city = ""
			
 
				+            pro_prob = 0
			
 
				+            city_prob = 0
			
 
				             if len(pro_ids) >= 1:
			
 
				                 pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                scores = [it[1] for it in pro_l]
			
 
				+                pro_prob = max(scores)/sum(scores)
			
 
				                 final_pro, score = pro_l[0]
			
 
				                 if score >= 0.01:
			
 
				                     pred_pro = idx_dic[final_pro]['返回名称']
			
@@ -6011,6 +6034,8 @@ class DistrictPredictor():
 
				 
			
 
				             if pred_pro != "" and len(city_ids) >= 1:
			
 
				                 city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                scores = [it[1] for it in city_l]
			
 
				+                city_prob = max(scores) / sum(scores)
			
 
				                 for it in city_l:
			
 
				                     if idx_dic[it[0]]['省'] == final_pro:
			
 
				                         final_city = it[0]
			
@@ -6021,6 +6046,13 @@ class DistrictPredictor():
 
				                 for it in dis_l:
			
 
				                     if idx_dic[it[0]]['市'] == final_city:
			
 
				                         pred_dis = idx_dic[it[0]]['返回名称']
			
 
				+            elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1: # 20241111 省份不为空，市为空，如果区县在省份下，补充对应的市县
			
 
				+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                for it in dis_l:
			
 
				+                    if idx_dic[it[0]]['省'] == final_pro:
			
 
				+                        pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
			
 
				+                        pred_dis = idx_dic[it[0]]['返回名称']
			
 
				+                        # print('20241111 省份不为空，市为空，如果区县在省份下，补充对应的市县: ', pred_city, pred_dis)
			
 
				 
			
 
				             if pred_city in ['北京', '天津', '上海', '重庆']:
			
 
				                 pred_city = pred_dis
			
@@ -6073,7 +6105,7 @@ class DistrictPredictor():
 
				             p_pro, p_city, p_dis, p_city, p_dis, p_dis)
			
 
				             province_l, city_l, district_l = [], [], []
			
 
				             for it in re.finditer(pettern, text):
			
 
				-                if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
			
 
				+                if re.search('[省市区县旗盟]', it.group(0)) == None and re.search(
			
 
				                         '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
			
 
				                     continue
			
 
				                 if it.group(0) == '站前':  # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份：辽宁， 城市：营口，区县：站前
			
@@ -6096,7 +6128,7 @@ class DistrictPredictor():
 
				             return province_l, city_l, district_l
			
 
				 
			
 
				         def get_pro_city_dis_score(text, text_weight=1):
			
 
				-            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)', ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
			
 
				+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间', ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
			
 
				             text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
			
 
				             text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域：怒江州 识别为广西 - 崇左 - 江州
			
 
				             text = re.sub('茂名滨海新区', '茂名市', text)
			
@@ -6105,15 +6137,17 @@ class DistrictPredictor():
 
				             ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
			
 
				             if ser and '黎族' not in ser.group(0):
			
 
				                 text = text.replace(ser.group(0), ser.group(0)+'黎族')
			
 
				+            for k, v in self.area_variance_dic.items(): # 20241113 根据地区变更信息替换文本
			
 
				+                text = text.replace(k, v)
			
 
				             # province_l = find_areas(p_pro, text)
			
 
				             # city_l = find_areas(p_city, text)
			
 
				             # district_l = find_areas(p_dis, text)
			
 
				 
			
 
				             province_l, city_l, district_l = find_whole_areas(text) # 20240703 优化地址提取，解决类似 海南昌江 得到 海南 南昌 结果
			
 
				 
			
 
				-            if len(province_l) == len(city_l) == 0:
			
 
				-                district_l = [it for it in district_l if
			
 
				-                              re.search('[市县旗区]$', it[0])]  # 20240428去掉只有区县地址且不是全称的匹配，避免错误 例 凌云工业股份有限公司 提取地区为广西白色凌云
			
 
				+            # if len(province_l) == len(city_l) == 0:
			
 
				+            #     district_l = [it for it in district_l if
			
 
				+            #                   re.search('[市县旗区]$', it[0])]  # 20240428去掉只有区县地址且不是全称的匹配，避免错误 例 凌云工业股份有限公司 提取地区为广西白色凌云
			
 
				 
			
 
				             province_l = chage_area2score(province_l, max_len=len(text))
			
 
				             city_l = chage_area2score(city_l, max_len=len(text))
			
@@ -6192,7 +6226,8 @@ class DistrictPredictor():
 
				                             dis_ids[idx] = 0
			
 
				                         weight = idx_dic[idx]['权重']
			
 
				                         dis_ids[idx] += (score + 0) * w
			
 
				-
			
 
				+                        if idx_dic[idx]['市'] not in city_ids and idx_dic[idx]['省'] not in pro_ids: # 20241111 区县简称不在获取到的省、市范围内的过滤掉
			
 
				+                            continue
			
 
				                         pro_idx = idx_dic[idx]['省']
			
 
				                         if pro_idx in pro_ids:
			
 
				                             pro_ids[pro_idx] += (score + 0) * w * weight
			
@@ -6203,6 +6238,8 @@ class DistrictPredictor():
 
				                             city_ids[city_idx] += (score + 0) * w * weight
			
 
				                         # else: # 20241015 注销 区县简称且不在提取的省市下面，不加分，避免提取错误 例：536550843
			
 
				                         #     city_ids[city_idx] = (score + 0) * w * weight * 0.1
			
 
				+                        elif pro_idx in pro_ids:
			
 
				+                            city_ids[city_idx] = (score + 0) * w * weight * 0.1
			
 
				 
			
 
				             for k, v in pro_ids.items():
			
 
				                 pro_ids[k] = v * text_weight
			
@@ -6215,7 +6252,7 @@ class DistrictPredictor():
 
				         area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
			
 
				 
			
 
				         pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
			
 
				-        pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name[:3], text_weight=0.2) # 20240422 修改为站源名称只取前三字，避免类似 459056219 中金岭南阳光采购平台 错提取阳光
			
 
				+        pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.01) # 20240422 修改为站源名称只取前三字，避免类似 459056219 中金岭南阳光采购平台 错提取阳光
			
 
				         for k in pro_ids1:
			
 
				             if k in pro_ids:
			
 
				                 pro_ids[k] += pro_ids1[k]
			
@@ -6288,7 +6325,7 @@ class DistrictPredictor():
 
				                 return ''
			
 
				 
			
 
				         def get_project_addr(text):
			
 
				-            p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w（）]{,20}[，。])?|\w{2,15}[，。])'
			
 
				             p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'
			
 
				             if re.search(p1, text):
			
 
				                 return re.search(p1, text).group('addr')
			
@@ -6332,12 +6369,12 @@ class DistrictPredictor():
 
				         tenderee, tenderee_address = get_ree_addr(prem)
			
 
				         msc = ""
			
 
				         pro_addr = get_project_addr(content)
			
 
				-        if pro_addr != "":
			
 
				+        if pro_addr != "" and re.search('(采购人|招标人)?指定地点', pro_addr)==None: # 排除错误项目地址 例：554024168 1.5服务地点：采购人指定地点。
			
 
				             msc += '使用规则提取的项目地址；'
			
 
				             tenderee_address = pro_addr
			
 
				         else:
			
 
				             role_addr = get_role_address(content)
			
 
				-            if role_addr != "":
			
 
				+            if role_addr != "" and re.search('(采购人|招标人)?指定地点', role_addr)==None:
			
 
				                 msc += '使用规则提取的联系人地址；'
			
 
				                 tenderee_address = role_addr
			
 
				 
			
@@ -6370,14 +6407,14 @@ class DistrictPredictor():
 
				         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
			
 
				         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  # 预防提取错 合肥 路南 新会 等地区
			
 
				 
			
 
				-        if pro_addr and re.search('\w{2,}([省市县旗盟]|自治[区州县旗])', pro_addr):
			
 
				+        if pro_addr and re.search('\w{2,}([市县旗盟]|自治[区州县旗])', pro_addr):
			
 
				             if re.search('[市县旗盟]', pro_addr)==None: # 修复 486623506 项目地址不完整
			
 
				                 pro_addr = text1 + ' '+ pro_addr
			
 
				             msc += '## 使用项目地址输入：%s ##；' % pro_addr
			
 
				             rs = self.get_area(pro_addr, '')
			
 
				             msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
			
 
				                 rs['district']['province'], rs['district']['city'], rs['district']['district'])
			
 
				-            if rs['district']['province'] != '全国':
			
 
				+            if rs['district']['province'] != '全国' and rs['district']['city'] != '未知':
			
 
				                 # print('地区匹配：', msc)
			
 
				                 return rs
			
 
				 
			
@@ -6389,7 +6426,7 @@ class DistrictPredictor():
 
				         # self.f.write('%s %s \n' % (list_articles[0].id, msc))
			
 
				         # print('地区匹配：', msc)
			
 
				         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
			
 
				-            msc = ""
			
 
				+            # msc = ""
			
 
				             all_addr, tenderees = get_all_addr(list_entitys)
			
 
				             text2 = tenderees + " " + all_addr + ' ' + title
			
 
				             msc += '使用实体列表所有招标人+所有地址；'