Browse Source

公告去重规则优化

znj 2 weeks ago
parent
commit
5586f390f7

+ 2 - 1
BaseDataMaintenance/maintenance/dataflow.py

@@ -2284,6 +2284,7 @@ class Dataflow_dumplicate(Dataflow):
         # 变更内容(变更答疑公告)
         _dict["change_content"] = _extract.get("change_content","")
         _dict["change_time"] = _extract.get("change_time","")
+        _dict["word_count"] = _extract.get("word_count", {})# 正文附件文本字数统计
 
         # 专项债字段
         issue_details = _extract.get("debt_dic",{}).get("issue_details",[])
@@ -4251,7 +4252,7 @@ class Dataflow_dumplicate(Dataflow):
                     if v is not None and len(v)>0:
                         if l_page_time>v:
                             has_before = True
-                        if v>page_time:
+                        if v>=page_time:
                             has_after = True
                         if k==document_tmp_time_bidclose:
                             bidclose_time = v

+ 140 - 48
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -3,6 +3,8 @@ from odps.udf import annotate
 from odps.udf import BaseUDTF
 from odps.udf import BaseUDAF
 import re
+import os
+import traceback
 
 @annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
 class f_decode_extract(BaseUDTF):
@@ -702,6 +704,16 @@ class f_dumplicate_groupPairs(BaseUDAF):
 
         return json.dumps(list_dict)
 
+from decimal import Decimal, ROUND_HALF_UP
+# High-precision rounding: same arguments as round(), but ties always round away from zero.
+def precise_round(number, decimals=0):
+    """Return number rounded half-up to `decimals` places as a float (negative decimals round to tens, hundreds, ...)."""
+    # Build the Decimal from str(number) so it sees the float's printed value, not its binary expansion.
+    exact = Decimal(str(number))
+    # quantize() rounds to the exponent of its argument, e.g. decimals=2 -> Decimal("1e-2").
+    step = Decimal("1e%d" % -decimals)
+    return float(exact.quantize(step, rounding=ROUND_HALF_UP))
+
 def check_columns(tenderee_less,tenderee_greater,
                   agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                   win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
@@ -783,7 +795,7 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                 win_bid_price_less,win_bid_price_greater,
                 moneys_less,moneys_greater,
                 moneys_attachment_less,moneys_attachment_greater):
-    # print('bidding_budget_less',bidding_budget_less,'bidding_budget_greater',bidding_budget_greater)
+    # print('bidding_budget_less',bidding_budget_less,'bidding_budget_greater',bidding_budget_greater,'win_bid_price_less',win_bid_price_less,'win_bid_price_greater',win_bid_price_greater)
     bidding_budget_less_source = bidding_budget_less
     bidding_budget_greater_source = bidding_budget_greater
     win_bid_price_less_source = win_bid_price_less
@@ -791,21 +803,29 @@ def check_money(bidding_budget_less,bidding_budget_greater,
     #只判断最高前六位
     if getLength(bidding_budget_less)>0:
         bidding_budget_less_source = float(bidding_budget_less_source)
-        bidding_budget_less = round(float(bidding_budget_less))
-        bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
+        # bidding_budget_less = round(float(bidding_budget_less))
+        bidding_budget_less = int(precise_round(float(bidding_budget_less)))
+        # bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
+        bidding_budget_less = str(precise_round(bidding_budget_less,6-len(str(bidding_budget_less))))
     if getLength(bidding_budget_greater)>0:
         bidding_budget_greater_source = float(bidding_budget_greater_source)
-        bidding_budget_greater = round(float(bidding_budget_greater))
-        bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
+        # bidding_budget_greater = round(float(bidding_budget_greater))
+        bidding_budget_greater = int(precise_round(float(bidding_budget_greater)))
+        # bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
+        bidding_budget_greater = str(precise_round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
 
     if getLength(win_bid_price_less)>0:
         win_bid_price_less_source = float(win_bid_price_less_source)
-        win_bid_price_less = round(float(win_bid_price_less))
-        win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
+        # win_bid_price_less = round(float(win_bid_price_less))
+        win_bid_price_less = int(precise_round(float(win_bid_price_less)))
+        # win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
+        win_bid_price_less = str(precise_round(win_bid_price_less,6-len(str(win_bid_price_less))))
     if getLength(win_bid_price_greater)>0:
         win_bid_price_greater_source = float(win_bid_price_greater_source)
-        win_bid_price_greater = round(float(win_bid_price_greater))
-        win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
+        # win_bid_price_greater = round(float(win_bid_price_greater))
+        win_bid_price_greater = int(precise_round(float(win_bid_price_greater)))
+        # win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
+        win_bid_price_greater = str(precise_round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
 
     #check saming
     budget_is_same = ""
@@ -814,7 +834,6 @@ def check_money(bidding_budget_less,bidding_budget_greater,
         budget_less = float(bidding_budget_less)
         budget_greater = float(bidding_budget_greater)
 
-
         if budget_less!=budget_greater:
             if min(budget_less,budget_greater)>0:
                 # if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
@@ -822,7 +841,7 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                 if (max(budget_less,budget_greater)/min(budget_less,budget_greater)>9999 and max(budget_less,budget_greater)/min(budget_less,budget_greater)<10001)\
                         or (max(bidding_budget_less_source,bidding_budget_greater_source)/min(bidding_budget_less_source,bidding_budget_greater_source)>9999 and max(bidding_budget_less_source,bidding_budget_greater_source)/min(bidding_budget_less_source,bidding_budget_greater_source)<10001):
                     budget_is_same = True
-            if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
+            if budget_less>10000 and budget_greater>10000 and precise_round(budget_less/10000,2)==precise_round(budget_greater/10000,2):
                 budget_is_same = True
             if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
                 budget_is_same = True
@@ -837,7 +856,6 @@ def check_money(bidding_budget_less,bidding_budget_greater,
 
     if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
 
-
         price_less = float(win_bid_price_less)
         price_greater = float(win_bid_price_greater)
 
@@ -848,7 +866,7 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                 if (max(price_less,price_greater)/min(price_less,price_greater)>9999 and max(price_less,price_greater)/min(price_less,price_greater)<10001)\
                         or (max(win_bid_price_less_source,win_bid_price_greater_source)/min(win_bid_price_less_source,win_bid_price_greater_source)>9999 and max(win_bid_price_less_source,win_bid_price_greater_source)/min(win_bid_price_less_source,win_bid_price_greater_source)<10001):
                     price_is_same = True
-            if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
+            if price_less>10000 and price_greater>10000 and precise_round(price_less/10000,2)==precise_round(price_greater/10000,2):
                 price_is_same = True
             if price_less in moneys_greater or price_less in moneys_attachment_greater:
                 price_is_same = True
@@ -970,17 +988,21 @@ def check_approval(approval_less,approval_greater,b_log):
     return flag,0,0
 
 
-def check_codes(project_codes_less,project_codes_greater):
+def check_codes(project_codes_less,project_codes_greater,word_count_less={},word_count_greater={}):
     #check the similarity
     is_same = False
     is_sim = False
 
-
     for project_code_less in project_codes_less:
+        project_code_less = str(project_code_less).upper()
+        project_code_refine_less = "".join(re.findall("[\u4e00-\u9fa5a-zA-Z\d]+", project_code_less))
         for project_code_greater in project_codes_greater:
-            project_code_less = str(project_code_less).upper()
             project_code_greater = str(project_code_greater).upper()
+            project_code_refine_greater = "".join(re.findall("[\u4e00-\u9fa5a-zA-Z\d]+", project_code_greater))
             code_sim = getSimilarityOfString(project_code_less,project_code_greater)
+            # print('code_sim',code_sim,project_code_less,project_code_greater)
+            if project_code_refine_less == project_code_refine_greater:
+                is_same = True
             if project_code_less is not None and project_code_greater is not None:
                 if code_sim>0.6:
                     if str(project_code_less).find(str(project_code_greater))>=0 or str(project_code_greater).find(str(project_code_less))>=0:
@@ -990,6 +1012,28 @@ def check_codes(project_codes_less,project_codes_greater):
                 if project_code_less!=project_code_greater:
                     if code_sim>0.4 and len(project_code_less)==len(project_code_greater):
                         is_sim = True
+                        if word_count_less.get("附件",0)>20 or word_count_greater.get("附件",0)>20:# 有一篇公告包含附件内容
+                            # code相似且长度相等时计算编辑距离
+                            distance, differences = edit_distance_with_diff(project_code_less,project_code_greater)
+                            is_all_same = True
+                            if distance >= len(project_code_less)/2:
+                                is_all_same = False
+                            else:
+                                for diff in differences:
+                                    if diff[0] == '替换':
+                                        if (diff[1] in similar_char_dict and diff[2] in similar_char_dict.get(diff[1],[])) or \
+                                                (diff[2] in similar_char_dict and diff[1] in similar_char_dict.get(diff[2],[])):
+                                            pass
+                                        else:
+                                            is_all_same = False
+                                            break
+                                    else:
+                                        is_all_same = False
+                                        break
+                            # 编辑字符是否都为OCR易识别错的相似字符,例:"0-O", "1-IL"
+                            if is_all_same:
+                                is_same = True
+
     if is_same:
         return True
     if is_sim:
@@ -999,6 +1043,14 @@ def check_codes(project_codes_less,project_codes_greater):
 def check_demand():
     return True
 
+similar_char_dict = {  # character -> characters check_codes accepts as an OCR look-alike substitution (codes are upper-cased before lookup)
+    "0":['O','Q'],
+    "O":["0",'Q'],
+    'Q':['0','O'],
+    "1":["L","I"],  # NOTE(review): "L" and "I" each map only to "1", so an L<->I swap is not accepted - confirm this is intended
+    "L":["1"],
+    "I":["1"]
+}
 def edit_distance_with_diff(s1, s2):
     m, n = len(s1), len(s2)
     # 创建动态规划表
@@ -1220,6 +1272,7 @@ def product_dump(list_product):
             _product_l_l.append(_l)
     return _product_l_l
 def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
+    # print('product_less',product_less,'product_greater',product_greater)
     if getLength(product_less)>0 and getLength(product_greater)>0:
 
         _product_l = product_less.split(split_char)
@@ -1250,8 +1303,11 @@ def check_product(product_less,product_greater,split_char=",",doctitle_refine_le
             _set_union = set_product_l_in_title & set_product_g_in_title
 
             # 不同的部门若有重叠则通过
-            diff_l = set_product_l_in_title-_set_union
-            diff_g = set_product_g_in_title-_set_union
+            # Keep a product as a difference only when the other side's title (_title_g / _title_l,
+            # assumed to be the title strings built earlier in this function) does not contain it;
+            # str.find returns -1 on a miss, so the test must be "< 0", not "not ...find(...)".
+            diff_l = {p for p in set_product_l_in_title - _set_union if _title_g.find(p) < 0}
+            diff_g = {p for p in set_product_g_in_title - _set_union if _title_l.find(p) < 0}
 
             diff_dump = product_dump(list(diff_l.union(diff_g)))
             if not(len(diff_dump)<=len(diff_l) or len(diff_dump)<=len(diff_g)):
@@ -1273,6 +1329,7 @@ def check_product(product_less,product_greater,split_char=",",doctitle_refine_le
                 if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0:
                     same_count += 1
                     break
+        # print('check product',same_count,len(_product_l))
         if same_count/len(_product_l)>=0.5:
             return True
         return False
@@ -1287,7 +1344,7 @@ def check_package(package_less,package_greater,split_char=","):
         for _l in _product_l:
             for _g in _product_g:
                 if abs(len(_l)-len(_g))<=2:
-                    save_level = True
+                    same_level = True
                 if _l==_g:
                     return True
         if same_level:
@@ -1340,9 +1397,9 @@ def check_products(products_less,products_greater):
         products_greater = json.loads(products_greater) if products_greater else []
     # if len(products_less)>0 and len(products_greater)>0:
     if len(products_less)>=4 and len(products_greater)>=4:
-        products_less_list = [p['product'] for p in products_less]
+        products_less_list = [p['product'].upper() for p in products_less]
         products_less_list = product_dump(products_less_list)
-        products_greater_list = [p['product'] for p in products_greater]
+        products_greater_list = [p['product'].upper() for p in products_greater]
         products_greater_list = product_dump(products_greater_list)
         if len(products_less_list)>len(products_greater_list):
             a = products_greater_list
@@ -1362,13 +1419,35 @@ def check_products(products_less,products_greater):
 
     return True
 
+def get_login_web_set():
+    """Load the site names listed in login_weblist.txt (next to this module) into a set.
+
+    Blank lines are skipped; a missing file yields an empty set; errors are printed, not raised.
+    """
+    weblist_path = os.path.join(os.path.dirname(__file__),"login_weblist.txt")
+    names = set()
+    try:
+        if os.path.exists(weblist_path):
+            with open(weblist_path,"r",encoding="utf8") as f:
+                # One entry per line; surrounding whitespace is stripped before storing.
+                for raw_line in f:
+                    name = raw_line.strip()
+                    if name:
+                        names.add(name)
+    except Exception:
+        # Best effort: report the problem and return whatever was read so far.
+        traceback.print_exc()
+    return names
+set_login_web = get_login_web_set()
+
 
 def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,hard_level=1):
 
     docid_less = document_less["docid"]
     docchannel_less = document_less.get("docchannel",0)
     page_time_less = document_less.get("page_time")
-    doctitle_refine_less = document_less["doctitle_refine"]
+    doctitle_refine_less = document_less.get("doctitle_refine","").upper()
+    doctitle_less = document_less.get("doctitle","").upper()
     project_codes_less = document_less.get("project_codes")
     nlp_enterprise_less = document_less["nlp_enterprise"]
     tenderee_less = document_less.get("tenderee","")
@@ -1376,10 +1455,10 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     win_tenderer_less = document_less["win_tenderer"]
     bidding_budget_less = document_less["bidding_budget"]
     win_bid_price_less = document_less["win_bid_price"]
-    product_less = document_less.get("product")
-    package_less = document_less.get("package")
+    product_less = (document_less.get("product") or "").upper()  # .get() may return None, which has no .upper()
+    package_less = (document_less.get("package") or "").upper()
     json_time_less = document_less.get("dict_time")
-    project_name_less = document_less.get("project_name")
+    project_name_less = (document_less.get("project_name") or "").upper()
     fingerprint_less = document_less.get("fingerprint")
     extract_count_less = document_less.get("extract_count",0)
     web_source_no_less = document_less.get("web_source_no")
@@ -1399,12 +1478,13 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     products_original_less = document_less.get("products_original",[])
     change_content_less = document_less.get("change_content","")
     change_time_less = document_less.get("change_time","")
-
+    word_count_less = document_less.get("word_count",{})
 
     docid_greater = document_greater["docid"]
     page_time_greater = document_greater["page_time"]
     docchannel_greater = document_greater.get("docchannel",0)
-    doctitle_refine_greater = document_greater.get("doctitle_refine","")
+    doctitle_refine_greater = document_greater.get("doctitle_refine","").upper()
+    doctitle_greater = document_greater.get("doctitle","").upper()
     project_codes_greater = document_greater["project_codes"]
     nlp_enterprise_greater = document_greater["nlp_enterprise"]
     tenderee_greater = document_greater.get("tenderee","")
@@ -1412,10 +1492,10 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     win_tenderer_greater = document_greater["win_tenderer"]
     bidding_budget_greater = document_greater["bidding_budget"]
     win_bid_price_greater = document_greater["win_bid_price"]
-    product_greater = document_greater.get("product")
-    package_greater = document_greater.get("package")
+    product_greater = (document_greater.get("product") or "").upper()  # .get() may return None, which has no .upper()
+    package_greater = (document_greater.get("package") or "").upper()
     json_time_greater = document_greater["dict_time"]
-    project_name_greater = document_greater.get("project_name")
+    project_name_greater = (document_greater.get("project_name") or "").upper()
     fingerprint_greater = document_greater.get("fingerprint")
     extract_count_greater = document_greater.get("extract_count",0)
     web_source_no_greater = document_greater.get("web_source_no")
@@ -1429,6 +1509,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     products_original_greater = document_greater.get("products_original", [])
     change_content_greater = document_greater.get("change_content", "")
     change_time_greater = document_greater.get("change_time", "")
+    word_count_greater = document_greater.get("word_count", {})
 
     moneys_greater = document_greater.get("moneys")
     moneys_attachment_greater = document_greater.get("moneys_attachment")
@@ -1438,6 +1519,20 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     approval_greater = document_greater.get("approval",[])
     source_type_greater = document_greater.get("source_type")
 
+    if isinstance(project_codes_less,str):
+        project_codes_less = [a.upper() for a in project_codes_less.split(",") if a!=""]
+    elif isinstance(project_codes_less,list):
+        project_codes_less = [a.upper() for a in project_codes_less if a!=""]
+    elif project_codes_less is None:
+        project_codes_less = []
+
+    if isinstance(project_codes_greater,str):
+        project_codes_greater = [a.upper() for a in project_codes_greater.split(",") if a!=""]
+    elif isinstance(project_codes_greater,list):
+        project_codes_greater = [a.upper() for a in project_codes_greater if a!=""]
+    elif project_codes_greater is None:
+        project_codes_greater = []
+
     # print('docid:',docid_less,docid_greater)
     if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
         # print('fingerprint same')
@@ -1564,15 +1659,6 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         if b_log:
             logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
         return 0
-    if isinstance(project_codes_less,str):
-        project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
-    elif project_codes_less is None:
-        project_codes_less = []
-
-    if isinstance(project_codes_greater,str):
-        project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
-    elif project_codes_greater is None:
-        project_codes_greater = []
 
     # 采购意向去重
     if docchannel_greater==docchannel_less==114:
@@ -1593,21 +1679,21 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
                 demand_info_greater = _demand_info_greater
             for item1 in demand_info_less:
                 tmp_project_name_less = re.sub("\s","",item1.get("project_name","").strip())
-                tmp_project_name_less = tmp_project_name_less.replace("(","(").replace(")",")")
+                tmp_project_name_less = tmp_project_name_less.replace("(","(").replace(")",")").upper()
                 tmp_budget_less = float(item1.get("budget",0) if item1.get("budget",0) else 0)
                 tmp_order_begin_less = item1.get("order_begin","")
                 tmp_order_end_less = item1.get("order_end", "")
                 get_same = False
                 for item2 in demand_info_greater:
                     tmp_project_name_greater = re.sub("\s", "", item2.get("project_name", "").strip())
-                    tmp_project_name_greater = tmp_project_name_greater.replace("(", "(").replace(")", ")")
+                    tmp_project_name_greater = tmp_project_name_greater.replace("(", "(").replace(")", ")").upper()
                     tmp_budget_greater = float(item2.get("budget",0) if item2.get("budget",0) else 0)
                     tmp_order_begin_greater = item2.get("order_begin", "")
                     tmp_order_end_greater = item2.get("order_end", "")
                     # 项目名称相同或包含关系,预算金额对比,预计采购时间开始或结束相等(只对比到月份)
                     if (tmp_project_name_less==tmp_project_name_greater or
                         (len(tmp_project_name_less)>0 and len(tmp_project_name_greater)>0 and (tmp_project_name_less.find(tmp_project_name_greater)>=0 or tmp_project_name_greater.find(tmp_project_name_less)>=0))) and \
-                            check_money(tmp_budget_less,tmp_budget_greater,0,0,[],[],[],[]) and \
+                            (check_money(tmp_budget_less,tmp_budget_greater,0,0,[],[],[],[]) or (tmp_budget_less>=100000 and tmp_budget_greater>=100000 and precise_round(tmp_budget_less/10000,0)==precise_round(tmp_budget_greater/10000,0) and (tmp_budget_less%10000==0 or tmp_budget_greater%10000==0))) and \
                             (tmp_order_begin_less[:7]==tmp_order_begin_greater[:7] or tmp_order_end_less[:7]==tmp_order_end_greater[:7]):
                         get_same = True
                         break
@@ -1677,14 +1763,17 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         base_prob = 0.6
     _prob = base_prob*same_count/all_count
     # print('base_prob',base_prob,'min_counts',min_counts,'same_count',same_count,'all_count',all_count)
-    if min(extract_count_less,extract_count_greater)<=3 and max(extract_count_less,extract_count_greater)>=5:
+    # web_source_name在set_login_web的站源表中时,extract_count加回3再比较
+    if min(extract_count_less if web_source_name_less not in set_login_web else extract_count_less+3,extract_count_greater if web_source_name_greater not in set_login_web  else extract_count_greater+3)<=3 and \
+            max(extract_count_less if web_source_name_less not in set_login_web else extract_count_less+3,extract_count_greater if web_source_name_greater not in set_login_web  else extract_count_greater+3)>=5:
         if _prob<0.1 and str(page_time_less)==str(page_time_greater):
             if str(docchannel_less) not in ("302","303"):
                 _prob = 0.15
         if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
-            if b_log:
-                logging.info("province not same:%s-%s"%(province_less,province_greater))
-            return 0
+            if doctitle_refine_less!=doctitle_refine_greater and len(set(project_codes_less) & set(project_codes_greater))==0:
+                if b_log:
+                    logging.info("%d-%d,province not same:%s-%s"%(docid_less,docid_greater,province_less,province_greater))
+                return 0
     if _prob<0.1:
         if b_log:
             logging.info("prob too low:%f"%(_prob))
@@ -1707,7 +1796,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         check_result["doctitle"] = 2
 
     #added check
-    if not check_codes(project_codes_less,project_codes_greater):
+    if not check_codes(project_codes_less,project_codes_greater,word_count_less,word_count_greater):
         check_result["code"] = 0
         check_result["pass"] = 0
         if b_log:
@@ -1719,7 +1808,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
             check_result["code"] = 1
 
 
-    if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
+    # if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
+    if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_less,doctitle_refine_greater=doctitle_greater):
         check_result["product"] = 0
         check_result["pass"] = 0
         if b_log:
@@ -1822,6 +1912,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
 
         if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
             return _prob
+        elif check_result.get("entity",1)==2 and check_result.get("code",1)>=1 and check_result.get("doctitle",2)==2 and check_result.get("package",2)==2 and check_result.get("money",0)==2:
+            return _prob
         else:
             return 0
     return _prob

+ 115 - 0
BaseDataMaintenance/maxcompute/login_weblist.txt

@@ -0,0 +1,115 @@
+金采网
+隆道-大企业采购平台
+海尔招标网
+中国船舶采购管理电子商务平台
+睿采网
+江阴热电有限公司采购电子平台
+深圳市国深房地产开发有限公司招采平台
+优采云
+益和电气电子采购平台
+亚泰电子招标采购平台
+浙江大有集团招投标采购平台
+山西晋龙集团
+天能重工
+北京市政路桥集团(股份)有限公司招标采购平台
+物集港商城
+中国中化控股有限责任公司电子商务平台
+华招医药网
+河源市万信招标代理有限公司
+欧贝 - 工业品供应链生态平台
+中金岭南阳光采购平台
+东方希望数字化采购平台
+宁波市北仑区大碶博平小学
+西电集团电子采购平台
+聚拍网
+智采招标代理(天津)有限公司
+智采医用耗材信息网
+中国兵工物资集团有限公司电子商务平台
+四川省工程建设项目审批管理系统
+首采云数字化采购平台
+鞍钢现货交易平台
+山钢股份莱芜分公司电子采购平台
+中国电建设备物资集中采购平台
+优质采云采购平台
+特乐意建材电商交易平台
+织巢鸟
+河北普阳钢铁集团网上招标管理平台
+河钢供应链管理平台
+南方水泥招采平台
+航天电子采购平台
+供应链数字化管理平台
+四川省投资项目在线审批监管平台
+盈峰环境
+军队自采平台
+陕西鼓风机(集团)有限公司电子采购系统
+易采平台
+得力集团
+浑源县政府采购电子卖场
+铁建商城
+中国工程物理研究院招投标信息网
+山西省招标投标协会
+浙江保利置业阳光招采平台
+旺采网
+广东省教育系统采购竞价平台
+数字云采
+龙成集团电子招标平台
+友云采
+邯郸市邯钢附属企业公司
+中国硫酸网
+云南江东房地产集团有限公司
+珍药采购招标信息化管理平台
+大地阳光采购平台
+江苏省中医院投标平台
+福建省船舶工业集团公司采购平台
+渤化易采平台
+八戒公采
+云端采购网
+中国航发网上商城
+晋能控股电力集团
+四川玄同工程项目管理有限责任公司
+物联宝
+畅采通招标采购网
+广西保利置业阳光招采平台
+智慧工厂在线
+中复神鹰碳纤维有限责任公司
+云采网
+中国兵器废旧物资处置平台
+中电环保科技公司电子采购平台
+中南锦时招采平台
+浙江中医药大学
+丝路汇采
+中铁鲁班商务网
+津水云采
+中国电子科技集团有限公司电子采购平台
+中国巨石股份有限公司
+龙蟒集团
+中集集装箱电子采购协同平台
+城轨采购网
+中国铁路招标网
+中电环保电子采购平台电子采购平台
+工程众创云平台
+中国石油电子招标投标网
+浪潮爱购云
+福建建工分包与劳务管理平台
+金正大集团电子采购平台
+冠洲集团电子采购系统
+浙江云采购中心平台
+华新阳光采购平台
+苏州市宇杰工程技术服务咨询有限公司
+中储粮服务网
+中国华电集团电子商务平台
+招商局集团电子招标采购交易网
+中国制造网采购平台
+钜商网
+中车购
+中建鸿腾招标与采购平台
+U材U建平台
+山西华鑫电子采购平台
+深圳保利阳光招采平台
+渤商网
+厦门航空采购平台
+山东农商行集中采购管理系统
+湖北保利投资阳光招采平台
+山东省采购与招标网
+政采云
+中国招标投标公共服务平台