|
@@ -3,6 +3,8 @@ from odps.udf import annotate
|
|
|
from odps.udf import BaseUDTF
|
|
|
from odps.udf import BaseUDAF
|
|
|
import re
|
|
|
+import os
|
|
|
+import traceback
|
|
|
|
|
|
@annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
|
|
|
class f_decode_extract(BaseUDTF):
|
|
@@ -702,6 +704,16 @@ class f_dumplicate_groupPairs(BaseUDAF):
|
|
|
|
|
|
return json.dumps(list_dict)
|
|
|
|
|
|
+from decimal import Decimal
|
|
|
def precise_round(number, decimals=0):
    """Round ``number`` half-up to ``decimals`` places and return a float.

    Takes the same arguments as the built-in ``round``. The value is
    converted through its ``str()`` form into a ``Decimal`` and quantized
    with ROUND_HALF_UP, so ties are rounded away from zero (``2.5 -> 3.0``,
    ``2.675 -> 2.68``) rather than to the nearest even digit.

    Args:
        number: Value to round; anything whose ``str()`` is a valid
            ``Decimal`` literal (int, float, numeric string, Decimal).
        decimals: Number of digits to keep after the decimal point.
            Negative values round to tens, hundreds, and so on.

    Returns:
        float: The rounded value. NaN and infinities are returned as the
        corresponding float without being quantized.
    """
    # Local import: only ``Decimal`` is imported at module level.
    from decimal import Context, ROUND_HALF_UP

    value = Decimal(str(number))
    if not value.is_finite():
        # quantize() raises InvalidOperation for infinite operands.
        return float(value)

    # Same quantum as Decimal("1e%d" % -decimals), built without formatting.
    exponent = int(-decimals)
    quantum = Decimal((0, (1,), exponent))

    # quantize() also raises InvalidOperation when the result needs more
    # digits than the context precision (28 by default), so widen the
    # precision for very large inputs instead of failing on them.
    digits_needed = value.adjusted() - exponent + 2
    ctx = Context(prec=max(28, digits_needed), rounding=ROUND_HALF_UP)
    return float(value.quantize(quantum, context=ctx))
|
|
|
+
|
|
|
def check_columns(tenderee_less,tenderee_greater,
|
|
|
agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
|
|
|
win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
|
|
@@ -783,7 +795,7 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
win_bid_price_less,win_bid_price_greater,
|
|
|
moneys_less,moneys_greater,
|
|
|
moneys_attachment_less,moneys_attachment_greater):
|
|
|
- # print('bidding_budget_less',bidding_budget_less,'bidding_budget_greater',bidding_budget_greater)
|
|
|
+ # print('bidding_budget_less',bidding_budget_less,'bidding_budget_greater',bidding_budget_greater,'win_bid_price_less',win_bid_price_less,'win_bid_price_greater',win_bid_price_greater)
|
|
|
bidding_budget_less_source = bidding_budget_less
|
|
|
bidding_budget_greater_source = bidding_budget_greater
|
|
|
win_bid_price_less_source = win_bid_price_less
|
|
@@ -791,21 +803,29 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
#只判断最高前六位
|
|
|
if getLength(bidding_budget_less)>0:
|
|
|
bidding_budget_less_source = float(bidding_budget_less_source)
|
|
|
- bidding_budget_less = round(float(bidding_budget_less))
|
|
|
- bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
|
|
|
+ # bidding_budget_less = round(float(bidding_budget_less))
|
|
|
+ bidding_budget_less = int(precise_round(float(bidding_budget_less)))
|
|
|
+ # bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
|
|
|
+ bidding_budget_less = str(precise_round(bidding_budget_less,6-len(str(bidding_budget_less))))
|
|
|
if getLength(bidding_budget_greater)>0:
|
|
|
bidding_budget_greater_source = float(bidding_budget_greater_source)
|
|
|
- bidding_budget_greater = round(float(bidding_budget_greater))
|
|
|
- bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
|
|
|
+ # bidding_budget_greater = round(float(bidding_budget_greater))
|
|
|
+ bidding_budget_greater = int(precise_round(float(bidding_budget_greater)))
|
|
|
+ # bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
|
|
|
+ bidding_budget_greater = str(precise_round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
|
|
|
|
|
|
if getLength(win_bid_price_less)>0:
|
|
|
win_bid_price_less_source = float(win_bid_price_less_source)
|
|
|
- win_bid_price_less = round(float(win_bid_price_less))
|
|
|
- win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
|
|
|
+ # win_bid_price_less = round(float(win_bid_price_less))
|
|
|
+ win_bid_price_less = int(precise_round(float(win_bid_price_less)))
|
|
|
+ # win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
|
|
|
+ win_bid_price_less = str(precise_round(win_bid_price_less,6-len(str(win_bid_price_less))))
|
|
|
if getLength(win_bid_price_greater)>0:
|
|
|
win_bid_price_greater_source = float(win_bid_price_greater_source)
|
|
|
- win_bid_price_greater = round(float(win_bid_price_greater))
|
|
|
- win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
|
|
|
+ # win_bid_price_greater = round(float(win_bid_price_greater))
|
|
|
+ win_bid_price_greater = int(precise_round(float(win_bid_price_greater)))
|
|
|
+ # win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
|
|
|
+ win_bid_price_greater = str(precise_round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
|
|
|
|
|
|
#check saming
|
|
|
budget_is_same = ""
|
|
@@ -814,7 +834,6 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
budget_less = float(bidding_budget_less)
|
|
|
budget_greater = float(bidding_budget_greater)
|
|
|
|
|
|
-
|
|
|
if budget_less!=budget_greater:
|
|
|
if min(budget_less,budget_greater)>0:
|
|
|
# if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
|
|
@@ -822,7 +841,7 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
if (max(budget_less,budget_greater)/min(budget_less,budget_greater)>9999 and max(budget_less,budget_greater)/min(budget_less,budget_greater)<10001)\
|
|
|
or (max(bidding_budget_less_source,bidding_budget_greater_source)/min(bidding_budget_less_source,bidding_budget_greater_source)>9999 and max(bidding_budget_less_source,bidding_budget_greater_source)/min(bidding_budget_less_source,bidding_budget_greater_source)<10001):
|
|
|
budget_is_same = True
|
|
|
- if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
|
|
|
+ if budget_less>10000 and budget_greater>10000 and precise_round(budget_less/10000,2)==precise_round(budget_greater/10000,2):
|
|
|
budget_is_same = True
|
|
|
if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
|
|
|
budget_is_same = True
|
|
@@ -837,7 +856,6 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
|
|
|
if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
|
|
|
|
|
|
-
|
|
|
price_less = float(win_bid_price_less)
|
|
|
price_greater = float(win_bid_price_greater)
|
|
|
|
|
@@ -848,7 +866,7 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
if (max(price_less,price_greater)/min(price_less,price_greater)>9999 and max(price_less,price_greater)/min(price_less,price_greater)<10001)\
|
|
|
or (max(win_bid_price_less_source,win_bid_price_greater_source)/min(win_bid_price_less_source,win_bid_price_greater_source)>9999 and max(win_bid_price_less_source,win_bid_price_greater_source)/min(win_bid_price_less_source,win_bid_price_greater_source)<10001):
|
|
|
price_is_same = True
|
|
|
- if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
|
|
|
+ if price_less>10000 and price_greater>10000 and precise_round(price_less/10000,2)==precise_round(price_greater/10000,2):
|
|
|
price_is_same = True
|
|
|
if price_less in moneys_greater or price_less in moneys_attachment_greater:
|
|
|
price_is_same = True
|
|
@@ -970,17 +988,21 @@ def check_approval(approval_less,approval_greater,b_log):
|
|
|
return flag,0,0
|
|
|
|
|
|
|
|
|
-def check_codes(project_codes_less,project_codes_greater):
|
|
|
+def check_codes(project_codes_less,project_codes_greater,word_count_less={},word_count_greater={}):
|
|
|
#check the similarity
|
|
|
is_same = False
|
|
|
is_sim = False
|
|
|
|
|
|
-
|
|
|
for project_code_less in project_codes_less:
|
|
|
+ project_code_less = str(project_code_less).upper()
|
|
|
+ project_code_refine_less = "".join(re.findall("[\u4e00-\u9fa5a-zA-Z\d]+", project_code_less))
|
|
|
for project_code_greater in project_codes_greater:
|
|
|
- project_code_less = str(project_code_less).upper()
|
|
|
project_code_greater = str(project_code_greater).upper()
|
|
|
+ project_code_refine_greater = "".join(re.findall("[\u4e00-\u9fa5a-zA-Z\d]+", project_code_greater))
|
|
|
code_sim = getSimilarityOfString(project_code_less,project_code_greater)
|
|
|
+ # print('code_sim',code_sim,project_code_less,project_code_greater)
|
|
|
+ if project_code_refine_less == project_code_refine_greater:
|
|
|
+ is_same = True
|
|
|
if project_code_less is not None and project_code_greater is not None:
|
|
|
if code_sim>0.6:
|
|
|
if str(project_code_less).find(str(project_code_greater))>=0 or str(project_code_greater).find(str(project_code_less))>=0:
|
|
@@ -990,6 +1012,28 @@ def check_codes(project_codes_less,project_codes_greater):
|
|
|
if project_code_less!=project_code_greater:
|
|
|
if code_sim>0.4 and len(project_code_less)==len(project_code_greater):
|
|
|
is_sim = True
|
|
|
+ if word_count_less.get("附件",0)>20 or word_count_greater.get("附件",0)>20:# 有一篇公告包含附件内容
|
|
|
+ # code相似且长度相等时计算编辑距离
|
|
|
+ distance, differences = edit_distance_with_diff(project_code_less,project_code_greater)
|
|
|
+ is_all_same = True
|
|
|
+ if distance >= len(project_code_less)/2:
|
|
|
+ is_all_same = False
|
|
|
+ else:
|
|
|
+ for diff in differences:
|
|
|
+ if diff[0] == '替换':
|
|
|
+ if (diff[1] in similar_char_dict and diff[2] in similar_char_dict.get(diff[1],[])) or \
|
|
|
+ (diff[2] in similar_char_dict and diff[1] in similar_char_dict.get(diff[2],[])):
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ is_all_same = False
|
|
|
+ break
|
|
|
+ else:
|
|
|
+ is_all_same = False
|
|
|
+ break
|
|
|
+ # 编辑字符是否都为OCR易识别错的相似字符,例:"0-O", "1-IL"
|
|
|
+ if is_all_same:
|
|
|
+ is_same = True
|
|
|
+
|
|
|
if is_same:
|
|
|
return True
|
|
|
if is_sim:
|
|
@@ -999,6 +1043,14 @@ def check_codes(project_codes_less,project_codes_greater):
|
|
|
def check_demand():
    """Return True unconditionally; takes no arguments."""
    passed = True
    return passed
|
|
|
|
|
|
# Look-alike characters used when comparing project codes: each key maps to
# the characters accepted as a substitute for it (e.g. "0" vs "O", "1" vs
# "I"/"L"). Note that "L" and "I" are each paired with "1" only, not with
# one another.
similar_char_dict = dict([
    ("0", ['O', 'Q']),
    ("O", ["0", 'Q']),
    ('Q', ['0', 'O']),
    ("1", ["L", "I"]),
    ("L", ["1"]),
    ("I", ["1"]),
])
|
|
|
def edit_distance_with_diff(s1, s2):
|
|
|
m, n = len(s1), len(s2)
|
|
|
# 创建动态规划表
|
|
@@ -1220,6 +1272,7 @@ def product_dump(list_product):
|
|
|
_product_l_l.append(_l)
|
|
|
return _product_l_l
|
|
|
def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
|
|
|
+ # print('product_less',product_less,'product_greater',product_greater)
|
|
|
if getLength(product_less)>0 and getLength(product_greater)>0:
|
|
|
|
|
|
_product_l = product_less.split(split_char)
|
|
@@ -1250,8 +1303,11 @@ def check_product(product_less,product_greater,split_char=",",doctitle_refine_le
|
|
|
_set_union = set_product_l_in_title & set_product_g_in_title
|
|
|
|
|
|
# 不同的部门若有重叠则通过
|
|
|
- diff_l = set_product_l_in_title-_set_union
|
|
|
- diff_g = set_product_g_in_title-_set_union
|
|
|
+ # diff_l = set_product_l_in_title-_set_union
|
|
|
+ # diff_g = set_product_g_in_title-_set_union
|
|
|
+ # 排除因模型识别缺漏字导致结果不同的情况
|
|
|
+ diff_l = {p for p in set_product_l_in_title - _set_union if not _title_g.find(p)}
|
|
|
+ diff_g = {p for p in set_product_g_in_title - _set_union if not _title_l.find(p)}
|
|
|
|
|
|
diff_dump = product_dump(list(diff_l.union(diff_g)))
|
|
|
if not(len(diff_dump)<=len(diff_l) or len(diff_dump)<=len(diff_g)):
|
|
@@ -1273,6 +1329,7 @@ def check_product(product_less,product_greater,split_char=",",doctitle_refine_le
|
|
|
if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0:
|
|
|
same_count += 1
|
|
|
break
|
|
|
+ # print('check product',same_count,len(_product_l))
|
|
|
if same_count/len(_product_l)>=0.5:
|
|
|
return True
|
|
|
return False
|
|
@@ -1287,7 +1344,7 @@ def check_package(package_less,package_greater,split_char=","):
|
|
|
for _l in _product_l:
|
|
|
for _g in _product_g:
|
|
|
if abs(len(_l)-len(_g))<=2:
|
|
|
- save_level = True
|
|
|
+ same_level = True
|
|
|
if _l==_g:
|
|
|
return True
|
|
|
if same_level:
|
|
@@ -1340,9 +1397,9 @@ def check_products(products_less,products_greater):
|
|
|
products_greater = json.loads(products_greater) if products_greater else []
|
|
|
# if len(products_less)>0 and len(products_greater)>0:
|
|
|
if len(products_less)>=4 and len(products_greater)>=4:
|
|
|
- products_less_list = [p['product'] for p in products_less]
|
|
|
+ products_less_list = [p['product'].upper() for p in products_less]
|
|
|
products_less_list = product_dump(products_less_list)
|
|
|
- products_greater_list = [p['product'] for p in products_greater]
|
|
|
+ products_greater_list = [p['product'].upper() for p in products_greater]
|
|
|
products_greater_list = product_dump(products_greater_list)
|
|
|
if len(products_less_list)>len(products_greater_list):
|
|
|
a = products_greater_list
|
|
@@ -1362,13 +1419,35 @@ def check_products(products_less,products_greater):
|
|
|
|
|
|
return True
|
|
|
|
|
|
def get_login_web_set():
    """Load the entries listed in ``login_weblist.txt`` next to this module.

    The file is read as UTF-8, one entry per line; surrounding whitespace is
    stripped and blank lines are ignored.

    Returns:
        set: The distinct non-empty lines. Empty when the file does not
        exist. Loading is best-effort: any exception is printed with its
        traceback instead of being raised, and whatever was read before
        the failure is returned.
    """
    entries = set()
    try:
        # Resolved inside the try so that an undefined ``__file__`` (code
        # exec'd rather than imported) also degrades to an empty set
        # instead of failing the module import.
        path = os.path.join(os.path.dirname(__file__), "login_weblist.txt")
        if os.path.exists(path):
            with open(path, "r", encoding="utf8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        entries.add(line)
    except Exception:
        # Deliberately broad: a missing or unreadable list must not break
        # the de-duplication rules that consult it.
        traceback.print_exc()
    return entries
|
|
|
+set_login_web = get_login_web_set()
|
|
|
+
|
|
|
|
|
|
def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,hard_level=1):
|
|
|
|
|
|
docid_less = document_less["docid"]
|
|
|
docchannel_less = document_less.get("docchannel",0)
|
|
|
page_time_less = document_less.get("page_time")
|
|
|
- doctitle_refine_less = document_less["doctitle_refine"]
|
|
|
+ doctitle_refine_less = document_less.get("doctitle_refine","").upper()
|
|
|
+ doctitle_less = document_less.get("doctitle","").upper()
|
|
|
project_codes_less = document_less.get("project_codes")
|
|
|
nlp_enterprise_less = document_less["nlp_enterprise"]
|
|
|
tenderee_less = document_less.get("tenderee","")
|
|
@@ -1376,10 +1455,10 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
win_tenderer_less = document_less["win_tenderer"]
|
|
|
bidding_budget_less = document_less["bidding_budget"]
|
|
|
win_bid_price_less = document_less["win_bid_price"]
|
|
|
- product_less = document_less.get("product")
|
|
|
- package_less = document_less.get("package")
|
|
|
+ product_less = document_less.get("product").upper()
|
|
|
+ package_less = document_less.get("package").upper()
|
|
|
json_time_less = document_less.get("dict_time")
|
|
|
- project_name_less = document_less.get("project_name")
|
|
|
+ project_name_less = document_less.get("project_name").upper()
|
|
|
fingerprint_less = document_less.get("fingerprint")
|
|
|
extract_count_less = document_less.get("extract_count",0)
|
|
|
web_source_no_less = document_less.get("web_source_no")
|
|
@@ -1399,12 +1478,13 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
products_original_less = document_less.get("products_original",[])
|
|
|
change_content_less = document_less.get("change_content","")
|
|
|
change_time_less = document_less.get("change_time","")
|
|
|
-
|
|
|
+ word_count_less = document_less.get("word_count",{})
|
|
|
|
|
|
docid_greater = document_greater["docid"]
|
|
|
page_time_greater = document_greater["page_time"]
|
|
|
docchannel_greater = document_greater.get("docchannel",0)
|
|
|
- doctitle_refine_greater = document_greater.get("doctitle_refine","")
|
|
|
+ doctitle_refine_greater = document_greater.get("doctitle_refine","").upper()
|
|
|
+ doctitle_greater = document_greater.get("doctitle","").upper()
|
|
|
project_codes_greater = document_greater["project_codes"]
|
|
|
nlp_enterprise_greater = document_greater["nlp_enterprise"]
|
|
|
tenderee_greater = document_greater.get("tenderee","")
|
|
@@ -1412,10 +1492,10 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
win_tenderer_greater = document_greater["win_tenderer"]
|
|
|
bidding_budget_greater = document_greater["bidding_budget"]
|
|
|
win_bid_price_greater = document_greater["win_bid_price"]
|
|
|
- product_greater = document_greater.get("product")
|
|
|
- package_greater = document_greater.get("package")
|
|
|
+ product_greater = document_greater.get("product").upper()
|
|
|
+ package_greater = document_greater.get("package").upper()
|
|
|
json_time_greater = document_greater["dict_time"]
|
|
|
- project_name_greater = document_greater.get("project_name")
|
|
|
+ project_name_greater = document_greater.get("project_name").upper()
|
|
|
fingerprint_greater = document_greater.get("fingerprint")
|
|
|
extract_count_greater = document_greater.get("extract_count",0)
|
|
|
web_source_no_greater = document_greater.get("web_source_no")
|
|
@@ -1429,6 +1509,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
products_original_greater = document_greater.get("products_original", [])
|
|
|
change_content_greater = document_greater.get("change_content", "")
|
|
|
change_time_greater = document_greater.get("change_time", "")
|
|
|
+ word_count_greater = document_greater.get("word_count", {})
|
|
|
|
|
|
moneys_greater = document_greater.get("moneys")
|
|
|
moneys_attachment_greater = document_greater.get("moneys_attachment")
|
|
@@ -1438,6 +1519,20 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
approval_greater = document_greater.get("approval",[])
|
|
|
source_type_greater = document_greater.get("source_type")
|
|
|
|
|
|
+ if isinstance(project_codes_less,str):
|
|
|
+ project_codes_less = [a.upper() for a in project_codes_less.split(",") if a!=""]
|
|
|
+ elif isinstance(project_codes_less,list):
|
|
|
+ project_codes_less = [a.upper() for a in project_codes_less if a!=""]
|
|
|
+ elif project_codes_less is None:
|
|
|
+ project_codes_less = []
|
|
|
+
|
|
|
+ if isinstance(project_codes_greater,str):
|
|
|
+ project_codes_greater = [a.upper() for a in project_codes_greater.split(",") if a!=""]
|
|
|
+ elif isinstance(project_codes_greater,list):
|
|
|
+ project_codes_greater = [a.upper() for a in project_codes_greater if a!=""]
|
|
|
+ elif project_codes_greater is None:
|
|
|
+ project_codes_greater = []
|
|
|
+
|
|
|
# print('docid:',docid_less,docid_greater)
|
|
|
if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
|
|
|
# print('fingerprint same')
|
|
@@ -1564,15 +1659,6 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
if b_log:
|
|
|
logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
|
|
|
return 0
|
|
|
- if isinstance(project_codes_less,str):
|
|
|
- project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
|
|
|
- elif project_codes_less is None:
|
|
|
- project_codes_less = []
|
|
|
-
|
|
|
- if isinstance(project_codes_greater,str):
|
|
|
- project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
|
|
|
- elif project_codes_greater is None:
|
|
|
- project_codes_greater = []
|
|
|
|
|
|
# 采购意向去重
|
|
|
if docchannel_greater==docchannel_less==114:
|
|
@@ -1593,21 +1679,21 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
demand_info_greater = _demand_info_greater
|
|
|
for item1 in demand_info_less:
|
|
|
tmp_project_name_less = re.sub("\s","",item1.get("project_name","").strip())
|
|
|
- tmp_project_name_less = tmp_project_name_less.replace("(","(").replace(")",")")
|
|
|
+ tmp_project_name_less = tmp_project_name_less.replace("(","(").replace(")",")").upper()
|
|
|
tmp_budget_less = float(item1.get("budget",0) if item1.get("budget",0) else 0)
|
|
|
tmp_order_begin_less = item1.get("order_begin","")
|
|
|
tmp_order_end_less = item1.get("order_end", "")
|
|
|
get_same = False
|
|
|
for item2 in demand_info_greater:
|
|
|
tmp_project_name_greater = re.sub("\s", "", item2.get("project_name", "").strip())
|
|
|
- tmp_project_name_greater = tmp_project_name_greater.replace("(", "(").replace(")", ")")
|
|
|
+ tmp_project_name_greater = tmp_project_name_greater.replace("(", "(").replace(")", ")").upper()
|
|
|
tmp_budget_greater = float(item2.get("budget",0) if item2.get("budget",0) else 0)
|
|
|
tmp_order_begin_greater = item2.get("order_begin", "")
|
|
|
tmp_order_end_greater = item2.get("order_end", "")
|
|
|
# 项目名称相同或包含关系,预算金额对比,预计采购时间开始或结束相等(只对比到月份)
|
|
|
if (tmp_project_name_less==tmp_project_name_greater or
|
|
|
(len(tmp_project_name_less)>0 and len(tmp_project_name_greater)>0 and (tmp_project_name_less.find(tmp_project_name_greater)>=0 or tmp_project_name_greater.find(tmp_project_name_less)>=0))) and \
|
|
|
- check_money(tmp_budget_less,tmp_budget_greater,0,0,[],[],[],[]) and \
|
|
|
+ (check_money(tmp_budget_less,tmp_budget_greater,0,0,[],[],[],[]) or (tmp_budget_less>=100000 and tmp_budget_greater>=100000 and precise_round(tmp_budget_less/10000,0)==precise_round(tmp_budget_greater/10000,0 and (tmp_budget_less%10000==0 or tmp_budget_greater%10000==0)))) and \
|
|
|
(tmp_order_begin_less[:7]==tmp_order_begin_greater[:7] or tmp_order_end_less[:7]==tmp_order_end_greater[:7]):
|
|
|
get_same = True
|
|
|
break
|
|
@@ -1677,14 +1763,17 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
base_prob = 0.6
|
|
|
_prob = base_prob*same_count/all_count
|
|
|
# print('base_prob',base_prob,'min_counts',min_counts,'same_count',same_count,'all_count',all_count)
|
|
|
- if min(extract_count_less,extract_count_greater)<=3 and max(extract_count_less,extract_count_greater)>=5:
|
|
|
+ # web_source_name在set_login_web的站源表中时,extract_count加回3再比较
|
|
|
+ if min(extract_count_less if web_source_name_less not in set_login_web else extract_count_less+3,extract_count_greater if web_source_name_greater not in set_login_web else extract_count_greater+3)<=3 and \
|
|
|
+ max(extract_count_less if web_source_name_less not in set_login_web else extract_count_less+3,extract_count_greater if web_source_name_greater not in set_login_web else extract_count_greater+3)>=5:
|
|
|
if _prob<0.1 and str(page_time_less)==str(page_time_greater):
|
|
|
if str(docchannel_less) not in ("302","303"):
|
|
|
_prob = 0.15
|
|
|
if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
|
|
|
- if b_log:
|
|
|
- logging.info("province not same:%s-%s"%(province_less,province_greater))
|
|
|
- return 0
|
|
|
+ if doctitle_refine_less!=doctitle_refine_greater and len(set(project_codes_less) & set(project_codes_greater))==0:
|
|
|
+ if b_log:
|
|
|
+ logging.info("%d-%d,province not same:%s-%s"%(docid_less,docid_greater,province_less,province_greater))
|
|
|
+ return 0
|
|
|
if _prob<0.1:
|
|
|
if b_log:
|
|
|
logging.info("prob too low:%f"%(_prob))
|
|
@@ -1707,7 +1796,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
check_result["doctitle"] = 2
|
|
|
|
|
|
#added check
|
|
|
- if not check_codes(project_codes_less,project_codes_greater):
|
|
|
+ if not check_codes(project_codes_less,project_codes_greater,word_count_less,word_count_greater):
|
|
|
check_result["code"] = 0
|
|
|
check_result["pass"] = 0
|
|
|
if b_log:
|
|
@@ -1719,7 +1808,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
check_result["code"] = 1
|
|
|
|
|
|
|
|
|
- if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
+ # if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
+ if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_less,doctitle_refine_greater=doctitle_greater):
|
|
|
check_result["product"] = 0
|
|
|
check_result["pass"] = 0
|
|
|
if b_log:
|
|
@@ -1822,6 +1912,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
|
|
|
if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
|
|
|
return _prob
|
|
|
+ elif check_result.get("entity",1)==2 and check_result.get("code",1)>=1 and check_result.get("doctitle",2)==2 and check_result.get("package",2)==2 and check_result.get("money",0)==2:
|
|
|
+ return _prob
|
|
|
else:
|
|
|
return 0
|
|
|
return _prob
|