2 months ago · 80b4db5df5
--- a/BaseDataMaintenance/maxcompute/documentDumplicate.py
+++ b/BaseDataMaintenance/maxcompute/documentDumplicate.py
@@ -1005,6 +1005,8 @@ num_pattern = re.compile("^\d+(?:\.\d+)?$")
 
				 num1_pattern = re.compile("[一二三四五六七八九十A-Za-z]+")
			
 
				 location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
			
 
				 building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\(（]?[一二三四五六七八九十1-9]+[）\)]?[次批]"
			
 
				+# Key content in the title that is enclosed in brackets (【…】)
			
 
				+brackets_pattern = "【([^【】]+?)】" # |｛([^｛｝]+?)｝
			
 
				 rebid_pattern = "再次|重新招标|[一二三四五六七八九十]+次"
			
 
				 date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
			
 
				 def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[],page_time_less="",page_time_greater=""):
			
@@ -1068,7 +1070,7 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
 
				                 return False
			
 
				 
			
 
				     #check location and keywords
			
 
				-    for _p in [num1_pattern,building_pattern]:
			
 
				+    for _p in [num1_pattern,building_pattern,brackets_pattern]:
			
 
				         num_all_l = re.findall(_p,doctitle_refind_less)
			
 
				         num_all_g = re.findall(_p,doctitle_refind_greater)
			
 
				         set_num_l = set(num_all_l)
			
@@ -1404,21 +1406,21 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
 
				         _md5 = _l.get("fileMd5")
			
 
				         if _md5 is not None:
			
 
				             set_md5_greater.add(_md5)
			
 
				-    if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
			
 
				-        one_in_attach = False
			
 
				-        dict_enterprise_less = json.loads(nlp_enterprise_less)
			
 
				-        dict_enterprise_greater = json.loads(nlp_enterprise_greater)
			
 
				-        indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
			
 
				-        notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
			
 
				-        indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
			
 
				-        notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
			
 
				-        if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
			
 
				-            one_in_attach = True
			
 
				-        if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
			
 
				-            one_in_attach = True
			
 
				-        if one_in_attach:
			
 
				-            if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
			
 
				-                return 1
			
 
				+    # if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
			
 
				+    #     one_in_attach = False
			
 
				+    #     dict_enterprise_less = json.loads(nlp_enterprise_less)
			
 
				+    #     dict_enterprise_greater = json.loads(nlp_enterprise_greater)
			
 
				+    #     indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
			
 
				+    #     notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
			
 
				+    #     indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
			
 
				+    #     notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
			
 
				+    #     if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
			
 
				+    #         one_in_attach = True
			
 
				+    #     if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
			
 
				+    #         one_in_attach = True
			
 
				+    #     if one_in_attach:
			
 
				+    #         if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
			
 
				+    #             return 1
			
 
				 
			
 
				     #同一个站源，都有附件但附件没有重叠则不去重
			
 
				     if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0: