Browse Source

公告去重标题对比规则优化

znj 5 days ago
parent
commit
80b4db5df5
1 changed files with 18 additions and 16 deletions
  1. 18 16
      BaseDataMaintenance/maxcompute/documentDumplicate.py

+ 18 - 16
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -1005,6 +1005,8 @@ num_pattern = re.compile("^\d+(?:\.\d+)?$")
 num1_pattern = re.compile("[一二三四五六七八九十A-Za-z]+")
 location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
 building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九十1-9]+[)\)]?[次批]"
+# Key content enclosed in 【】 brackets within the title
+brackets_pattern = "【([^【】]+?)】" # |{([^{}]+?)}
 rebid_pattern = "再次|重新招标|[一二三四五六七八九十]+次"
 date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
 def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[],page_time_less="",page_time_greater=""):
@@ -1068,7 +1070,7 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
                 return False
 
     #check location and keywords
-    for _p in [num1_pattern,building_pattern]:
+    for _p in [num1_pattern,building_pattern,brackets_pattern]:
         num_all_l = re.findall(_p,doctitle_refind_less)
         num_all_g = re.findall(_p,doctitle_refind_greater)
         set_num_l = set(num_all_l)
@@ -1404,21 +1406,21 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         _md5 = _l.get("fileMd5")
         if _md5 is not None:
             set_md5_greater.add(_md5)
-    if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
-        one_in_attach = False
-        dict_enterprise_less = json.loads(nlp_enterprise_less)
-        dict_enterprise_greater = json.loads(nlp_enterprise_greater)
-        indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
-        notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
-        indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
-        notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
-        if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
-            one_in_attach = True
-        if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
-            one_in_attach = True
-        if one_in_attach:
-            if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
-                return 1
+    # if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
+    #     one_in_attach = False
+    #     dict_enterprise_less = json.loads(nlp_enterprise_less)
+    #     dict_enterprise_greater = json.loads(nlp_enterprise_greater)
+    #     indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
+    #     notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
+    #     indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
+    #     notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
+    #     if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
+    #         one_in_attach = True
+    #     if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
+    #         one_in_attach = True
+    #     if one_in_attach:
+    #         if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
+    #             return 1
 
     #Same web source: if both documents have attachments but the attachments do not overlap, do not deduplicate
     if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0: