
Optimize the check_time rule; add an edit-distance rule for title comparison; add a change-content comparison rule for change announcements

znj 2 weeks ago
parent commit 9be0d1dfa7

+ 6 - 1
BaseDataMaintenance/maintenance/dataflow.py

@@ -2278,6 +2278,9 @@ class Dataflow_dumplicate(Dataflow):
         _dict["products_original"] = _extract.get("product_attrs_original", {}).get("data",[])
         _dict["products"] = _dict.get("products") if _dict.get("products") is not None else []
         _dict["products"] = _dict["products"] if isinstance(_dict["products"], list) else json.loads(_dict["products"])
+        # change content (change/clarification announcements)
+        _dict["change_content"] = _extract.get("change_content","")
+        _dict["change_time"] = _extract.get("change_time","")
 
         # special-purpose bond fields
         issue_details = _extract.get("debt_dic",{}).get("issue_details",[])
@@ -2291,7 +2294,8 @@ class Dataflow_dumplicate(Dataflow):
 
     def dumplicate_fianl_check(self,base_list,b_log=False):
         the_group = base_list
-        the_group.sort(key=lambda x:x["confidence"],reverse=True)
+        # the_group.sort(key=lambda x:x["confidence"],reverse=True)
+        the_group.sort(key=lambda x:(x["confidence"],-x['docid']),reverse=True)
 
         _index = 0
         base_fingerprint = "None"
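The new sort key breaks confidence ties by docid: with reverse=True applied to the tuple (confidence, -docid), higher confidence still comes first, and among equal confidences the smaller (older) docid takes the head position. A minimal sketch with hypothetical records:

the_group = [
    {"docid": 30, "confidence": 0.9},
    {"docid": 10, "confidence": 0.9},
    {"docid": 20, "confidence": 0.8},
]
the_group.sort(key=lambda x: (x["confidence"], -x['docid']), reverse=True)
print([d["docid"] for d in the_group])  # [10, 30, 20]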
@@ -2312,6 +2316,7 @@ class Dataflow_dumplicate(Dataflow):
                 _prob,day_dis = self.dumplicate_check(_dict1,_dict2,_dict1.get("min_counts",10),b_log=b_log)
                 if _prob<=0.1:
                     _pass = False
+                    # print('final check error',_dict1['docid'])
                     break
             log("checking index:%d %s %.2f"%(_i,str(_pass),_prob))
             _index = _i

+ 115 - 4
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -999,17 +999,68 @@ def check_codes(project_codes_less,project_codes_greater):
 def check_demand():
     return True
 
+def edit_distance_with_diff(s1, s2):
+    m, n = len(s1), len(s2)
+    # create the dynamic-programming (DP) table
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+    # initialize the first row and column of the DP table
+    for i in range(m + 1):
+        dp[i][0] = i
+    for j in range(n + 1):
+        dp[0][j] = j
+
+    # fill in the DP table
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if s1[i - 1] == s2[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1]
+            else:
+                dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
+
+    # backtrack to collect the differing parts
+    diff = []
+    i, j = m, n
+    while i > 0 and j > 0:
+        if s1[i - 1] == s2[j - 1]:
+            i -= 1
+            j -= 1
+        elif dp[i][j] == dp[i - 1][j] + 1:
+            diff.append(("删除",s1[i - 1]))
+            i -= 1
+        elif dp[i][j] == dp[i][j - 1] + 1:
+            diff.append(("插入",s2[j - 1]))
+            j -= 1
+        else:
+            diff.append(("替换",s1[i - 1],s2[j - 1]))
+            i -= 1
+            j -= 1
+
+    # handle the remaining parts
+    while i > 0:
+        diff.append(("删除",s1[i - 1]))
+        i -= 1
+    while j > 0:
+        diff.append(("插入",s2[j - 1]))
+        j -= 1
+
+    # return the edit distance and the list of differences
+    return dp[m][n], diff[::-1]  # reverse the diff, since backtracking walks from back to front
+
 package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))")  # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
 code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
 num_pattern = re.compile("^\d+(?:\.\d+)?$")
 num1_pattern = re.compile("[一二三四五六七八九十A-Za-z]+")
+num2_pattern = re.compile("[一二三四五六七八九十A-Za-z\d-]+")
+num3_pattern = re.compile("[一二三四五六七八九十A-Za-z\d-]+|.")
 location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
 building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九十1-9]+[)\)]?[次批]"
 # key content in the title that is wrapped in brackets
 brackets_pattern = "【([^【】]+?)】" # |{([^{}]+?)}
 rebid_pattern = "再次|重新招标|[一二三四五六七八九十]+次"
 date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
-def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[],page_time_less="",page_time_greater=""):
+def check_doctitle(doctitle_refind_less, doctitle_refind_greater,docchannel_less,docchannel_greater, codes_less=[], code_greater=[],page_time_less="",page_time_greater=""):
+    # print('doctitle',doctitle_refind_less,doctitle_refind_greater)
     if code_greater is None:
         code_greater = []
     doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
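A small usage sketch of the new helper on hypothetical token lists: it returns the Levenshtein distance together with the aligned operations, which check_doctitle later scans for '替换' (substitution) entries.

tokens_a = ["A", "区", "3", "号"]
tokens_b = ["A", "区", "5", "号"]
distance, diff = edit_distance_with_diff(tokens_a, tokens_b)
print(distance)  # 1
print(diff)      # [('替换', '3', '5')]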
@@ -1026,6 +1077,8 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
         doctitle_refind_less = ""
     if doctitle_refind_greater is None:
         doctitle_refind_greater = ""
+    if doctitle_refind_less==doctitle_refind_greater:
+        return True
     _pack1 = None
     _pack2 = None
     #if contain then pass
@@ -1078,6 +1131,26 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
         if len(set_num_l)==len(set_num_g):
             if len(set_num_l&set_num_g)!=len(set_num_l):
                 return False
+    # property-rights/auction announcements, e.g. 小区6号楼2单元1302号
+    if docchannel_less==docchannel_greater and docchannel_less in [115,116,117]:
+        for _p in [num2_pattern]:
+            num_all_l = re.findall(_p, doctitle_refind_less)
+            num_all_g = re.findall(_p, doctitle_refind_greater)
+            set_num_l = set(num_all_l)
+            set_num_g = set(num_all_g)
+            if len(set_num_l) == len(set_num_g):
+                if len(set_num_l & set_num_g) != len(set_num_l):
+                    return False
+    # similar-title comparison: if an edit-distance substitution has a digit/letter string on both sides, treat the titles as different
+    if getSimilarityOfString(doctitle_refind_less,doctitle_refind_greater) > 0.7:
+        doctitle_refind_less_re = re.findall(num3_pattern,doctitle_refind_less)
+        doctitle_refind_greater_re = re.findall(num3_pattern,doctitle_refind_greater)
+        distance, differences = edit_distance_with_diff(doctitle_refind_less_re, doctitle_refind_greater_re)
+        for diff in differences:
+            if diff[0]=='替换':
+                if re.search("^[一二三四五六七八九十A-Za-z\d-]+$",diff[1]) and re.search("^[一二三四五六七八九十A-Za-z\d-]+$",diff[2]):
+                    # print("标题编辑距离中替换字段前后 数字字母字符串不同")
+                    return False
     # rebid (repeated tender) keywords
     for _p in [rebid_pattern]:
         num_all_l = re.findall(_p,doctitle_refind_less)
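Putting the new pieces together, a sketch of how the similar-title rule treats the auction-style example mentioned in the comment above, assuming getSimilarityOfString rates the pair above 0.7 (the titles are illustrative):

t1 = "小区6号楼2单元1302号"
t2 = "小区6号楼2单元1303号"
tokens1 = re.findall(num3_pattern, t1)  # ['小', '区', '6', '号', '楼', '2', '单', '元', '1302', '号']
tokens2 = re.findall(num3_pattern, t2)  # ['小', '区', '6', '号', '楼', '2', '单', '元', '1303', '号']
distance, differences = edit_distance_with_diff(tokens1, tokens2)
# differences == [('替换', '1302', '1303')]: both sides of the substitution are
# pure digit/letter runs, so check_doctitle returns False (not duplicates).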
@@ -1087,7 +1160,8 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
         if len(set_num_l)==len(set_num_g):
             if len(set_num_l&set_num_g)!=len(set_num_l):
                 return False
-        elif (len(set_num_l) and not len(set_num_g)) or (len(set_num_g) and not len(set_num_l)):
+        # if page_time_less and page_time_less != page_time_greater:
+        if (len(set_num_l) and not len(set_num_g)) or (len(set_num_g) and not len(set_num_l)):
             return False
 
     #check the location has conflict
@@ -1206,23 +1280,30 @@ def check_package(package_less,package_greater,split_char=","):
 def check_time(json_time_less,json_time_greater):
     has_same = False
     has_diff = False
+    time_count_less = 0
+    time_count_greater = 0
     if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
         if isinstance(json_time_less,dict):
             time_less = json_time_less
         else:
             time_less = json.loads(json_time_less)
+        time_count_less += sum([1 for k,v in time_less.items() if v])
         if isinstance(json_time_greater,dict):
             time_greater = json_time_greater
         else:
             time_greater = json.loads(json_time_greater)
+        time_count_greater += sum([1 for k, v in time_greater.items() if v])
         for k,v in time_less.items():
             if getLength(v)>0:
                 v1 = time_greater.get(k,"")
                 if getLength(v1)>0:
                     if v[:10]!=v1[:10]:
+                        # print('time diff',k,v,v1)
                         has_diff = True
                     else:
                         has_same = True
+    if time_count_less==0 and time_count_greater==0:
+        return 2
     if has_same:
         if has_diff:
             return 1
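With the new counters, check_time now returns 2 when neither document carries any non-empty time field, rather than relying only on the has_same/has_diff flags. A minimal sketch with hypothetical inputs (assuming getLength reports non-empty dicts and strings as length > 0):

print(check_time('{}', '{}'))  # 2: no time fields on either side
print(check_time({"time_release": "2024-04-20", "time_bidopen": "2024-05-01"},
                 {"time_release": "2024-04-20", "time_bidopen": "2024-05-02"}))  # 1: one field matches, another differs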
@@ -1299,6 +1380,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     is_special_bonds_less = document_less.get("is_special_bonds")
     products_less = document_less.get("products")
     products_original_less = document_less.get("products_original",[])
+    change_content_less = document_less.get("change_content","")
+    change_time_less = document_less.get("change_time","")
 
 
     docid_greater = document_greater["docid"]
@@ -1327,6 +1410,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     is_special_bonds_greater = document_greater.get("is_special_bonds")
     products_greater = document_greater.get("products")
     products_original_greater = document_greater.get("products_original", [])
+    change_content_greater = document_greater.get("change_content", "")
+    change_time_greater = document_greater.get("change_time", "")
 
     moneys_greater = document_greater.get("moneys")
     moneys_attachment_greater = document_greater.get("moneys_attachment")
@@ -1394,6 +1479,30 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
             # print("check_products error")
             return 0
 
+    # change/Q&A announcements: compare the change content
+    if docchannel_less in [51,103] and docchannel_less==docchannel_greater:
+        if getLength(change_time_less)>0 and getLength(change_time_greater)>0:
+            if change_time_less != change_time_greater:
+                # print("change_time diff")
+                return 0
+        if getLength(change_content_less) > 10 and getLength(change_content_greater) > 10:
+            _change_content_less = re.findall("[\u4e00-\u9fa5a-zA-Z0-9]+", change_content_less)
+            _change_content_less = "".join(_change_content_less)
+            _change_content_greater = re.findall("[\u4e00-\u9fa5a-zA-Z0-9]+", change_content_greater)
+            _change_content_greater = "".join(_change_content_greater)
+            if _change_content_less == _change_content_greater:
+                # print("change_content same 1")
+                return 1
+            elif _change_content_less.find(_change_content_greater)>=0 or _change_content_greater.find(_change_content_less)>=0:
+                # print("change_content same 2")
+                return 1
+            # elif getSimilarityOfString(_change_content_less,_change_content_greater)>0.8:
+            #     print("change_content same 3")
+            #     print(_change_content_less)
+            #     print(_change_content_greater)
+            #     print(getSimilarityOfString(_change_content_less,_change_content_greater))
+            #     return 1
+
     # one document has all its key fields in the attachment, and the two documents' attachment md5 sets overlap
     set_md5_less = set()
     set_md5_greater = set()
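The comparison above keeps only CJK characters, letters and digits before testing equality or containment, so whitespace and punctuation differences between two change announcements are ignored. An illustrative normalization with hypothetical change_content values (the rule only applies to channels 51/103 when both raw strings are longer than 10 characters):

c1 = "一、开标时间变更为 2024-05-01;"
c2 = "一、开标时间变更为2024-05-01"
n1 = "".join(re.findall("[\u4e00-\u9fa5a-zA-Z0-9]+", c1))  # '一开标时间变更为20240501'
n2 = "".join(re.findall("[\u4e00-\u9fa5a-zA-Z0-9]+", c2))  # '一开标时间变更为20240501'
# n1 == n2, so this pair of change announcements is treated as a duplicate (return 1).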
@@ -1572,7 +1681,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
                 check_result["pass"] = 0
             else:
                 check_result["docchannel"] = 2
-    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater,page_time_less,page_time_greater):
+    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,docchannel_less,docchannel_greater,project_codes_less,project_codes_greater,page_time_less,page_time_greater):
         check_result["doctitle"] = 0
         check_result["pass"] = 0
         if b_log:
@@ -1654,7 +1763,9 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
 
     #added check
     _time_check = check_time(json_time_less,json_time_greater)
-    if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
+    # if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
+    if not _time_check or (_time_check==1 and docchannel_less in (51,103) and
+                           len([k for k,v in json_time_less.items() if v])>0 and len([k for k,v in json_time_greater.items() if v])>0):
         if b_log:
             logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
             if isinstance(json_time_less,dict):
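The reworked condition only rejects change/Q&A documents (channels 51/103) on a partial time match when both sides actually carry at least one non-empty time value; pairs whose extracted time dicts are empty now fall through, since check_time returns 2 for them (see above). A brief sketch with hypothetical dicts, assuming json_time_less/json_time_greater arrive here as dicts:

json_time_less = {"time_release": "2024-04-20", "time_bidopen": "2024-05-01"}
json_time_greater = {"time_release": "2024-04-20", "time_bidopen": "2024-05-02"}
# check_time(...) == 1 (release dates match, bid-opening dates differ) and both dicts
# contain non-empty values, so for docchannel 51/103 this pair fails the time check.
# With two empty dicts, check_time(...) == 2 and the new length guards are False,
# so such a pair is no longer rejected on time grounds alone.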