Browse Source

去重合并流程报错修复

znj 13 hours ago
parent
commit
6e0b3a84e7

+ 3 - 3
BaseDataMaintenance/maintenance/dataflow.py

@@ -3376,7 +3376,7 @@ class Dataflow_dumplicate(Dataflow):
 
         set_docid = set()
         list_delete_projects = []
-        list_projects = self.search_projects_with_document([docid])
+        list_projects = self.search_projects_with_document([docid],project_table="project2",project_table_index="project2_index_formerge")
         for _proj in list_projects:
             _p = {}
             _docids = _proj.get(project_docids,"")
@@ -4017,7 +4017,7 @@ class Dataflow_dumplicate(Dataflow):
                     _time = time.time()
                     _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
                     if b_log:
-                        log(str(_check))
+                        log("merge rule res: "+str(_check)+" prob: "+str(_prob))
                     projects_check_rule_time += time.time()-_time
                     if _check:
                         list_check_data.append([_data,_prob])
@@ -4173,7 +4173,7 @@ class Dataflow_dumplicate(Dataflow):
                 self.update_projects_by_document(_docid,save,list_projects,document_name=document_name)
                 # log("update projects takes:%.3f"%(time.time()-_time))
             _time = time.time()
-            list_projects = dumplicate_projects(list_projects)
+            list_projects = dumplicate_projects(list_projects,max_count=20)
 
 
             # log("dumplicate projects takes:%.3f"%(time.time()-_time))

+ 4 - 4
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -1570,10 +1570,10 @@ def generate_common_properties(list_docs):
         list_loc.append(_d)
     list_loc.sort(key=lambda x:x.get("count",0),reverse=True)
     if len(list_loc)>0:
-        project_dict[document_district] = _doc.get(document_district)
-        project_dict[document_city] = _doc.get(document_city)
-        project_dict[document_province] = _doc.get(document_province)
-        project_dict[document_area] = _doc.get(document_area)
+        project_dict[document_district] = list_loc[0].get(document_district)
+        project_dict[document_city] = list_loc[0].get(document_city)
+        project_dict[document_province] = list_loc[0].get(document_province)
+        project_dict[document_area] = list_loc[0].get(document_area)
         _find = True
     # print(dict_count)
     # print(len(list_docs))