Pārlūkot izejas kodu

合并时去重规则更新优化

znj 1 nedēļu atpakaļ
vecāks
revīzija
fe5ac403ff

+ 2 - 2
BaseDataMaintenance/maintenance/dataflow.py

@@ -2229,7 +2229,7 @@ class Dataflow_dumplicate(Dataflow):
         return dict_time
 
 
-    def get_attrs_before_dump(self,docid,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]):
+    def get_attrs_before_dump(self,docid,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type,"detail_link",'products']):
 
         bool_query = BoolQuery(must_queries=[
             TermQuery("docid",docid)
@@ -5495,7 +5495,7 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(628365020
+    df_dump.test_dumplicate(629906009
 )
     # df_dump.dumplicate_comsumer_handle_interface(603504420,document_table="document_0000",document_table_index="document_0000_index",project_table="project_0000",project_table_index="project_0000_index_formerge")
     # compare_dumplicate_check()

+ 2 - 7
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -1178,9 +1178,6 @@ def check_product(product_less,product_greater,split_char=",",doctitle_refine_le
             for _g in _product_g:
                 # if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
                 if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0:
-                    print(_l,_g)
-                    print(doctitle_refine_greater.find(_l))
-                    print(doctitle_refine_less.find(_g))
                     same_count += 1
                     break
         if same_count/len(_product_l)>=0.5:
@@ -1251,9 +1248,8 @@ def check_products(products_less,products_greater):
             a = products_greater_list
             products_greater_list = products_less_list
             products_less_list = a
-
-        # print('products_less_set',products_less_list)
-        # print('products_greater_set',products_greater_list)
+        # print('products_less_list',products_less_list)
+        # print('products_greater_list',products_greater_list)
         same_count = 0
         for _l in products_less_list:
             for _g in products_greater_list:
@@ -1341,7 +1337,6 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         # print('fingerprint same')
         return 1
 
-
     # 站源相同时,除了fingerprint一样和detail_link一样,其他不去重
     if web_source_no_less==web_source_no_greater and getLength(web_source_no_less)>0:
         if getLength(detail_link_less)>0 and getLength(detail_link_greater)>0: