
AI extraction optimization: % amount

luojiehua 1 month ago
parent
commit
a187ece310

+ 68 - 2
BaseDataMaintenance/maintenance/dataflow.py

@@ -2356,6 +2356,8 @@ class Dataflow_dumplicate(Dataflow):
         source_type_greater = document_greater.get("source_type")
 
         hard_level=1
+        if docchannel_less==docchannel_greater==302:
+            hard_level=2
         if web_source_no_less==web_source_no_greater=="17397-3":
             hard_level=2
 
@@ -5036,6 +5038,69 @@ def compare_dumplicate_check():
     df.to_excel("compare_dump.xlsx")
 
 
+def fix_merge_docid(docid):
+
+    def get_uuid_docids(docid):
+        ots_client = getConnect_ots()
+        bool_query = BoolQuery(must_queries=[
+            TermQuery("docids",docid)
+        ])
+
+        rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time")]),limit=100,get_total_count=True),
+                                                                       ColumnsToGet(["docids"],return_type=ColumnReturnType.SPECIFIED))
+        list_row = getRow_ots(rows)
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                       ColumnsToGet(["docids"],return_type=ColumnReturnType.SPECIFIED))
+            list_row.extend(getRow_ots(rows))
+        return list_row
+    def get_new_docid(list_docid1,list_docid2):
+        return list(set(list_docid1)-set(list_docid2))
+    def get_list_docid(list_row):
+        list_docid = []
+        for row in list_row:
+            docids = row.get("docids",'')
+            if docids:
+                list_docid.extend([int(a) for a in docids.split(",")])
+        return list(set(list_docid))
+    def get_list_uuid(list_row):
+        list_uuid = []
+        for row in list_row:
+            uuid = row.get("uuid",'')
+            if uuid:
+                list_uuid.append(uuid)
+        return list(set(list_uuid))
+    list_row = get_uuid_docids(docid)
+    print(list_row)
+    list_docid1 = get_list_docid(list_row)
+    list_new_docid = get_new_docid(list_docid1,[docid])
+    while 1:
+        if len(list_new_docid)==0:
+            break
+        list_row2 = []
+        for _docid in list_new_docid:
+            list_row2.extend(get_uuid_docids(_docid))
+        list_docid1 = get_list_docid(list_row)
+        list_docid2 = get_list_docid(list_row2)
+        # newly discovered docids: ids seen in the latest rows minus those already collected
+        list_new_docid = get_new_docid(list_docid2,list_docid1)
+        list_row.extend(list_row2)
+    list_uuid = get_list_uuid(list_row)
+    list_docid = get_list_docid(list_row)
+    print(list_uuid)
+    print(list_docid)
+    # OTS client used for the status updates and project deletions below
+    ots_client = getConnect_ots()
+    for _docid in list_docid:
+        _d = Document({document_partitionkey:_docid%500+1,
+                       document_docid:_docid,
+                       document_status:1})
+        if _d.exists_row(ots_client):
+            _d.update_row(ots_client)
+    for _uuid in list_uuid:
+        _p = Project({project_uuid:_uuid,})
+        _p.delete_row(ots_client)
+
+
 if __name__ == '__main__':
     a = time.time()
     # df = Dataflow()
@@ -5052,13 +5117,14 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(563465267
-                            )
+    # df_dump.test_dumplicate(606243000
+    #                         )
     # df_dump.dumplicate_comsumer_handle_interface(603504420,document_table="document_0000",document_table_index="document_0000_index",project_table="project_0000",project_table_index="project_0000_index_formerge")
     # compare_dumplicate_check()
     # df_dump.test_merge([391898061
     #                     ],[371551361,])
     # df_dump.flow_remove_project_tmp()
+    fix_merge_docid(595271944)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)
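The new fix_merge_docid walks the project2 index to gather every document reachable from a seed docid through merged projects: it repeatedly queries for rows whose docids field mentions any newly discovered docid until no new ids turn up, then resets status to 1 on each collected document and deletes the collected project rows. A minimal sketch of the same fixed-point traversal, assuming a hypothetical fetch_rows(docid) lookup in place of the real OTS search:

    # Sketch only: fetch_rows(docid) is assumed to return rows shaped like
    # {"uuid": "...", "docids": "1,2,3"}, as getRow_ots does for project2.
    def collect_merged(seed_docid, fetch_rows):
        rows = list(fetch_rows(seed_docid))
        known = {seed_docid}
        frontier = {int(d) for r in rows
                    for d in r.get("docids", "").split(",") if d} - known
        while frontier:
            new_rows = []
            for docid in frontier:
                new_rows.extend(fetch_rows(docid))
            known |= frontier
            found = {int(d) for r in new_rows
                     for d in r.get("docids", "").split(",") if d}
            frontier = found - known          # stop once no unseen docid appears
            rows.extend(new_rows)
        uuids = {r["uuid"] for r in rows if r.get("uuid")}
        docids = {int(d) for r in rows
                  for d in r.get("docids", "").split(",") if d}
        return uuids, docids

The real function then updates each collected document to status 1 and deletes each collected project row, e.g. uuids, docids = collect_merged(595271944, fetch_rows).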

File diff suppressed because it is too large
+ 2 - 0
BaseDataMaintenance/maintenance/dataflow_mq.py


+ 1 - 1
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -1520,7 +1520,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
 
     if hard_level==2 and check_result["product"]<=1:
         if b_log:
-            logging.inf("hard_level %s and check_product less than 2"%(str(hard_level)))
+            logging.info("hard_level %s and check_product less than 2"%(str(hard_level)))
         return 0
     if check_result.get("pass",0)==0:
         if b_log:
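Taken together with the dataflow.py hunk above, this change raises the bar for duplicate detection on docchannel 302: when both documents come from that channel hard_level becomes 2, and at hard_level 2 check_dumplicate_rule rejects the pair unless more than one product matches. A minimal sketch of the combined gate, with a hypothetical helper standing in for the real check_result structure:

    # Sketch only: matched_products stands in for check_result["product"].
    def passes_hard_level_gate(docchannel_less, docchannel_greater,
                               web_source_no_less, web_source_no_greater,
                               matched_products):
        hard_level = 1
        if docchannel_less == docchannel_greater == 302:
            hard_level = 2
        if web_source_no_less == web_source_no_greater == "17397-3":
            hard_level = 2
        # at hard_level 2 the pair must share at least two products
        if hard_level == 2 and matched_products <= 1:
            return False
        return True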

+ 7 - 1
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -2656,6 +2656,7 @@ def check_project_codes_merge(list_code,list_code_to_merge,b_log):
         return 1
     return 0
 
+print("check_codes",check_project_codes_merge(["2351101000019509973","3207991170012025000201"],["2351101000015686841","ZC3207000002024001333"],True))
 
 def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=False,simple_check=False):
 
@@ -2777,7 +2778,11 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
 
     prob_count += _codes_check
 
-    if _codes_check!=1:
+    if _codes_check==-1:
+        if return_prob:
+            return False,0
+        return False
+    elif _codes_check!=1:
         if _title_check!=1:
             if return_prob:
                 return False,0
@@ -2792,6 +2797,7 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
                 return False,0
             return False
 
+
     min_count = 2
     if product=="" or product_to_merge=="":
         min_count = 1
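The reworked branch distinguishes a hard code conflict (_codes_check == -1) from missing code evidence (_codes_check == 0): a conflict now vetoes the merge outright, while absent codes can still be rescued by the title check. A minimal sketch of that decision order, using a hypothetical helper rather than check_merge_rule's real signature:

    # Sketch only: the surrounding checks in check_merge_rule are elided.
    def merge_gate(codes_check, title_check):
        if codes_check == -1:       # project codes actively conflict: hard veto
            return False
        if codes_check != 1:        # no code signal: require a title match instead
            if title_check != 1:
                return False
        return True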

+ 33 - 31
BaseDataMaintenance/model/ots/document.py

@@ -307,8 +307,9 @@ def turn_document_status():
 
         bool_query = BoolQuery(
             must_queries=[
-                MatchPhraseQuery("doctitle","破产清算案"),
-                MatchPhraseQuery("project_name","经相关部门批准后方可开展经营活动"),
+                MatchPhraseQuery("doctitle","质量竣工验收监督"),
+                RangeQuery("status",401,451)
+                # MatchPhraseQuery("project_name","经相关部门批准后方可开展经营活动"),
                 # WildcardQuery("web_source_no","03716-*"),
                 # RangeQuery("product_number",500),
                 # TermQuery("save",1)
@@ -342,25 +343,25 @@ def turn_document_status():
         #
         # )
 
-        # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
-        #                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
-        #                                                                columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
-        # list_data = getRow_ots(rows)
-        # print(total_count)
-        # _count = len(list_data)
-        # for _data in list_data:
-        #     _document = Document(_data)
-        #     task_queue.put(_document)
-        # while next_token:
-        #     rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
-        #                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-        #                                                                    columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
-        #     list_data = getRow_ots(rows)
-        #     _count += len(list_data)
-        #     print("%d/%d"%(_count,total_count))
-        #     for _data in list_data:
-        #         _document = Document(_data)
-        #         task_queue.put(_document)
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+                                                                       columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
+        list_data = getRow_ots(rows)
+        print(total_count)
+        _count = len(list_data)
+        for _data in list_data:
+            _document = Document(_data)
+            task_queue.put(_document)
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            _count += len(list_data)
+            print("%d/%d"%(_count,total_count))
+            for _data in list_data:
+                _document = Document(_data)
+                task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:
@@ -368,9 +369,9 @@ def turn_document_status():
         #              document_partitionkey:int(docid)%500+1,
         #              }
         #     task_queue.put(Document(_dict))
-        import pandas as pd
-        df = pd.read_csv(r"C:\Users\Administrator\Desktop\export_241224_6.csv")
-        list_docid = df["docid"]
+        # import pandas as pd
+        # df = pd.read_csv(r"C:\Users\Administrator\Desktop\export_241224_6.csv")
+        # list_docid = df["docid"]
         # list_docid = [519497468]
 
         # list_docid = []
@@ -385,13 +386,13 @@ def turn_document_status():
         #         if re.search("^\d+$",docid) is not None:
         #             list_docid.append(int(docid))
 
-        for docid,construct_company,recall_flag in zip(list_docid,df["construct_company"],df["recall_flag"]):
-            if recall_flag == 1:
-                _dict = {document_docid:int(docid),
-                         document_partitionkey:int(docid)%500+1,
-                         "construct_company":construct_company
-                         }
-                task_queue.put(Document(_dict))
+        # for docid,construct_company,recall_flag in zip(list_docid,df["construct_company"],df["recall_flag"]):
+        #     if recall_flag == 1:
+        #         _dict = {document_docid:int(docid),
+        #                  document_partitionkey:int(docid)%500+1,
+        #                  "construct_company":construct_company
+        #                  }
+        #         task_queue.put(Document(_dict))
         # for docid in df["docid2"]:
         #     _dict = {document_docid:int(docid),
         #              document_partitionkey:int(docid)%500+1,
@@ -434,6 +435,7 @@ def turn_document_status():
         # n_product = ",".join(l_product[:500])
         # item.setValue(document_product,n_product,True)
         # item.fix_columns(ots_client,["extract_json","doctitle",""],True)
+        item.setValue(document_status,1,True)
         item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_docid)))
         pass

+ 24 - 23
BaseDataMaintenance/model/ots/document_tmp.py

@@ -268,7 +268,8 @@ def turn_document_tmp_status():
             must_queries=[
                 # TermQuery("fingerprint","md5=2cc044b81ec13acddcc970b71b780365")
                 # TermQuery("save",0),
-                RangeQuery("crtime","2025-03-05 09:30:00")
+                # RangeQuery("crtime","2025-03-05 09:30:00")
+                MatchPhraseQuery("doctitle","质量竣工验收监督")
                 # RangeQuery("status",1,51),
                 # BoolQuery(should_queries=[
                 #                           # TermQuery("tenderee","山西利民工业有限责任公司"),
@@ -345,25 +346,25 @@ def turn_document_tmp_status():
         # item.setValue(document_tmp_attachment_path,json.dumps(list_attachment,ensure_ascii=False),True)
         # item.all_columns.remove(document_tmp_dochtmlcon)
 
-        best_docid = item.getProperties().get(document_tmp_best_docid,"")
-        if best_docid==-1:
-            dup_docid = item.getProperties().get(document_tmp_dup_docid,"")
-            list_docid = [item.getProperties().get(document_tmp_docid,"")]
-            for _id in dup_docid.split(","):
-                if _id!="":
-                    list_docid.append(int(_id))
-            for docid in list_docid:
-                _d = {
-                    document_tmp_partitionkey:docid%500+1,
-                    document_tmp_docid:docid,
-                }
-                _document = Document(_d)
-                if _document.fix_columns(ots_client,[document_tmp_status],True):
-                    if _document.getProperties().get("status",0)>=401:
-                        _document.setValue(document_tmp_status,1,True)
-
-                        print(_d)
-                        _document.update_row(ots_client)
+        # best_docid = item.getProperties().get(document_tmp_best_docid,"")
+        # if best_docid==-1:
+        #     dup_docid = item.getProperties().get(document_tmp_dup_docid,"")
+        #     list_docid = [item.getProperties().get(document_tmp_docid,"")]
+        #     for _id in dup_docid.split(","):
+        #         if _id!="":
+        #             list_docid.append(int(_id))
+        #     for docid in list_docid:
+        #         _d = {
+        #             document_tmp_partitionkey:docid%500+1,
+        #             document_tmp_docid:docid,
+        #         }
+        #         _document = Document(_d)
+        #         if _document.fix_columns(ots_client,[document_tmp_status],True):
+        #             if _document.getProperties().get("status",0)>=401:
+        #                 _document.setValue(document_tmp_status,1,True)
+        #
+        #                 print(_d)
+        #                 _document.update_row(ots_client)
 
         #change status
         # item.setValue(document_tmp_docchannel,item.getProperties().get(document_tmp_original_docchannel),True)
@@ -372,13 +373,13 @@ def turn_document_tmp_status():
         # item.setValue(document_tmp_extract_json,_extract_json,True)
         # json.loads(_extract_json)
         # item.setValue(document_tmp_status,0,True)
-        # item.setValue(document_tmp_save,1,True)
+        item.setValue(document_tmp_save,1,True)
         # if item.exists_row(ots_client):
         #     item.update_row(ots_client)
         # print(item.getProperties())
-        # item.update_row(ots_client)
+        item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_tmp_docid)))
-        # item.delete_row(ots_client)
+        # #item.delete_row(ots_client)
         # from BaseDataMaintenance.model.ots.document import Document
         #
         # Doc = Document(item.getProperties())

Some files were not shown because the diff is too large