
AI extraction optimization: % amount

luojiehua 1 month ago
parent
commit
a187ece310

+ 68 - 2
BaseDataMaintenance/maintenance/dataflow.py

@@ -2356,6 +2356,8 @@ class Dataflow_dumplicate(Dataflow):
         source_type_greater = document_greater.get("source_type")
 
         hard_level=1
+        if docchannel_less==docchannel_greater==302:
+            hard_level=2
         if web_source_no_less==web_source_no_greater=="17397-3":
             hard_level=2
 
@@ -5036,6 +5038,69 @@ def compare_dumplicate_check():
     df.to_excel("compare_dump.xlsx")
 
 
+def fix_merge_docid(docid):
+
+    def get_uuid_docids(docid):
+        ots_client = getConnect_ots()
+        bool_query = BoolQuery(must_queries=[
+            TermQuery("docids",docid)
+        ])
+
+        rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time")]),limit=100,get_total_count=True),
+                                                                       ColumnsToGet(["docids"],return_type=ColumnReturnType.SPECIFIED))
+        list_row = getRow_ots(rows)
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                       ColumnsToGet(["docids"],return_type=ColumnReturnType.SPECIFIED))
+            list_row.extend(getRow_ots(rows))
+        return list_row
+    def get_new_docid(list_docid1,list_docid2):
+        return list(set(list_docid1)-set(list_docid2))
+    def get_list_docid(list_row):
+        list_docid = []
+        for row in list_row:
+            docids = row.get("docids",'')
+            if docids:
+                list_docid.extend([int(a) for a in docids.split(",")])
+        return list(set(list_docid))
+    def get_list_uuid(list_row):
+        list_uuid = []
+        for row in list_row:
+            uuid = row.get("uuid",'')
+            if uuid:
+                list_uuid.append(uuid)
+        return list(set(list_uuid))
+    list_row = get_uuid_docids(docid)
+    print(list_row)
+    list_docid1 = get_list_docid(list_row)
+    list_new_docid = get_new_docid(list_docid1,[docid])
+    while 1:
+        if len(list_new_docid)==0:
+            break
+        list_row2 = []
+        for _docid in list_new_docid:
+            list_row2.extend(get_uuid_docids(_docid))
+        list_docid1 = get_list_docid(list_row)
+        list_docid2 = get_list_docid(list_row2)
+        # newly discovered docids: ids seen in the latest rows minus those already collected
+        list_new_docid = get_new_docid(list_docid2,list_docid1)
+        list_row.extend(list_row2)
+    list_uuid = get_list_uuid(list_row)
+    list_docid = get_list_docid(list_row)
+    print(list_uuid)
+    print(list_docid)
+    # OTS client used for the status updates and project deletions below
+    ots_client = getConnect_ots()
+    for _docid in list_docid:
+        _d = Document({document_partitionkey:_docid%500+1,
+                       document_docid:_docid,
+                       document_status:1})
+        if _d.exists_row(ots_client):
+            _d.update_row(ots_client)
+    for _uuid in list_uuid:
+        _p = Project({project_uuid:_uuid,})
+        _p.delete_row(ots_client)
+
+
 if __name__ == '__main__':
     a = time.time()
     # df = Dataflow()
@@ -5052,13 +5117,14 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(563465267
-                            )
+    # df_dump.test_dumplicate(606243000
+    #                         )
     # df_dump.dumplicate_comsumer_handle_interface(603504420,document_table="document_0000",document_table_index="document_0000_index",project_table="project_0000",project_table_index="project_0000_index_formerge")
     # compare_dumplicate_check()
     # df_dump.test_merge([391898061
     #                     ],[371551361,])
     # df_dump.flow_remove_project_tmp()
+    fix_merge_docid(595271944)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)
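The new fix_merge_docid walks the project2 index to gather every document reachable from a seed docid through merged projects: it repeatedly queries for rows whose docids field mentions any newly discovered docid until no new ids turn up, then resets status to 1 on each collected document and deletes the collected project rows. A minimal sketch of the same fixed-point traversal, assuming a hypothetical fetch_rows(docid) lookup in place of the real OTS search:

    # Sketch only: fetch_rows(docid) is assumed to return rows shaped like
    # {"uuid": "...", "docids": "1,2,3"}, as getRow_ots does for project2.
    def collect_merged(seed_docid, fetch_rows):
        rows = list(fetch_rows(seed_docid))
        known = {seed_docid}
        frontier = {int(d) for r in rows
                    for d in r.get("docids", "").split(",") if d} - known
        while frontier:
            new_rows = []
            for docid in frontier:
                new_rows.extend(fetch_rows(docid))
            known |= frontier
            found = {int(d) for r in new_rows
                     for d in r.get("docids", "").split(",") if d}
            frontier = found - known          # stop once no unseen docid appears
            rows.extend(new_rows)
        uuids = {r["uuid"] for r in rows if r.get("uuid")}
        docids = {int(d) for r in rows
                  for d in r.get("docids", "").split(",") if d}
        return uuids, docids

The real function then updates each collected document to status 1 and deletes each collected project row, e.g. uuids, docids = collect_merged(595271944, fetch_rows).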

File diff suppressed because it is too large
+ 2 - 0
BaseDataMaintenance/maintenance/dataflow_mq.py


+ 1 - 1
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -1520,7 +1520,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
 
     if hard_level==2 and check_result["product"]<=1:
         if b_log:
-            logging.inf("hard_level %s and check_product less than 2"%(str(hard_level)))
+            logging.info("hard_level %s and check_product less than 2"%(str(hard_level)))
         return 0
     if check_result.get("pass",0)==0:
         if b_log:
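Taken together with the dataflow.py hunk above, this change raises the bar for duplicate detection on docchannel 302: when both documents come from that channel hard_level becomes 2, and at hard_level 2 check_dumplicate_rule rejects the pair unless more than one product matches. A minimal sketch of the combined gate, with a hypothetical helper standing in for the real check_result structure:

    # Sketch only: matched_products stands in for check_result["product"].
    def passes_hard_level_gate(docchannel_less, docchannel_greater,
                               web_source_no_less, web_source_no_greater,
                               matched_products):
        hard_level = 1
        if docchannel_less == docchannel_greater == 302:
            hard_level = 2
        if web_source_no_less == web_source_no_greater == "17397-3":
            hard_level = 2
        # at hard_level 2 the pair must share at least two products
        if hard_level == 2 and matched_products <= 1:
            return False
        return True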

+ 7 - 1
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -2656,6 +2656,7 @@ def check_project_codes_merge(list_code,list_code_to_merge,b_log):
         return 1
     return 0
 
+print("check_codes",check_project_codes_merge(["2351101000019509973","3207991170012025000201"],["2351101000015686841","ZC3207000002024001333"],True))
 
 def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=False,simple_check=False):
 
@@ -2777,7 +2778,11 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
 
     prob_count += _codes_check
 
-    if _codes_check!=1:
+    if _codes_check==-1:
+        if return_prob:
+            return False,0
+        return False
+    elif _codes_check!=1:
         if _title_check!=1:
             if return_prob:
                 return False,0
@@ -2792,6 +2797,7 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
                 return False,0
             return False
 
+
     min_count = 2
     if product=="" or product_to_merge=="":
         min_count = 1
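The reworked branch distinguishes a hard code conflict (_codes_check == -1) from missing code evidence (_codes_check == 0): a conflict now vetoes the merge outright, while absent codes can still be rescued by the title check. A minimal sketch of that decision order, using a hypothetical helper rather than check_merge_rule's real signature:

    # Sketch only: the surrounding checks in check_merge_rule are elided.
    def merge_gate(codes_check, title_check):
        if codes_check == -1:       # project codes actively conflict: hard veto
            return False
        if codes_check != 1:        # no code signal: require a title match instead
            if title_check != 1:
                return False
        return True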

+ 33 - 31
BaseDataMaintenance/model/ots/document.py

@@ -307,8 +307,9 @@ def turn_document_status():
 
         bool_query = BoolQuery(
             must_queries=[
-                MatchPhraseQuery("doctitle","破产清算案"),
-                MatchPhraseQuery("project_name","经相关部门批准后方可开展经营活动"),
+                MatchPhraseQuery("doctitle","质量竣工验收监督"),
+                RangeQuery("status",401,451)
+                # MatchPhraseQuery("project_name","经相关部门批准后方可开展经营活动"),
                 # WildcardQuery("web_source_no","03716-*"),
                 # RangeQuery("product_number",500),
                 # TermQuery("save",1)
@@ -342,25 +343,25 @@ def turn_document_status():
         #
         # )
 
-        # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
-        #                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
-        #                                                                columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
-        # list_data = getRow_ots(rows)
-        # print(total_count)
-        # _count = len(list_data)
-        # for _data in list_data:
-        #     _document = Document(_data)
-        #     task_queue.put(_document)
-        # while next_token:
-        #     rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
-        #                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-        #                                                                    columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
-        #     list_data = getRow_ots(rows)
-        #     _count += len(list_data)
-        #     print("%d/%d"%(_count,total_count))
-        #     for _data in list_data:
-        #         _document = Document(_data)
-        #         task_queue.put(_document)
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+                                                                       columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
+        list_data = getRow_ots(rows)
+        print(total_count)
+        _count = len(list_data)
+        for _data in list_data:
+            _document = Document(_data)
+            task_queue.put(_document)
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            _count += len(list_data)
+            print("%d/%d"%(_count,total_count))
+            for _data in list_data:
+                _document = Document(_data)
+                task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:
@@ -368,9 +369,9 @@ def turn_document_status():
         #              document_partitionkey:int(docid)%500+1,
         #              }
         #     task_queue.put(Document(_dict))
-        import pandas as pd
-        df = pd.read_csv(r"C:\Users\Administrator\Desktop\export_241224_6.csv")
-        list_docid = df["docid"]
+        # import pandas as pd
+        # df = pd.read_csv(r"C:\Users\Administrator\Desktop\export_241224_6.csv")
+        # list_docid = df["docid"]
         # list_docid = [519497468]
 
         # list_docid = []
@@ -385,13 +386,13 @@ def turn_document_status():
         #         if re.search("^\d+$",docid) is not None:
         #             list_docid.append(int(docid))
 
-        for docid,construct_company,recall_flag in zip(list_docid,df["construct_company"],df["recall_flag"]):
-            if recall_flag == 1:
-                _dict = {document_docid:int(docid),
-                         document_partitionkey:int(docid)%500+1,
-                         "construct_company":construct_company
-                         }
-                task_queue.put(Document(_dict))
+        # for docid,construct_company,recall_flag in zip(list_docid,df["construct_company"],df["recall_flag"]):
+        #     if recall_flag == 1:
+        #         _dict = {document_docid:int(docid),
+        #                  document_partitionkey:int(docid)%500+1,
+        #                  "construct_company":construct_company
+        #                  }
+        #         task_queue.put(Document(_dict))
         # for docid in df["docid2"]:
         #     _dict = {document_docid:int(docid),
         #              document_partitionkey:int(docid)%500+1,
@@ -434,6 +435,7 @@ def turn_document_status():
         # n_product = ",".join(l_product[:500])
         # item.setValue(document_product,n_product,True)
         # item.fix_columns(ots_client,["extract_json","doctitle",""],True)
+        item.setValue(document_status,1,True)
         item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_docid)))
         pass

+ 24 - 23
BaseDataMaintenance/model/ots/document_tmp.py

@@ -268,7 +268,8 @@ def turn_document_tmp_status():
             must_queries=[
                 # TermQuery("fingerprint","md5=2cc044b81ec13acddcc970b71b780365")
                 # TermQuery("save",0),
-                RangeQuery("crtime","2025-03-05 09:30:00")
+                # RangeQuery("crtime","2025-03-05 09:30:00")
+                MatchPhraseQuery("doctitle","质量竣工验收监督")
                 # RangeQuery("status",1,51),
                 # BoolQuery(should_queries=[
                 #                           # TermQuery("tenderee","山西利民工业有限责任公司"),
@@ -345,25 +346,25 @@ def turn_document_tmp_status():
         # item.setValue(document_tmp_attachment_path,json.dumps(list_attachment,ensure_ascii=False),True)
         # item.all_columns.remove(document_tmp_dochtmlcon)
 
-        best_docid = item.getProperties().get(document_tmp_best_docid,"")
-        if best_docid==-1:
-            dup_docid = item.getProperties().get(document_tmp_dup_docid,"")
-            list_docid = [item.getProperties().get(document_tmp_docid,"")]
-            for _id in dup_docid.split(","):
-                if _id!="":
-                    list_docid.append(int(_id))
-            for docid in list_docid:
-                _d = {
-                    document_tmp_partitionkey:docid%500+1,
-                    document_tmp_docid:docid,
-                }
-                _document = Document(_d)
-                if _document.fix_columns(ots_client,[document_tmp_status],True):
-                    if _document.getProperties().get("status",0)>=401:
-                        _document.setValue(document_tmp_status,1,True)
-
-                        print(_d)
-                        _document.update_row(ots_client)
+        # best_docid = item.getProperties().get(document_tmp_best_docid,"")
+        # if best_docid==-1:
+        #     dup_docid = item.getProperties().get(document_tmp_dup_docid,"")
+        #     list_docid = [item.getProperties().get(document_tmp_docid,"")]
+        #     for _id in dup_docid.split(","):
+        #         if _id!="":
+        #             list_docid.append(int(_id))
+        #     for docid in list_docid:
+        #         _d = {
+        #             document_tmp_partitionkey:docid%500+1,
+        #             document_tmp_docid:docid,
+        #         }
+        #         _document = Document(_d)
+        #         if _document.fix_columns(ots_client,[document_tmp_status],True):
+        #             if _document.getProperties().get("status",0)>=401:
+        #                 _document.setValue(document_tmp_status,1,True)
+        #
+        #                 print(_d)
+        #                 _document.update_row(ots_client)
 
         #change status
         # item.setValue(document_tmp_docchannel,item.getProperties().get(document_tmp_original_docchannel),True)
@@ -372,13 +373,13 @@ def turn_document_tmp_status():
         # item.setValue(document_tmp_extract_json,_extract_json,True)
         # json.loads(_extract_json)
         # item.setValue(document_tmp_status,0,True)
-        # item.setValue(document_tmp_save,1,True)
+        item.setValue(document_tmp_save,1,True)
         # if item.exists_row(ots_client):
         #     item.update_row(ots_client)
         # print(item.getProperties())
-        # item.update_row(ots_client)
+        item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_tmp_docid)))
-        # item.delete_row(ots_client)
+        # #item.delete_row(ots_client)
         # from BaseDataMaintenance.model.ots.document import Document
         #
         # Doc = Document(item.getProperties())

Some files were not shown because the diff is too large