Explorar o código

修复配置提取效率问题;附件识别完之后删除比地链接

luojiehua hai 1 ano
pai
achega
b98633e87f

+ 1 - 1
BaseDataMaintenance/maintenance/dataflow.py

@@ -4175,7 +4175,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(359517787)
+    df_dump.test_dumplicate(339737931)
     # df_dump.test_merge([292315564],[287890754])
     # df_dump.flow_remove_project_tmp()
     print("takes",time.time()-a)

+ 1 - 0
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -161,6 +161,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
 
             _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
             dhtml.setValue(document_tmp_dochtmlcon,_dochtmlcon,True)
+            dhtml.delete_bidi_a()
             dtmp = Document_tmp(item)
 
 

+ 12 - 6
BaseDataMaintenance/maintenance/product/product_parameter.py

@@ -5,7 +5,8 @@ from apscheduler.schedulers.blocking import BlockingScheduler
 from tablestore import *
 from BaseDataMaintenance.dataSource.source import getConnect_ots,getAuth,is_internal
 from BaseDataMaintenance.dataSource.interface import *
-from multiprocessing import Queue as PQueue,Process
+from multiprocessing import Queue as PQueue
+from multiprocessing import Process
 from BaseDataMaintenance.model.ots.document_product import *
 from BaseDataMaintenance.model.ots.attachment import *
 from BaseDataMaintenance.common.Utils import *
@@ -52,7 +53,9 @@ class Product_Attachment_Processor():
         attachment_size = getQueueSize("dataflow_attachment")
         if attachment_size<100:
 
-            if self.product_attachment_queue.qsize()>self.product_attachment_queue_size/3:
+            _qsize = self.product_attachment_queue.qsize()
+            log("product_attachment_queue %d"%(_qsize))
+            if _qsize>self.product_attachment_queue_size/3:
                 return
             bool_query = BoolQuery(must_queries=[
                 TermQuery("parameter_status",parameter_status_to_process)
@@ -67,6 +70,7 @@ class Product_Attachment_Processor():
             _count = 0
             for data in list_data:
                 _id = data.get(DOCUMENT_PRODUCT_ID)
+                list_id.append(_id)
                 if _id in self.set_product_attachment:
                     continue
                 docid = data.get(DOCUMENT_PRODUCT_DOCID)
@@ -74,7 +78,6 @@ class Product_Attachment_Processor():
                     dict_docid_list[docid] = []
                 dict_docid_list[docid].append(data)
 
-                list_id.append(_id)
                 _count += 1
             while next_token:
                 if len(dict_docid_list.keys())>=self.product_attachment_queue_size:
@@ -85,17 +88,20 @@ class Product_Attachment_Processor():
                 list_data = getRow_ots(rows)
                 for data in list_data:
                     _id = data.get(DOCUMENT_PRODUCT_ID)
+                    list_id.append(_id)
                     if _id in self.set_product_attachment:
                         continue
                     docid = data.get(DOCUMENT_PRODUCT_DOCID)
                     if docid not in dict_docid_list:
                         dict_docid_list[docid] = []
                     dict_docid_list[docid].append(data)
-                    list_id.append(_id)
+
                     _count += 1
             for k,v in dict_docid_list.items():
                 self.product_attachment_queue.put(v)
-            self.set_product_attachment =  set(list_id)
+            _qsize = self.product_attachment_queue.qsize()
+            log("after product_attachment_queue %d"%(_qsize))
+            self.set_product_attachment = set(list_id)
 
     def get_whole_html(self,_filemd5):
         atta = attachment({attachment_filemd5:_filemd5})
@@ -251,7 +257,7 @@ class Product_Attachment_Processor():
         mt.run()
 
     def process_parameters_comsumer(self,):
-        # process_count = 2
+        # process_count = 3
         # list_process = []
         # for i in range(process_count):
         #     p = Process(target=self.start_process)

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 25 - 0
BaseDataMaintenance/model/ots/document_html.py


Algúns arquivos non se mostraron porque cambiaron demasiados arquivos nesta achega