@@ -0,0 +1,259 @@
+# -*- coding: utf-8 -*-
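+'''
+Scheduled synchronization of attachment-bearing documents between OTS and MySQL:
+Data_toDeal_Synchronization queues documents whose attachments still need
+processing, and Data_Dealing_Synchronization downloads those attachments,
+converts them to html through the deal interface, and writes the result back
+to the OTS document.
+'''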
+from BaseDataMaintenance.dataSource.pool import ConnectorPool
+from BaseDataMaintenance.dataSource.source import *
+from BaseDataMaintenance.common.Utils import *
+import queue
+import time  # used by waitTask below
+from tablestore import *
+from multiprocessing import RLock
+from threading import Thread
+from apscheduler.schedulers.blocking import BlockingScheduler
+from BaseDataMaintenance.common.multiThread import MultiThreadHandler
+from BaseDataMaintenance.model.mysql.attach_document_richtext import attach_document_richtext
+from BaseDataMaintenance.model.mysql.BaseModel import BaseModel
+from BaseDataMaintenance.model.ots.document import document
+import traceback
+from BaseDataMaintenance.dataSource.download import download
+import base64
+from BaseDataMaintenance.dataSource.interface import getAttachDealInterface
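+
+# status lifecycle of an attach_document_richtext row: STATUS_TODEAL when queued,
+# STATUS_DEALING while a worker holds it, then STATUS_DONE or STATUS_FAILED;
+# rows are abandoned after MAX_DEAL_COUNT attempts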
+STATUS_TODEAL = 10
+STATUS_DEALING = 20
+STATUS_DONE = 30
+STATUS_FAILED = 40
+MAX_DEAL_COUNT = 5
+
+class Data_toDeal_Synchronization():
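+    '''
+    Producer/consumer job: find OTS documents that carry attachments and whose
+    crtime is newer than the newest create_time already recorded in MySQL, then
+    register them in attach_document_richtext for later processing.
+    '''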
+
+    def __init__(self):
+        self.done_lock = RLock()
+        self.isDone = False
+        self.document_table = "document"
+        self.document_table_index = "document_index"
+        self.pool_ots = ConnectorPool(init_num=10,max_num=40,method_init=getConnect_ots)
+        self.pool_mysql = ConnectorPool(init_num=10,max_num=40,method_init=getConnection_mysql)
+
+    def producer(self,task_queue):
+        '''
+        produce data: query OTS for documents whose attachments are newer than
+        the last synchronized create_time and push them onto the task queue
+        '''
+        ots_client = self.pool_ots.getConnector()
+        try:
+            # fetch the newest create_time that has already been synchronized
+            conn_mysql = getConnection_mysql()
+            cursor = conn_mysql.cursor()
+            sql = "select max(create_time) from attach_document_richtext "
+            cursor.execute(sql)
+            rows = cursor.fetchall()
+            max_crtime = ""
+            if len(rows)>0 and rows[0][0] is not None:
+                max_crtime = rows[0][0]
+            conn_mysql.close()
+
+            bool_query = BoolQuery(must_queries=[RangeQuery("attachmentTypes","",include_lower=False),
+                                                 RangeQuery("crtime",max_crtime)])
+
+            columns = ["partitionkey","docid","crtime"]
+            rows, next_token, total_count, is_all_succeed = ots_client.search(self.document_table, self.document_table_index,
+                                                                              SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("crtime",SortOrder.ASC)]), limit=100, get_total_count=True),
+                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            for _data in list_data:
+                _document = document(_data)
+                task_queue.put(_document,True)
+            # page through the remaining search results
+            while next_token:
+                rows, next_token, total_count, is_all_succeed = ots_client.search(self.document_table, self.document_table_index,
+                                                                                  SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
+                                                                                  ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+                list_data = getRow_ots(rows)
+                for _data in list_data:
+                    _document = document(_data)
+                    task_queue.put(_document,True)
+        except Exception as e:
+            # log the failure instead of silently swallowing it
+            log("producer failed cause of %s"%(str(e)))
+            log(traceback.format_exc())
+        self.pool_ots.putConnector(ots_client)
+
+    def comsumer(self,task_queue):
+        '''
+        drain the task queue with a pool of worker threads, inserting one
+        attach_document_richtext row per queued document
+        '''
+        def _handle(_document,result_queue,pool_mysql):
+            conn_mysql = pool_mysql.getConnector()
+            # register the document in mysql for later attachment processing
+            _attach_document = attach_document_richtext({"docid":_document.docid,
+                                                         "create_time":_document.crtime,
+                                                         "insert_time":getCurrent_date(),
+                                                         "type":0})
+            _attach_document.insert_row(conn_mysql)
+            pool_mysql.putConnector(conn_mysql)
+
+        result_queue = queue.Queue()
+
+        mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=self.pool_mysql)
+        mt.run()
+
+    def waitTask(self,task_queue):
+        # wait up to 60 seconds for the producer to enqueue the first task
+        for i in range(60):
+            if task_queue.qsize()>0:
+                return True
+            else:
+                time.sleep(1)
+        return False
+
+    def maxcompute2ots(self):
+        task_queue = queue.Queue(maxsize=10000)
+
+        thread_producer = Thread(target=self.producer,args=(task_queue,))
+        thread_producer.start()
+
+        # start the consumer only once the producer has queued at least one task
+        if self.waitTask(task_queue):
+            thread_comsumer = Thread(target=self.comsumer,args=(task_queue,))
+            thread_comsumer.start()
+
+    def scheduler(self):
+        # run the synchronization every 10 minutes
+        _scheduler = BlockingScheduler()
+        _scheduler.add_job(self.maxcompute2ots,"cron",minute="*/10")
+        _scheduler.start()
+
+class Data_Dealing_Synchronization():
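+    '''
+    Producer/consumer job: claim attach_document_richtext rows in STATUS_TODEAL,
+    download each attachment, convert it to html through the deal interface,
+    write the html back into the OTS document, and record the outcome.
+    '''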
+
+    def __init__(self):
+        self.done_lock = RLock()
+        self.isDone = False
+        self.document_table = "document"
+        self.document_table_index = "document_index"
+        self.pool_ots = ConnectorPool(init_num=10,max_num=40,method_init=getConnect_ots)
+        self.pool_mysql = ConnectorPool(init_num=10,max_num=40,method_init=getConnection_mysql)
+        self.deal_type = 0 # real-time processing
+
+    def producer(self,task_queue):
+        '''
+        produce data: load a batch of up to 2000 rows that are still
+        STATUS_TODEAL and push them onto the task queue
+        '''
+        conn = self.pool_mysql.getConnector()
+        list_attachDocument = BaseModel.select_rows(conn,attach_document_richtext,"attach_document_richtext",[("status",STATUS_TODEAL),("type",0)],2000)
+        for _ad in list_attachDocument:
+            task_queue.put(_ad,True)
+        # return the connection to the pool
+        self.pool_mysql.putConnector(conn)
+
+    def comsumer(self,task_queue):
+
+        def _handle(_ad,result_queue,pool_ots,pool_mysql):
+            ots_client = pool_ots.getConnector()
+            conn = pool_mysql.getConnector()
+
+            try:
+                docid = _ad.docid
+                _ad.status = STATUS_DEALING
+                if _ad.getProperties().get("deal_count",0)>=MAX_DEAL_COUNT:
+                    _ad.status = STATUS_FAILED
+                    _ad.setValue("message",str(_ad.getProperties().get("message",""))+" exceeded max deal count ")
+                    _ad.update_row(conn)
+                else:
+                    _ad.setValue("deal_count",_ad.getProperties().get("deal_count",0)+1)
+                    # the guarded update acts as an optimistic lock: only the thread
+                    # that flips the row away from STATUS_TODEAL gets to process it
+                    if _ad.update_row(conn,[("status",STATUS_TODEAL)])==1:
+                        # documents are spread over 500 partitions keyed by docid
+                        partitionkey = int(docid)%500+1
+                        # fetch the document html to collect the attachment links
+                        _dict = document.search(ots_client,"document",[("partitionkey",partitionkey),("docid",int(docid))],["dochtmlcon"])
+                        if _dict is not None:
+                            _document = document(_dict)
+                            _dochtmlcon = _document.getProperties().get("dochtmlcon","")
+
+                            # assumption: getAttachmentUrls extracts the attachment links
+                            # from the document html; the original called it without arguments
+                            list_url = getAttachmentUrls(_dochtmlcon)
+                            list_html = []
+                            message = ""
+
+                            if len(list_url)==0:
+                                message += " no attachment links "
+
+                            # download the content behind every link and convert it
+                            download_flag = False
+                            deal_flag = False
+                            for _url,_type in list_url:
+                                if not _document.isLegalUrl(_url,self.deal_type):
+                                    continue
+                                _success,_data = download(_url)
+                                if not _success:
+                                    message += " download failed for link %s "%_url
+                                else:
+                                    download_flag = True
+                                    _data_base64 = base64.b64encode(_data)
+                                    # hand the payload to the processing interface
+                                    _success,_html = getAttachDealInterface(_data_base64,_type)
+                                    if _success:
+                                        deal_flag = True
+                                        list_html.append(_html)
+                                    else:
+                                        message += " processing failed for link %s "%_url
+                            _attach_status = STATUS_DONE
+                            if download_flag and deal_flag:
+                                # write the converted html back into the document
+                                _document.updateAttachment(list_html)
+                                _document.update_row(ots_client)
+                            else:
+                                _attach_status = STATUS_FAILED
+
+                            _ad.setValue("message",message)
+                            _ad.setValue("status",_attach_status)
+                            _ad.setValue("update_time",getCurrent_date())
+                            _ad.update_row(conn)
+                        else:
+                            _ad.setValue("message"," document not found ")
+                            _ad.setValue("status",STATUS_FAILED)
+                            _ad.setValue("update_time",getCurrent_date())
+                            _ad.update_row(conn)
+
+            except Exception as e:
+                log("comsumer failed cause of %s"%(str(e)))
+                log(traceback.format_exc())
+                # release the row so a later run can retry it
+                _ad.status = STATUS_TODEAL
+                _ad.update_row(conn)
+
+            pool_ots.putConnector(ots_client)
+            pool_mysql.putConnector(conn)
+
+        result_queue = queue.Queue()
+
+        mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_ots=self.pool_ots,pool_mysql=self.pool_mysql)
+        mt.run()
+
+    def waitTask(self,task_queue):
+        # wait up to 60 seconds for the producer to enqueue the first task
+        for i in range(60):
+            if task_queue.qsize()>0:
+                return True
+            else:
+                time.sleep(1)
+        return False
+
+    def maxcompute2ots(self):
+        task_queue = queue.Queue(maxsize=10000)
+
+        thread_producer = Thread(target=self.producer,args=(task_queue,))
+        thread_producer.start()
+
+        # start the consumer only once the producer has queued at least one task
+        if self.waitTask(task_queue):
+            thread_comsumer = Thread(target=self.comsumer,args=(task_queue,))
+            thread_comsumer.start()
+
+    def scheduler(self):
+        # run the attachment processing every 10 minutes
+        _scheduler = BlockingScheduler()
+        _scheduler.add_job(self.maxcompute2ots,"cron",minute="*/10")
+        _scheduler.start()
+
+def startSychro_toDeal():
+    ds_toDeal = Data_toDeal_Synchronization()
+    ds_toDeal.scheduler()
+
+def startSychro_Dealing():
+    ds_Dealing = Data_Dealing_Synchronization()
+    ds_Dealing.scheduler()
+
+if __name__=="__main__":
+    pass
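+    # usage sketch (assumption: each job runs in its own process, since
+    # BlockingScheduler blocks the calling thread):
+    #     startSychro_toDeal()
+    #     startSychro_Dealing()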