@@ -9,6 +9,9 @@ from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
 import os
 from BaseDataMaintenance.common.ossUtils import *
 from BaseDataMaintenance.dataSource.pool import ConnectorPool
+from BaseDataMaintenance.model.ots.document import Document
+
+from BaseDataMaintenance.common.Utils import article_limit


 class ActiveMQListener():
@@ -38,7 +41,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
         self.mq_attachment = "/queue/dataflow_attachment"
         self.mq_attachment_failed = "/queue/dataflow_attachment_failed"
         self.mq_extract = "/queue/dataflow_extract"
-        self.comsumer_count = 50
+        self.comsumer_count = 80
         self.retry_comsumer_count = 10
         self.retry_times = 5
         for _i in range(self.comsumer_count):
@@ -46,6 +49,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
             createComsumer(listener_attachment,self.mq_attachment)
         self.attach_pool = ConnectorPool(10,30,getConnection_postgres)
         self.conn_mq = getConnect_activateMQ()
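+        # pool of mq connections used for sending; send_msg_toacmq is called with this pool
+        # instead of the single shared connection above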
+        self.pool_mq = ConnectorPool(10,30,getConnect_activateMQ)


     def process_failed_attachment(self):
@@ -98,6 +102,9 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 if not _not_failed:
                     return False,list_html,swf_urls
             return True,list_html,swf_urls
+
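+        # propagate connection errors to the caller instead of treating them as a failed recognition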
+        except requests.ConnectionError as e1:
+            raise e1
         except Exception as e:
             return False,list_html,swf_urls

@@ -135,9 +142,9 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 item["retry_times"] = _retry_times+1
                 #if it has failed more than 5 times, put it into the failed queue; messages in that queue are reprocessed once during idle time
                 if item["retry_times"]>=self.retry_times:
-                    send_msg_toacmq(self.conn_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment_failed)
+                    send_msg_toacmq(self.pool_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment_failed)

-                send_msg_toacmq(self.conn_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment)
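+                # keep the send result so the message is only acknowledged when the re-enqueue succeeded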
+                send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment)

                 #save the failure
                 dtmp.setValue(document_tmp_dochtmlcon,"",False)
@@ -145,7 +152,8 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 if not dtmp.exists_row(self.ots_client):
                     dtmp.update_row(self.ots_client)
                     dhtml.update_row(self.ots_client)
-                _to_ack = True
+                if send_succeed:
+                    _to_ack = True

             else:
                 try:
@@ -155,8 +163,9 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):

                     dtmp.setValue(document_tmp_attachment_extract_status,1,True)
                     dtmp.setValue(document_tmp_dochtmlcon,dhtml.getProperties().get(document_tmp_dochtmlcon),True)
-                    send_msg_toacmq(self.conn_mq,json.dumps(dtmp.getProperties(),cls=MyEncoder),self.mq_extract)
-                    _to_ack = True
+                    send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(dtmp.getProperties(),cls=MyEncoder),self.mq_extract)
+                    if send_succeed:
+                        _to_ack = True
                 except Exception as e:
                     traceback.print_exc()

@@ -164,8 +173,8 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 ackMsg(conn,message_id)
             log("document:%d get attachments with result:%s %s retry_times:%d"%(item.get("docid"),str(_succeed),str(_to_ack),_retry_times))
         except Exception as e:
-            send_msg_toacmq(self.conn_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment)
-            ackMsg(conn,message_id)
+            if send_msg_toacmq(self.pool_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment):
+                ackMsg(conn,message_id)


@@ -179,7 +188,10 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
         objectPath = attach.getProperties().get(attachment_path)
         docids = attach.getProperties().get(attachment_docids)

-        relative_path = objectPath[5:]
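+        # when no object path is stored for the attachment, build a fresh relative path from a uuid;
+        # otherwise keep the old behaviour of dropping the leading 5-character prefix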
+        if objectPath is None:
+            relative_path = "%s/%s"%(_uuid.hex[:4],_uuid.hex)
+        else:
+            relative_path = objectPath[5:]
         localpath = "/FileInfo/%s"%(relative_path)
         if not os.path.exists(localpath):
             if not os.path.exists(os.path.dirname(localpath)):
@@ -203,12 +215,13 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 log("md5:%s path:%s exists"%(filemd5,objectPath[5:]))
             if not (local_exists or download_succeed):
                 _ots_attach = attachment(attach.getProperties_ots())
-                _ots_exists = _ots_attach.fix_columns(self.ots_client,[attachment_attachmenthtml,attachment_attachmentcon,attachment_path,attachment_status],True)
+                _ots_exists = _ots_attach.fix_columns(self.ots_client,[attachment_attachmenthtml,attachment_attachmentcon,attachment_path,attachment_status,attachment_filetype],True)
                 log("md5:%s path:%s file not in local or oss,search ots.attachment"%(filemd5,objectPath[5:]))
                 if _ots_attach.getProperties().get(attachment_attachmenthtml,"")!="":
                     attach.setValue(attachment_attachmenthtml,_ots_attach.getProperties().get(attachment_attachmenthtml,""))
                     attach.setValue(attachment_attachmentcon,_ots_attach.getProperties().get(attachment_attachmentcon,""))
                     attach.setValue(attachment_status,_ots_attach.getProperties().get(attachment_status,""))
+                    attach.setValue(attachment_filetype,_ots_attach.getProperties().get(attachment_filetype,""))
                     if attach.exists(self.attach_pool):
                         attach.update_row(self.attach_pool)
                     else:
@@ -253,10 +266,14 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 return True

             time_download = time.time()-d_start_time
-            _data_base64 = base64.b64encode(open(localpath,"rb").read())
+
             #call the interface to process the attachment and get the result
             start_time = time.time()
-            _success,_html,swf_images = getAttachDealInterface(_data_base64,_filetype)
+            _filetype = attach.getProperties().get(attachment_filetype)
+
+            # _data_base64 = base64.b64encode(open(localpath,"rb").read())
+            # _success,_html,swf_images = getAttachDealInterface(_data_base64,_filetype)
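+            # hand the local file path to the processing interface instead of a base64-encoded payload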
+            _success,_html,swf_images = getAttachDealInterface(None,_filetype,path=localpath)
             log("process filemd5:%s %s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d"%(filemd5,str(_success),_filetype,round(_size/1024/1024,4),time_download,time.time()-start_time,len(_html)))
             if _success:
                 if len(_html)<5:
@@ -331,6 +348,9 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
             else:
                 return True

+        except requests.ConnectionError as e1:
+            raise e1
+
         except oss2.exceptions.NotFound as e:
             return True

@@ -386,6 +406,15 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                               attachment_path:"%s/%s"%(_filemd5[:4],_path),
                               attachment_crtime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
                    list_attachment.append(Attachment_postgres(_attach))
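+                # otherwise look the md5 up in the ots attachment table and reuse the already-processed record if one exists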
+                else:
+                    log("getAttachments search in ots:%s"%(_filemd5))
+                    _attach = {attachment_filemd5:_filemd5}
+                    _attach_ots = attachment(_attach)
+                    _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls],True)
+                    if _attach_ots.getProperties().get(attachment_status) is not None:
+                        log("getAttachments find in ots:%s"%(_filemd5))
+                        list_attachment.append(Attachment_postgres(_attach_ots.getProperties()))
+
             return list_attachment
         except Exception as e:
             log("attachProcess comsumer error %s"%str(e))
@@ -490,24 +519,42 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):

         self.industy_url = "http://127.0.0.1:15000/industry_extract"

-        self.extract_interfaces = ["http://127.0.0.1:15030/content_extract",
-                                   "http://192.168.0.115:15030/content_extract"
+        self.extract_interfaces = [["http://127.0.0.1:15030/content_extract",11],
+                                   ["http://192.168.0.115:15030/content_extract",10]
                                    ]


         self.mq_extract = "/queue/dataflow_extract"

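+        # turn the configured [url, weight] pairs into cumulative fractions of the total weight,
+        # so a single uniform random draw in getExtract_url picks each interface with probability
+        # proportional to its weight (11/21 vs 10/21 here)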
+        whole_weight = 0
+        for _url,weight in self.extract_interfaces:
+            whole_weight+= weight
+        current_weight = 0
+        for _i in range(len(self.extract_interfaces)):
+            current_weight += self.extract_interfaces[_i][1]
+            self.extract_interfaces[_i][1] = current_weight/whole_weight
+

-        self.comsumer_count = 25
+
+        self.comsumer_count = 20
         for _i in range(self.comsumer_count):
             listener_extract = self.ExtractListener(getConnect_activateMQ(),self.comsumer_handle)
             createComsumer(listener_extract,self.mq_extract)
         self.conn_mq = getConnect_activateMQ()
+        self.pool_mq = ConnectorPool(10,30,getConnect_activateMQ)
+
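+    # choose an extract interface by weighted random selection over the cumulative
+    # thresholds prepared in __init__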
+    def getExtract_url(self):
+        _r = random.random()
+        for _i in range(len(self.extract_interfaces)):
+            if _r<=self.extract_interfaces[_i][1]:
+                return self.extract_interfaces[_i][0]

     def request_extract_interface(self,json,headers):
-        _i = random.randint(0,len(self.extract_interfaces)-1)
+        # _i = random.randint(0,len(self.extract_interfaces)-1)
         # _i = 0
-        resp = requests.post(self.extract_interfaces[_i],json=json,headers=headers)
+        # _url = self.extract_interfaces[_i]
+        _url = self.getExtract_url()
+        resp = requests.post(_url,json=json,headers=headers)
         return resp


@@ -541,6 +588,13 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                                               "docid":item.get("docid")})

             _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
+
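+            # cap very large documents at 200000 characters with article_limit before handing them to the extractor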
+            if len(_dochtmlcon)>200000:
+                _soup = BeautifulSoup(_dochtmlcon,"lxml")
+                _soup = article_limit(_soup,200000)
+                _dochtmlcon = str(_soup)
+
+
             dhtml.setValue(document_tmp_dochtmlcon,_dochtmlcon,True)
             _extract = Document_extract({})
             _extract.setValue(document_extract2_partitionkey,item.get(document_partitionkey))
@@ -578,7 +632,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             try:
                 if all_done!=1:
                     sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
-                    send_msg_toacmq(self.conn_mq,frame.body,self.mq_extract)
+                    send_succeed = send_msg_toacmq(self.pool_mq,frame.body,self.mq_extract)


                 #save the failure
@@ -587,7 +641,8 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 if not dtmp.exists_row(self.ots_client):
                     dtmp.update_row(self.ots_client)
                     dhtml.update_row(self.ots_client)
-                _to_ack = True
+                if send_succeed:
+                    _to_ack = True
             else:

                 dtmp.setValue(document_tmp_dochtmlcon,"",False)
@@ -606,14 +661,13 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
         except Exception as e:
             traceback.print_exc()
             log("process %s docid: failed message_id:%s"%(data["doc_id"],message_id))
-            send_msg_toacmq(self.conn_mq,frame.body,self.mq_extract)
-            ackMsg(conn,message_id,subscription)
+            if send_msg_toacmq(self.pool_mq,frame.body,self.mq_extract):
+                ackMsg(conn,message_id,subscription)


     def start_flow_extract(self):
         schedule = BlockingScheduler()
         schedule.add_job(self.flow_extract_producer,"cron",second="*/20")
-        # schedule.add_job(self.flow_extract,"cron",second="*/10")
         schedule.start()

 from multiprocessing import RLock
@@ -665,6 +719,7 @@ class Dataflow_init(Dataflow):
         self.begin_docid = None
         self.mq_attachment = "/queue/dataflow_attachment"
         self.mq_extract = "/queue/dataflow_extract"
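+        # small dedicated pool used by the init listener to forward messages to the attachment/extract queues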
+        self.pool_mq1 = ConnectorPool(1,4,getConnect_activateMQ)

     def on_error(self, headers):
         log('received an error %s' % headers.body)
@@ -689,25 +744,38 @@ class Dataflow_init(Dataflow):
             body[document_tmp_partitionkey] = partitionkey
             body[document_tmp_docid] = next_docid
             page_attachments = body.get(document_tmp_attachment_path,"[]")
+            _uuid = body.get(document_tmp_uuid,"")
             if page_attachments!="[]":
                 status = random.randint(1,10)
                 body[document_tmp_status] = status
-                send_msg_toacmq(self.conn,json.dumps(body,cls=MyEncoder),self.mq_attachment)
-                ackMsg(self.conn,message_id)
+                if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_attachment):
+                    log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
+                    ackMsg(self.conn,message_id)
+                else:
+                    log("send_msg_error on init listener")
             else:
                 status = random.randint(11,50)
                 body[document_tmp_status] = status
-                send_msg_toacmq(self.conn,json.dumps(body,cls=MyEncoder),self.mq_extract)
-                ackMsg(self.conn,message_id)
+                if send_msg_toacmq(self.pool_mq1,json.dumps(body,cls=MyEncoder),self.mq_extract):
+                    log("uuid:%s with docid:%s"%(str(_uuid),str(next_docid)))
+                    ackMsg(self.conn,message_id)
+                else:
+                    log("send_msg_error on init listener")

     def __del__(self):
         self.conn.disconnect()
+        del self.pool_mq1

     def __init__(self):
         Dataflow.__init__(self)
         self.mq_init = "/queue/dataflow_init"
+
+        self.mq_attachment = "/queue/dataflow_attachment"
+        self.mq_extract = "/queue/dataflow_extract"
         self.pool_oracle = ConnectorPool(10,15,getConnection_oracle)
-        self.pool_mq = ConnectorPool(10,15,getConnect_activateMQ)
+        self.pool_mq = ConnectorPool(10,30,getConnect_activateMQ)
+
+        self.ots_capacity = getConnect_ots_capacity()

         self.init_comsumer_counts = 2
         for i in range(self.init_comsumer_counts):
@@ -717,24 +785,90 @@ class Dataflow_init(Dataflow):


     def temp2mq(self,object):
-        conn_mq = self.pool_mq.getConnector()
         conn_oracle = self.pool_oracle.getConnector()

         try:
             list_obj = object.select_rows(conn_oracle,type(object),object.table_name,[],limit=1000)
             for _obj in list_obj:
                 ots_dict = _obj.getProperties_ots()
-                send_msg_toacmq(conn_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_init)

-                #delete the data, enable this when going live
-                _obj.delete_row(conn_oracle)
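+                # drop messages whose html is larger than 500000 characters: delete them from
+                # oracle and log them instead of pushing an oversized message onto the queue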
+                if len(ots_dict.get("dochtmlcon",""))>500000:
+                    _obj.delete_row(conn_oracle)
+                    log("msg too long:%s,%d"%(ots_dict.get("uuid"),len(ots_dict.get("dochtmlcon",""))))
+                    continue
+
+                if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_init):
+                    #delete the data, enable this when going live
+                    _obj.delete_row(conn_oracle)
+                else:
+                    log("send_msg_error111:%s,%d"%(ots_dict.get("uuid"),len(ots_dict.get("dochtmlcon",""))))

         except Exception as e:
             traceback.print_exc()
         finally:
-            self.pool_mq.putConnector(conn_mq)
             self.pool_oracle.putConnector(conn_oracle)

+    def ots2mq(self):
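+        # scan ots.document for rows whose status is still between 1 and 51 and push them back onto
+        # the attachment or extract queue, depending on whether they carry attachments; the document
+        # row is only updated in ots when the send succeeds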
+        try:
+            bool_query = BoolQuery(must_queries=[RangeQuery("status",1,51)])
+
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
+                                                                                 SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(document_docid)]),get_total_count=True,limit=100),
+                                                                                 ColumnsToGet(return_type=ColumnReturnType.ALL))
+            list_data = getRow_ots(rows)
+            for _data in list_data:
+                _d = {document_tmp_partitionkey:_data.get(document_tmp_partitionkey),
+                      document_tmp_docid:_data.get(document_tmp_docid),
+                      document_tmp_status:0}
+                _document = Document(_d)
+                page_attachments = _data.get(document_tmp_attachment_path,"[]")
+
+                _document_html = Document(_data)
+                _document_html.fix_columns(self.ots_capacity,[document_tmp_dochtmlcon],True)
+
+                if page_attachments!="[]":
+                    status = random.randint(1,10)
+                    _data[document_tmp_status] = status
+                    send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_attachment)
+                else:
+                    status = random.randint(11,50)
+                    _data[document_tmp_status] = status
+                    send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_extract)
+                if send_succeed:
+                    _document.update_row(self.ots_client)
+                else:
+                    log("send_msg_error2222")
+            while next_token:
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
+                                                                                     SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
+                                                                                     ColumnsToGet(return_type=ColumnReturnType.ALL))
+                list_data = getRow_ots(rows)
+                for _data in list_data:
+                    _d = {document_tmp_partitionkey:_data.get(document_tmp_partitionkey),
+                          document_tmp_docid:_data.get(document_tmp_docid),
+                          document_tmp_status:0}
+                    _document = Document(_d)
+                    page_attachments = _data.get(document_tmp_attachment_path,"[]")
+
+                    _document_html = Document(_data)
+                    _document_html.fix_columns(self.ots_capacity,[document_tmp_dochtmlcon],True)
+
+                    if page_attachments!="[]":
+                        status = random.randint(1,10)
+                        _data[document_tmp_status] = status
+                        send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_attachment)
+                    else:
+                        status = random.randint(11,50)
+                        _data[document_tmp_status] = status
+                        send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_extract)
+                    if send_succeed:
+                        _document.update_row(self.ots_client)
+                    else:
+                        log("send_msg_error2222")
+        except Exception as e:
+            traceback.print_exc()
+

     def test_dump_docid(self):
         class TestDumpListener(ActiveMQListener):
             def on_message(self, headers):
@@ -792,6 +926,7 @@ class Dataflow_init(Dataflow):
         schedule.add_job(self.temp2mq,"cron",args=(TuDiKuangChanTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoDaYiTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoWenJianTemp({}),),second="*/10")
+        schedule.add_job(self.ots2mq,"cron",second="*/10")
         schedule.start()
@@ -886,13 +1021,17 @@ def del_test_doc():
         _html.delete_row(ots_client)

 def fixDoc_to_queue_extract():
+    pool_mq = ConnectorPool(10,20,getConnect_activateMQ)
     try:
         ots_client = getConnect_ots()
-        conn_mq = getConnect_activateMQ()
-        bool_query = BoolQuery(must_queries=[RangeQuery("status",range_to=66)])
+        ots_capacity = getConnect_ots_capacity()
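+        # requeue documents of docchannel 114 created since 2022-05-31 from the main document table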
+        bool_query = BoolQuery(must_queries=[
+            RangeQuery("crtime","2022-05-31"),
+            TermQuery("docchannel",114)
+        ])

         list_data = []
-        rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),get_total_count=True,limit=100),
                                                                        columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
         print(total_count)
@@ -900,7 +1039,7 @@ def fixDoc_to_queue_extract():
         list_data.extend(list_row)
         _count = len(list_row)
         while next_token:
-            rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                            SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
                                                                            columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))

@@ -912,39 +1051,51 @@ def fixDoc_to_queue_extract():
         for _row in list_data:
             if "all_columns" in _row:
                 _row.pop("all_columns")
-            _html = Document_html(_row)
+            _html = Document(_row)
             task_queue.put(_html)
         def _handle(item,result_queue):
             _html = item
-            _html.fix_columns(ots_client,["dochtmlcon"],True)
+            _html.fix_columns(ots_capacity,["dochtmlcon"],True)
             print(_html.getProperties().get(document_tmp_docid))
-            send_msg_toacmq(conn_mq,json.dumps(_html.getProperties()),"/queue/dataflow_extract")
+            send_msg_toacmq(pool_mq,json.dumps(_html.getProperties()),"/queue/dataflow_extract")
         mt = MultiThreadHandler(task_queue,_handle,None,30)
         mt.run()
     except Exception as e:
         traceback.print_exc()
+    finally:
+        pool_mq.destory()

 def check_data_synchronization():
-    filepath = "C:\\Users\\Administrator\\Desktop\\to_check.log"
-    list_uuid = []
-    _regrex = "ID='(?P<uuid>.+)'"
-    with open(filepath,"r",encoding="utf8") as f:
-        while 1:
-            _line = f.readline()
-            if not _line:
-                break
-            _match = re.search(_regrex,_line)
-            if _match is not None:
-                _uuid = _match.groupdict().get("uuid")
-                if _uuid is not None:
-                    list_uuid.append(_uuid)
-    print(len(list_uuid))
+    # filepath = "C:\\Users\\Administrator\\Desktop\\to_check.log"
+    # list_uuid = []
+    # _regrex = "delete\s+(?P<tablename>[^\s]+)\s+.*ID='(?P<uuid>.+)'"
+    # with open(filepath,"r",encoding="utf8") as f:
+    #     while 1:
+    #         _line = f.readline()
+    #         if not _line:
+    #             break
+    #         _match = re.search(_regrex,_line)
+    #         if _match is not None:
+    #             _uuid = _match.groupdict().get("uuid")
+    #             tablename = _match.groupdict.get("tablename")
+    #             if _uuid is not None:
+    #                 list_uuid.append({"uuid":_uuid,"tablename":tablename})
+    # print("total_count:",len(list_uuid))
+
+    import pandas as pd
+    from BaseDataMaintenance.common.Utils import load
+
+
     task_queue = Queue()
     list_data = []
-    for _uuid in list_uuid:
-        _dict = {"uuid":_uuid}
+    df_data = load("uuid.pk")
+    # df_data = pd.read_excel("check.xlsx")
+    for uuid,tablename in zip(df_data["uuid"],df_data["tablename"]):
+        _dict = {"uuid":uuid,
+                 "tablename":tablename}
         list_data.append(_dict)
         task_queue.put(_dict)
+    print("qsize:",task_queue.qsize())
     ots_client = getConnect_ots()
     def _handle(_item,result_queue):
         bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
@@ -957,14 +1108,35 @@ def check_data_synchronization():
     mt = MultiThreadHandler(task_queue,_handle,None,30)
     mt.run()
     df_data = {"uuid":[],
+               "tablename":[],
                "exists":[]}
     for _data in list_data:
-        for k,v in df_data.items():
-            v.append(_data.get(k))
+        if _data["exists"]==0:
+            for k,v in df_data.items():
+                v.append(_data.get(k))
     import pandas as pd
     df2 = pd.DataFrame(df_data)
-    df2.to_excel("check.xlsx")
+    df2.to_excel("check1.xlsx")
+
+current_path = os.path.abspath(os.path.dirname(__file__))

+def fixDoc_to_queue_init(filename=""):
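+    # re-insert records marked as missing (exists==0) in check.xlsx from the source oracle table
+    # back into its *_TEMP table, so the init flow can pick them up and sync them again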
+    import pandas as pd
+    from BaseDataMaintenance.model.oracle.GongGaoTemp import dict_oracle2ots
+    if filename=="":
+        filename = os.path.join(current_path,"check.xlsx")
+    df = pd.read_excel(filename)
+    dict_oracle2ots.pop("docchannel")
+    row_name = ",".join(list(dict_oracle2ots.keys()))
+    conn = getConnection_oracle()
+    cursor = conn.cursor()
+    for uuid,tablename,_exists in zip(df["uuid"],df["tablename"],df["exists"]):
+        if _exists==0:
+            _source = str(tablename).replace("_TEMP","")
+            sql = " insert into %s(%s) select %s from %s where id='%s' "%(tablename,row_name,row_name,_source,uuid)
+            cursor.execute(sql)
+            log(sql)
+    conn.commit()

 if __name__ == '__main__':
     # di = Dataflow_init()
@@ -974,4 +1146,5 @@ if __name__ == '__main__':
     # de = Dataflow_ActivteMQ_extract()
     # de.start_flow_extract()
     # fixDoc_to_queue_extract()
-    check_data_synchronization()
+    # check_data_synchronization()
+    fixDoc_to_queue_init()