|
@@ -108,7 +108,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
|
|
|
def start_attachment_listener(self):
|
|
def start_attachment_listener(self):
|
|
for _i in range(self.comsumer_count):
|
|
for _i in range(self.comsumer_count):
|
|
- listener_attachment = self.AttachmentMQListener(getConnect_activateMQ(),self.attachment_listener_handler,_i)
|
|
|
|
|
|
+ listener_attachment = self.AttachmentMQListener(getConnect_activateMQ(),self.attachment_listener_handler ,_i)
|
|
createComsumer(listener_attachment,self.mq_attachment)
|
|
createComsumer(listener_attachment,self.mq_attachment)
|
|
self.list_attachment_comsumer.append(listener_attachment)
|
|
self.list_attachment_comsumer.append(listener_attachment)
|
|
|
|
|
|
@@ -254,11 +254,15 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
'''
|
|
'''
|
|
|
|
|
|
try:
|
|
try:
|
|
|
|
+ start_time = time.time()
|
|
|
|
+
|
|
item = _dict.get("item")
|
|
item = _dict.get("item")
|
|
list_attach = _dict.get("list_attach")
|
|
list_attach = _dict.get("list_attach")
|
|
conn = _dict["conn"]
|
|
conn = _dict["conn"]
|
|
message_id = _dict.get("message_id")
|
|
message_id = _dict.get("message_id")
|
|
|
|
|
|
|
|
+ if "retry_times" not in item:
|
|
|
|
+ item["retry_times"] = 5
|
|
_retry_times = item.get("retry_times",0)
|
|
_retry_times = item.get("retry_times",0)
|
|
dhtml = Document_html({"partitionkey":item.get("partitionkey"),
|
|
dhtml = Document_html({"partitionkey":item.get("partitionkey"),
|
|
"docid":item.get("docid")})
|
|
"docid":item.get("docid")})
|
|
@@ -269,7 +273,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
dtmp = Document_tmp(item)
|
|
dtmp = Document_tmp(item)
|
|
|
|
|
|
|
|
|
|
- start_time = time.time()
|
|
|
|
|
|
+
|
|
#调用识别接口
|
|
#调用识别接口
|
|
_succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
|
|
_succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
|
|
|
|
|
|
@@ -978,6 +982,8 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
def comsumer_handle(self,_dict,result_queue):
|
|
def comsumer_handle(self,_dict,result_queue):
|
|
try:
|
|
try:
|
|
log("start handle")
|
|
log("start handle")
|
|
|
|
+ data = {}
|
|
|
|
+
|
|
frame = _dict["frame"]
|
|
frame = _dict["frame"]
|
|
conn = _dict["conn"]
|
|
conn = _dict["conn"]
|
|
message_id = frame.headers["message-id"]
|
|
message_id = frame.headers["message-id"]
|
|
@@ -1026,7 +1032,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
_extract.setValue(document_extract2_docid,item.get(document_docid))
|
|
_extract.setValue(document_extract2_docid,item.get(document_docid))
|
|
all_done = 1
|
|
all_done = 1
|
|
|
|
|
|
- data = {}
|
|
|
|
|
|
+
|
|
for k,v in item.items():
|
|
for k,v in item.items():
|
|
data[k] = v
|
|
data[k] = v
|
|
data["timeout"] = 440
|
|
data["timeout"] = 440
|
|
@@ -1043,7 +1049,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
data["web_source_name"] = item.get(document_tmp_web_source_name,"")
|
|
data["web_source_name"] = item.get(document_tmp_web_source_name,"")
|
|
data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
|
|
data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
|
|
|
|
|
|
- _fingerprint = getFingerprint(str(data["title"])+str(data["content"]))
|
|
|
|
|
|
+ _fingerprint = getFingerprint(str(data["title"])+str(data["content"]))+str(data["original_docchannel"])
|
|
|
|
|
|
if all_done>0:
|
|
if all_done>0:
|
|
_time = time.time()
|
|
_time = time.time()
|
|
@@ -1078,6 +1084,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
# if all_done>0 and len(_extract.getProperties().get(document_extract2_extract_json,""))<=2:
|
|
# if all_done>0 and len(_extract.getProperties().get(document_extract2_extract_json,""))<=2:
|
|
# all_done = -4
|
|
# all_done = -4
|
|
_extract.setValue(document_extract2_industry_json,"{}",True)
|
|
_extract.setValue(document_extract2_industry_json,"{}",True)
|
|
|
|
+ _to_ack = True
|
|
try:
|
|
try:
|
|
if all_done!=1:
|
|
if all_done!=1:
|
|
sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
|
|
sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
|
|
@@ -1138,7 +1145,11 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
|
|
|
if _to_ack:
|
|
if _to_ack:
|
|
ackMsg(conn,message_id,subscription)
|
|
ackMsg(conn,message_id,subscription)
|
|
- log("process %s docid:%d %s"%(str(_to_ack),data["doc_id"],str(all_done)))
|
|
|
|
|
|
+ else:
|
|
|
|
+ item["extract_times"] -= 1
|
|
|
|
+ send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract)
|
|
|
|
+ ackMsg(conn,message_id,subscription)
|
|
|
|
+ log("process %s docid:%d %s"%(str(_to_ack),data.get("doc_id"),str(all_done)))
|
|
except requests.ConnectionError as e1:
|
|
except requests.ConnectionError as e1:
|
|
item["extract_times"] -= 1
|
|
item["extract_times"] -= 1
|
|
if send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract):
|
|
if send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract):
|
|
@@ -1146,7 +1157,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
except Exception as e:
|
|
except Exception as e:
|
|
traceback.print_exc()
|
|
traceback.print_exc()
|
|
sentMsgToDD("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
|
|
sentMsgToDD("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
|
|
- log("process %s docid: failed message_id:%s"%(data["doc_id"],message_id))
|
|
|
|
|
|
+ log("process %s docid: failed message_id:%s"%(data.get("doc_id"),message_id))
|
|
if extract_times>=10:
|
|
if extract_times>=10:
|
|
#process as succeed
|
|
#process as succeed
|
|
dtmp.setValue(document_tmp_dochtmlcon,"",False)
|
|
dtmp.setValue(document_tmp_dochtmlcon,"",False)
|
|
@@ -1379,7 +1390,7 @@ class Dataflow_init(Dataflow):
|
|
traceback.print_exc()
|
|
traceback.print_exc()
|
|
self.pool_oracle.decrease()
|
|
self.pool_oracle.decrease()
|
|
|
|
|
|
- def shengpi2mq(self):
|
|
|
|
|
|
+ def shenpi2mq(self):
|
|
|
|
|
|
conn_oracle = self.pool_oracle.getConnector()
|
|
conn_oracle = self.pool_oracle.getConnector()
|
|
|
|
|
|
@@ -1395,32 +1406,46 @@ class Dataflow_init(Dataflow):
|
|
if max_shenpi_id>self.base_shenpi_id:
|
|
if max_shenpi_id>self.base_shenpi_id:
|
|
max_shenpi_id -= self.base_shenpi_id
|
|
max_shenpi_id -= self.base_shenpi_id
|
|
self.max_shenpi_id = max_shenpi_id
|
|
self.max_shenpi_id = max_shenpi_id
|
|
|
|
+
|
|
|
|
+ if self.max_shenpi_id<60383953:
|
|
|
|
+ self.max_shenpi_id = 60383953
|
|
|
|
+
|
|
|
|
+
|
|
if self.max_shenpi_id is not None:
|
|
if self.max_shenpi_id is not None:
|
|
# select data in order
|
|
# select data in order
|
|
- list_data = T_SHEN_PI_XIANG_MU.select_rows(conn_oracle,self.max_shenpi_id,)
|
|
|
|
|
|
|
|
- # send data to mq one by one with max_shenpi_id updated
|
|
|
|
- for _data in list_data:
|
|
|
|
- _id = _data.getProperties().get(T_SHEN_PI_XIANG_MU_ID)
|
|
|
|
-
|
|
|
|
- ots_dict = _data.getProperties_ots()
|
|
|
|
- if ots_dict["docid"]<self.base_shenpi_id:
|
|
|
|
- ots_dict["docid"] += self.base_shenpi_id
|
|
|
|
-
|
|
|
|
- if ots_dict.get(T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS,"") !='[]':
|
|
|
|
- if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_attachment):
|
|
|
|
- self.max_shenpi_id = _id
|
|
|
|
- else:
|
|
|
|
- log("sent shenpi message to mq failed %s"%(_id))
|
|
|
|
- break
|
|
|
|
- else:
|
|
|
|
- if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_extract):
|
|
|
|
- self.max_shenpi_id = _id
|
|
|
|
- else:
|
|
|
|
- log("sent shenpi message to mq failed %s"%(_id))
|
|
|
|
- break
|
|
|
|
|
|
+ origin_max_shenpi_id = T_SHEN_PI_XIANG_MU.get_max_id(conn_oracle)
|
|
|
|
+
|
|
|
|
+ if origin_max_shenpi_id is not None:
|
|
|
|
+ log("shenpi origin_max_shenpi_id:%d current_id:%d"%(origin_max_shenpi_id,self.max_shenpi_id))
|
|
|
|
+ for _id_i in range(self.max_shenpi_id+1,origin_max_shenpi_id+1):
|
|
|
|
+ list_data = T_SHEN_PI_XIANG_MU.select_rows(conn_oracle,_id_i)
|
|
|
|
+
|
|
|
|
+ # send data to mq one by one with max_shenpi_id updated
|
|
|
|
+ for _data in list_data:
|
|
|
|
+
|
|
|
|
+ _id = _data.getProperties().get(T_SHEN_PI_XIANG_MU_ID)
|
|
|
|
+
|
|
|
|
+ ots_dict = _data.getProperties_ots()
|
|
|
|
+ if ots_dict["docid"]<self.base_shenpi_id:
|
|
|
|
+ ots_dict["docid"] += self.base_shenpi_id
|
|
|
|
+
|
|
|
|
+ if ots_dict.get(T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS,"") !='[]':
|
|
|
|
+ if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_attachment):
|
|
|
|
+ self.max_shenpi_id = _id
|
|
|
|
+ else:
|
|
|
|
+ log("sent shenpi message to mq failed %s"%(_id))
|
|
|
|
+ break
|
|
|
|
+ else:
|
|
|
|
+ if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_extract):
|
|
|
|
+ self.max_shenpi_id = _id
|
|
|
|
+ else:
|
|
|
|
+ log("sent shenpi message to mq failed %s"%(_id))
|
|
|
|
+ break
|
|
|
|
+ self.pool_oracle.putConnector(conn_oracle)
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
|
+ log("shenpi error")
|
|
traceback.print_exc()
|
|
traceback.print_exc()
|
|
self.pool_oracle.decrease()
|
|
self.pool_oracle.decrease()
|
|
|
|
|
|
@@ -1594,6 +1619,7 @@ class Dataflow_init(Dataflow):
|
|
from BaseDataMaintenance.model.oracle.ZhaoBiaoDaYiTemp import ZhaoBiaoDaYiTemp
|
|
from BaseDataMaintenance.model.oracle.ZhaoBiaoDaYiTemp import ZhaoBiaoDaYiTemp
|
|
from BaseDataMaintenance.model.oracle.ZhaoBiaoWenJianTemp import ZhaoBiaoWenJianTemp
|
|
from BaseDataMaintenance.model.oracle.ZhaoBiaoWenJianTemp import ZhaoBiaoWenJianTemp
|
|
schedule = BlockingScheduler()
|
|
schedule = BlockingScheduler()
|
|
|
|
+
|
|
schedule.add_job(self.temp2mq,"cron",args=(CaiGouYiXiangTemp({}),),second="*/10")
|
|
schedule.add_job(self.temp2mq,"cron",args=(CaiGouYiXiangTemp({}),),second="*/10")
|
|
schedule.add_job(self.temp2mq,"cron",args=(PaiMaiChuRangTemp({}),),second="*/10")
|
|
schedule.add_job(self.temp2mq,"cron",args=(PaiMaiChuRangTemp({}),),second="*/10")
|
|
schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoGongGaoTemp({}),),second="*/10")
|
|
schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoGongGaoTemp({}),),second="*/10")
|
|
@@ -1609,11 +1635,15 @@ class Dataflow_init(Dataflow):
|
|
schedule.add_job(self.ots2mq,"cron",second="*/10")
|
|
schedule.add_job(self.ots2mq,"cron",second="*/10")
|
|
schedule.add_job(self.otstmp2mq,"cron",second="*/10")
|
|
schedule.add_job(self.otstmp2mq,"cron",second="*/10")
|
|
schedule.add_job(self.monitor_listener,"cron",minute="*/1")
|
|
schedule.add_job(self.monitor_listener,"cron",minute="*/1")
|
|
|
|
+
|
|
|
|
+ schedule.add_job(self.shenpi2mq,"cron",minute="*/1")
|
|
schedule.start()
|
|
schedule.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
+
|
|
def transform_attachment():
|
|
def transform_attachment():
|
|
from BaseDataMaintenance.model.ots.attachment import attachment
|
|
from BaseDataMaintenance.model.ots.attachment import attachment
|
|
from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
|
|
from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
|