|
@@ -525,6 +525,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
|
|
|
|
|
|
self.mq_extract = "/queue/dataflow_extract"
|
|
|
+ self.mq_extract_failed = "/queue/dataflow_extract_failed"
|
|
|
|
|
|
whole_weight = 0
|
|
|
for _url,weight in self.extract_interfaces:
|
|
@@ -587,6 +588,9 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
dhtml = Document_html({"partitionkey":item.get("partitionkey"),
|
|
|
"docid":item.get("docid")})
|
|
|
|
|
|
+ extract_times = item.get("extract_times",0)+1
|
|
|
+ item["extract_times"] = extract_times
|
|
|
+
|
|
|
_dochtmlcon = item.get(document_tmp_dochtmlcon,"")
|
|
|
|
|
|
if len(_dochtmlcon)>200000:
|
|
@@ -633,19 +637,35 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
try:
|
|
|
if all_done!=1:
|
|
|
sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
|
|
|
- send_succeed = send_msg_toacmq(self.pool_mq,frame.body,self.mq_extract)
|
|
|
+ if extract_times>2:
|
|
|
+ #transform to the extract_failed queue
|
|
|
+ if send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract_failed):
|
|
|
+ #process as succeed
|
|
|
+ dtmp.setValue(document_tmp_dochtmlcon,"",False)
|
|
|
+ dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_succeed_to),True)
|
|
|
+ dtmp.update_row(self.ots_client)
|
|
|
+ dhtml.update_row(self.ots_client)
|
|
|
+
|
|
|
+ #replace as {}
|
|
|
+ _extract.setValue(document_extract2_extract_json,"{}",True)
|
|
|
+ _extract.setValue(document_extract2_industry_json,"{}",True)
|
|
|
+ _extract.setValue(document_extract2_status,random.randint(1,50),True)
|
|
|
+ _extract.update_row(self.ots_client)
|
|
|
+ _to_ack = True
|
|
|
+ else:
|
|
|
|
|
|
+ send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract)
|
|
|
|
|
|
- #失败保存
|
|
|
- dtmp.setValue(document_tmp_dochtmlcon,"",False)
|
|
|
- dtmp.setValue(document_tmp_status,60,True)
|
|
|
- if not dtmp.exists_row(self.ots_client):
|
|
|
- dtmp.update_row(self.ots_client)
|
|
|
- dhtml.update_row(self.ots_client)
|
|
|
- if send_succeed:
|
|
|
- _to_ack = True
|
|
|
+ #失败保存
|
|
|
+ dtmp.setValue(document_tmp_dochtmlcon,"",False)
|
|
|
+ dtmp.setValue(document_tmp_status,60,True)
|
|
|
+ if not dtmp.exists_row(self.ots_client):
|
|
|
+ dtmp.update_row(self.ots_client)
|
|
|
+ dhtml.update_row(self.ots_client)
|
|
|
+ if send_succeed:
|
|
|
+ _to_ack = True
|
|
|
else:
|
|
|
-
|
|
|
+ #process succeed
|
|
|
dtmp.setValue(document_tmp_dochtmlcon,"",False)
|
|
|
dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_succeed_to),True)
|
|
|
dtmp.update_row(self.ots_client)
|
|
@@ -661,9 +681,26 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
log("process %s docid:%d %s"%(str(_to_ack),data["doc_id"],str(all_done)))
|
|
|
except Exception as e:
|
|
|
traceback.print_exc()
|
|
|
+ sentMsgToDD("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
|
|
|
log("process %s docid: failed message_id:%s"%(data["doc_id"],message_id))
|
|
|
- if send_msg_toacmq(self.pool_mq,frame.body,self.mq_extract):
|
|
|
- ackMsg(conn,message_id,subscription)
|
|
|
+ if extract_times>2:
|
|
|
+ #transform to the extract_failed queue
|
|
|
+ if send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract_failed):
|
|
|
+ #process as succeed
|
|
|
+ dtmp.setValue(document_tmp_dochtmlcon,"",False)
|
|
|
+ dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_succeed_to),True)
|
|
|
+ dtmp.update_row(self.ots_client)
|
|
|
+ dhtml.update_row(self.ots_client)
|
|
|
+ #replace as {}
|
|
|
+ _extract.setValue(document_extract2_extract_json,"{}",True)
|
|
|
+ _extract.setValue(document_extract2_industry_json,"{}",True)
|
|
|
+ _extract.setValue(document_extract2_status,random.randint(1,50),True)
|
|
|
+ _extract.update_row(self.ots_client)
|
|
|
+ ackMsg(conn,message_id,subscription)
|
|
|
+ else:
|
|
|
+ #transform to the extract queue
|
|
|
+ if send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract):
|
|
|
+ ackMsg(conn,message_id,subscription)
|
|
|
|
|
|
|
|
|
def start_flow_extract(self):
|