|
@@ -12,6 +12,8 @@ from BaseDataMaintenance.dataSource.pool import ConnectorPool
|
|
|
from BaseDataMaintenance.model.ots.document import Document
|
|
|
|
|
|
from BaseDataMaintenance.common.Utils import article_limit
|
|
|
+from BaseDataMaintenance.common.documentFingerprint import getFingerprint
|
|
|
+from BaseDataMaintenance.model.postgres.document_extract import *
|
|
|
|
|
|
class ActiveMQListener():
|
|
|
|
|
@@ -523,33 +525,45 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
|
|
|
self.industy_url = "http://127.0.0.1:15000/industry_extract"
|
|
|
|
|
|
- self.extract_interfaces = [["http://127.0.0.1:15030/content_extract",15],
|
|
|
- ["http://192.168.0.115:15030/content_extract",9]
|
|
|
+ self.extract_interfaces = [["http://127.0.0.1:15030/content_extract",17],
|
|
|
+ ["http://192.168.0.115:15030/content_extract",3]
|
|
|
]
|
|
|
|
|
|
|
|
|
+
|
|
|
self.mq_extract = "/queue/dataflow_extract"
|
|
|
self.mq_extract_failed = "/queue/dataflow_extract_failed"
|
|
|
|
|
|
- whole_weight = 0
|
|
|
+ self.whole_weight = 0
|
|
|
for _url,weight in self.extract_interfaces:
|
|
|
- whole_weight+= weight
|
|
|
+ self.whole_weight+= weight
|
|
|
current_weight = 0
|
|
|
for _i in range(len(self.extract_interfaces)):
|
|
|
current_weight += self.extract_interfaces[_i][1]
|
|
|
- self.extract_interfaces[_i][1] = current_weight/whole_weight
|
|
|
+ self.extract_interfaces[_i][1] = current_weight/self.whole_weight
|
|
|
|
|
|
|
|
|
+ self.block_url = RLock()
|
|
|
+ self.url_count = 0
|
|
|
|
|
|
self.comsumer_count = 20
|
|
|
for _i in range(self.comsumer_count):
|
|
|
listener_extract = self.ExtractListener(getConnect_activateMQ(),self.comsumer_handle)
|
|
|
createComsumer(listener_extract,self.mq_extract)
|
|
|
+ self.pool_postgres = ConnectorPool(10,self.comsumer_count,getConnection_postgres)
|
|
|
self.conn_mq = getConnect_activateMQ()
|
|
|
self.pool_mq = ConnectorPool(10,30,getConnect_activateMQ)
|
|
|
|
|
|
+
|
|
|
def getExtract_url(self):
|
|
|
- _r = random.random()
|
|
|
+ _url_num = 0
|
|
|
+ with self.block_url:
|
|
|
+ self.url_count += 1
|
|
|
+ self.url_count %= self.whole_weight
|
|
|
+ _url_num = self.url_count
|
|
|
+
|
|
|
+ # _r = random.random()
|
|
|
+ _r = _url_num/self.whole_weight
|
|
|
for _i in range(len(self.extract_interfaces)):
|
|
|
if _r<=self.extract_interfaces[_i][1]:
|
|
|
return self.extract_interfaces[_i][0]
|
|
@@ -559,6 +573,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
# _i = 0
|
|
|
# _url = self.extract_interfaces[_i]
|
|
|
_url = self.getExtract_url()
|
|
|
+ log("extract_url:%s"%(str(_url)))
|
|
|
resp = requests.post(_url,json=json,headers=headers)
|
|
|
return resp
|
|
|
|
|
@@ -604,9 +619,31 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
self.comsumer()
|
|
|
|
|
|
def comsumer(self):
|
|
|
- mt = MultiThreadHandler(self.queue_extract,self.comsumer_handle,None,50,1,True)
|
|
|
+ mt = MultiThreadHandler(self.queue_extract,self.comsumer_handle,None,self.comsumer_count,1,True)
|
|
|
mt.run()
|
|
|
|
|
|
+
|
|
|
+ def getExtract_json_fromDB(self,_fingerprint):
|
|
|
+ conn = self.pool_postgres.getConnector()
|
|
|
+ try:
|
|
|
+ list_extract = Document_extract_postgres.select_rows(conn,Document_extract_postgres,"document_extract",[" fingerprint='%s'"%_fingerprint])
|
|
|
+ if len(list_extract)>0:
|
|
|
+ _extract = list_extract[0]
|
|
|
+ return _extract.getProperties().get(document_extract_extract_json)
|
|
|
+ except Exception as e:
|
|
|
+ traceback.print_exc()
|
|
|
+ finally:
|
|
|
+ self.pool_postgres.putConnector(conn)
|
|
|
+ return None
|
|
|
+
|
|
|
+ def putExtract_json_toDB(self,fingerprint,docid,extract_json):
|
|
|
+ _de = Document_extract_postgres({document_extract_fingerprint:fingerprint,
|
|
|
+ document_extract_docid:docid,
|
|
|
+ document_extract_extract_json:extract_json})
|
|
|
+ _de.insert_row(self.pool_postgres,1)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def comsumer_handle(self,_dict,result_queue):
|
|
|
try:
|
|
|
frame = _dict["frame"]
|
|
@@ -640,20 +677,32 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
for k,v in item.items():
|
|
|
data[k] = v
|
|
|
data["timeout"] = 440
|
|
|
- data["doc_id"] = data.get(document_tmp_docid)
|
|
|
+ data["doc_id"] = data.get(document_tmp_docid,0)
|
|
|
data["content"] = data.get(document_tmp_dochtmlcon,"")
|
|
|
if document_tmp_dochtmlcon in data:
|
|
|
data.pop(document_tmp_dochtmlcon)
|
|
|
data["title"] = data.get(document_tmp_doctitle,"")
|
|
|
data["web_source_no"] = item.get(document_tmp_web_source_no,"")
|
|
|
data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
|
|
|
+
|
|
|
+ _fingerprint = getFingerprint(str(data["title"])+str(data["content"]))
|
|
|
+
|
|
|
if all_done>0:
|
|
|
- resp = self.request_extract_interface(json=data,headers=self.header)
|
|
|
- if (resp.status_code >=200 and resp.status_code<=213):
|
|
|
- _extract.setValue(document_extract2_extract_json,resp.content.decode("utf8"),True)
|
|
|
+ extract_json = self.getExtract_json_fromDB(_fingerprint)
|
|
|
+ # extract_json = None
|
|
|
+ _docid = int(data["doc_id"])
|
|
|
+ if extract_json is not None:
|
|
|
+ log("fingerprint %s exists docid:%s"%(_fingerprint,str(_docid)))
|
|
|
+ _extract.setValue(document_extract2_extract_json,extract_json,True)
|
|
|
else:
|
|
|
- log("%s--%s"%(str(resp.status_code),resp.content.decode("utf8")))
|
|
|
- all_done = -2
|
|
|
+ resp = self.request_extract_interface(json=data,headers=self.header)
|
|
|
+ if (resp.status_code >=200 and resp.status_code<=213):
|
|
|
+ extract_json = resp.content.decode("utf8")
|
|
|
+ _extract.setValue(document_extract2_extract_json,extract_json,True)
|
|
|
+ self.putExtract_json_toDB(_fingerprint,_docid,extract_json)
|
|
|
+ else:
|
|
|
+ log("%s--%s"%(str(resp.status_code),resp.content.decode("utf8")))
|
|
|
+ all_done = -2
|
|
|
# if all_done>0:
|
|
|
# resp = self.request_industry_interface(json=data,headers=self.header)
|
|
|
# if (resp.status_code >=200 and resp.status_code<=213):
|
|
@@ -723,6 +772,10 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
if _to_ack:
|
|
|
ackMsg(conn,message_id,subscription)
|
|
|
log("process %s docid:%d %s"%(str(_to_ack),data["doc_id"],str(all_done)))
|
|
|
+ except requests.ConnectionError as e1:
|
|
|
+ item["extract_times"] -= 1
|
|
|
+ if send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract):
|
|
|
+ ackMsg(conn,message_id,subscription)
|
|
|
except Exception as e:
|
|
|
traceback.print_exc()
|
|
|
sentMsgToDD("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
|