|
@@ -752,7 +752,9 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
|
|
|
_dochtmlcon = item.get(document_tmp_dochtmlcon,"")
|
|
|
|
|
|
- if len(_dochtmlcon)>200000:
|
|
|
+ html_len = len(_dochtmlcon)
|
|
|
+ if html_len>200000:
|
|
|
+
|
|
|
try:
|
|
|
_soup = BeautifulSoup(_dochtmlcon,"lxml")
|
|
|
_soup = article_limit(_soup,200000)
|
|
@@ -761,6 +763,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
traceback.print_exc()
|
|
|
ackMsg(conn,message_id,subscription)
|
|
|
return
|
|
|
+ log("docid %s len %d limit to %d"%(str(item.get("docid")),html_len,len(_dochtmlcon)))
|
|
|
|
|
|
|
|
|
dhtml.setValue(document_tmp_dochtmlcon,_dochtmlcon,True)
|