|
@@ -753,14 +753,14 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
_dochtmlcon = item.get(document_tmp_dochtmlcon,"")
|
|
|
|
|
|
html_len = len(_dochtmlcon)
|
|
|
- if html_len>200000:
|
|
|
+ if html_len>50000:
|
|
|
if int(item.get("docid"))==329546490:
|
|
|
save(item,"329546490.pk")
|
|
|
log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
|
|
|
try:
|
|
|
_dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
|
|
|
_soup = BeautifulSoup(_dochtmlcon,"lxml")
|
|
|
- _soup = article_limit(_soup,200000)
|
|
|
+ _soup = article_limit(_soup,50000)
|
|
|
_dochtmlcon = str(_soup)
|
|
|
except Exception as e:
|
|
|
traceback.print_exc()
|