|
@@ -756,6 +756,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
if html_len>200000:
|
|
|
# if int(item.get("docid"))==238431011:
|
|
|
# save(item,"238431011.pk")
|
|
|
+ log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
|
|
|
try:
|
|
|
_dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
|
|
|
_soup = BeautifulSoup(_dochtmlcon,"lxml")
|