|
@@ -754,9 +754,10 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
|
|
|
html_len = len(_dochtmlcon)
|
|
html_len = len(_dochtmlcon)
|
|
if html_len>200000:
|
|
if html_len>200000:
|
|
- if int(item.get("docid"))==238431011:
|
|
|
|
- save(item,"238431011.pk")
|
|
|
|
|
|
+ # if int(item.get("docid"))==238431011:
|
|
|
|
+ # save(item,"238431011.pk")
|
|
try:
|
|
try:
|
|
|
|
+ _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
|
|
_soup = BeautifulSoup(_dochtmlcon,"lxml")
|
|
_soup = BeautifulSoup(_dochtmlcon,"lxml")
|
|
_soup = article_limit(_soup,200000)
|
|
_soup = article_limit(_soup,200000)
|
|
_dochtmlcon = str(_soup)
|
|
_dochtmlcon = str(_soup)
|