"""Ad-hoc check: load one saved document, cap its HTML, and print the final size.

The script reads the ``dochtmlcon`` field of a stored item, strips the
outer ``<html>``/``<body>`` wrapper tags, parses the remainder with
BeautifulSoup, passes it through ``article_limit`` and prints the length of
the resulting markup.
"""
import re

from bs4 import BeautifulSoup

from BaseDataMaintenance.common.Utils import load, article_limit

# Saved item to inspect; the path is handed to ``load`` unchanged.
filename = "329546490.pk"

# NOTE(review): ``load`` is a project helper; it presumably deserializes a
# pickled dict-like object — confirm against BaseDataMaintenance.common.Utils.
item = load(filename)

# Missing key falls back to an empty string so the steps below still run.
_dochtmlcon = item.get("dochtmlcon", "")

# Remove only the exact lowercase, attribute-free wrapper tags; tags such as
# ``<BODY>`` or ``<body class="x">`` are left untouched by this pattern.
_dochtmlcon = re.sub(r"<html>|</html>|<body>|</body>", "", _dochtmlcon)

_soup = BeautifulSoup(_dochtmlcon, "lxml")

# NOTE(review): presumably caps the document at roughly 200000 characters —
# the exact semantics of the second argument live in ``article_limit``.
_soup = article_limit(_soup, 200000)

_dochtmlcon = str(_soup)
print("done", len(_dochtmlcon))