# test.py (406 B)

  1. from BaseDataMaintenance.common.Utils import load,article_limit
  2. from bs4 import BeautifulSoup
  3. import re
  4. filename = "329546490.pk"
  5. item = load(filename)
  6. _dochtmlcon = item.get("dochtmlcon","")
  7. _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
  8. _soup = BeautifulSoup(_dochtmlcon,"lxml")
  9. _soup = article_limit(_soup,200000)
  10. _dochtmlcon = str(_soup)
  11. print("done",len(_dochtmlcon))