2.py 483 B

12345678910111213141516171819202122
  1. import time
  2. import gunicorn
  3. import requests
  4. from bs4 import BeautifulSoup
  5. from BaseDataMaintenance.common.Utils import article_limit
  6. import codecs
  7. if __name__ == '__main__':
  8. text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
  9. content = str(BeautifulSoup(text).find("div"))
  10. _soup = BeautifulSoup(content,"lxml")
  11. print(len(str(_soup)))
  12. _soup = article_limit(_soup,100)
  13. print(len(str(_soup)))
  14. print(str(_soup))