postProcess.py 1.0 KB

12345678910111213141516171819202122232425262728293031323334
  1. '''
  2. Created on 2019年3月13日
  3. @author: User
  4. '''
  5. from bs4 import BeautifulSoup
  6. import re
  7. def removeByRule(inner_html):
  8. '''
  9. @param:
  10. inner_html:需要剪枝的正文内容
  11. @return: 剪枝后的正文内容
  12. '''
  13. soup = BeautifulSoup(inner_html,"lxml")
  14. removeTags = ["script"]
  15. removePattern = re.compile("^.{0,25}((访问|浏览|阅读)(量|次数)|作者|来源).{0,10}$")
  16. hrefPattern = re.compile("pdf|doc|docx|xls|xlsx|zip|rar|com$")
  17. for child in soup.find_all(recursive=True):
  18. #判断tagName
  19. if child.name in removeTags:
  20. child.decompose()
  21. for child in soup.find_all(recursive=True):
  22. text_ = re.sub("[\r\n\s]","",child.get_text())
  23. if re.search(removePattern,text_) is not None:
  24. child.clear(decompose=True)
  25. if child.name=="a":
  26. if "href" in child.attrs:
  27. href = child.attrs["href"]
  28. if re.search(hrefPattern,href) is None:
  29. child.parent.decompose()
  30. return str(soup)