12345678910111213141516171819202122232425262728293031323334 |
- '''
- Created on 2019年3月13日
- @author: User
- '''
- from bs4 import BeautifulSoup
- import re
def removeByRule(inner_html):
    """Prune boilerplate nodes from an HTML fragment and return the result.

    The fragment is parsed with the ``lxml`` parser and three rules are
    applied:

    1. Every ``<script>`` element is removed.
    2. Any element whose text (with all whitespace stripped) looks like a
       short metadata line -- a visit/view/read counter, "author" or
       "source" -- has its contents destroyed; the emptied element itself
       stays in the tree.
    3. For every ``<a>`` carrying an ``href`` that does NOT match the
       attachment/``com`` pattern, the link's parent element is removed
       together with everything inside it.

    Rules 2 and 3 are evaluated per element, in document order.

    @param inner_html: HTML of the main content that should be pruned.
    @return: the pruned document, serialized with ``str(soup)``.
    """
    soup = BeautifulSoup(inner_html, "lxml")

    remove_tags = ["script"]
    remove_pattern = re.compile("^.{0,25}((访问|浏览|阅读)(量|次数)|作者|来源).{0,10}$")
    # NOTE(review): "$" binds only to the last alternative ("com"); the file
    # extensions match anywhere in the href. Confirm this is intended.
    href_pattern = re.compile("pdf|doc|docx|xls|xlsx|zip|rar|com$")
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python.
    whitespace_pattern = re.compile(r"[\r\n\s]")

    # Rule 1: drop unwanted tags by name.
    for tag in soup.find_all(remove_tags):
        if getattr(tag, "decomposed", False):
            continue
        tag.decompose()

    # find_all() hands back a list built up front, so it can still contain
    # nodes that an earlier iteration destroyed; those are skipped explicitly.
    for node in soup.find_all(recursive=True):
        if getattr(node, "decomposed", False):
            continue

        # Rule 2: judge the node by its text with all whitespace removed.
        text = whitespace_pattern.sub("", node.get_text())
        if remove_pattern.search(text) is not None:
            node.clear(decompose=True)

        # Rule 3: a link whose href does not match the pattern takes its
        # whole parent element down with it.
        if node.name == "a" and "href" in node.attrs:
            if href_pattern.search(node.attrs["href"]) is None:
                parent = node.parent
                if parent is not None:
                    parent.decompose()

    return str(soup)
-
|