'''
Created on 2019-03-13

@author: User
'''
from bs4 import BeautifulSoup
import re

# Tag names that are always dropped from the content.
_REMOVE_TAGS = ("script",)

# Matched against an element's text after all whitespace has been stripped:
# a short line mentioning a view/read counter, "author" or "source".
_REMOVE_PATTERN = re.compile("^.{0,25}((访问|浏览|阅读)(量|次数)|作者|来源).{0,10}$")

# NOTE(review): the trailing `$` binds only to `com`; every other alternative
# matches anywhere inside the href. Kept exactly as written -- confirm intent.
_HREF_PATTERN = re.compile("pdf|doc|docx|xls|xlsx|zip|rar|com$")

# Raw string so that `\s` reaches the regex engine instead of being treated
# as an (invalid) Python string escape.
_WHITESPACE_PATTERN = re.compile(r"[\r\n\s]")


def _is_decomposed(tag):
    """Return True when *tag* has already been destroyed by ``decompose()``.

    Relies on the ``decomposed`` property that Beautiful Soup exposes on page
    elements; if the attribute lookup yields nothing the tag is treated as
    still alive.
    """
    return bool(getattr(tag, "decomposed", False))


def removeByRule(inner_html):
    '''
    Prune unwanted elements from an HTML content fragment.

    The fragment is parsed with the "lxml" parser and then:

    1. every tag whose name is listed in ``_REMOVE_TAGS`` is decomposed;
    2. every element whose whitespace-stripped text matches
       ``_REMOVE_PATTERN`` is emptied;
    3. for every ``<a>`` tag that has an ``href`` which does not match
       ``_HREF_PATTERN``, the link's parent element is decomposed.

    @param inner_html: the HTML content to prune
    @return: the pruned document serialized with ``str(soup)``
    '''
    soup = BeautifulSoup(inner_html, "lxml")

    # Pass 1: drop blacklisted tags. The list returned by find_all() is a
    # snapshot, so it can still hold tags destroyed earlier in this loop.
    for tag in soup.find_all(recursive=True):
        if _is_decomposed(tag):
            continue
        if tag.name in _REMOVE_TAGS:
            tag.decompose()

    # Pass 2: text rule and link rule.
    for tag in soup.find_all(recursive=True):
        # Tags destroyed by an ancestor's clear()/decompose() are skipped
        # explicitly rather than relying on attribute-lookup fallbacks.
        if _is_decomposed(tag):
            continue

        stripped_text = _WHITESPACE_PATTERN.sub("", tag.get_text())
        if _REMOVE_PATTERN.search(stripped_text) is not None:
            # Empties the element (children are decomposed); the element
            # itself stays in the tree.
            tag.clear(decompose=True)

        if tag.name != "a":
            continue
        href = tag.attrs.get("href")
        if href is None:
            continue
        if _HREF_PATTERN.search(href) is None:
            parent = tag.parent
            if parent is not None:
                # Removes the link together with everything sharing its parent.
                parent.decompose()

    return str(soup)