luojiehua
/
ContentExtract


			
				
					
						
						
							12345678910111213141516171819202122232425262728293031323334
							'''
Created on 2019-03-13

@author: User
'''

from bs4 import BeautifulSoup
import re

# Tag names whose elements are dropped entirely, together with their contents.
_REMOVE_TAGS = ["script"]

# Matches a short piece of text (whitespace already stripped) that consists of
# at most 25 characters, then a view/visit/read-count label, "author" or
# "source", then at most 10 more characters.
_REMOVE_PATTERN = re.compile("^.{0,25}((访问|浏览|阅读)(量|次数)|作者|来源).{0,10}$")

# Links whose href matches this pattern are kept.
# NOTE(review): the trailing `$` binds only to `com`; the other alternatives
# match anywhere in the href (e.g. "doc" inside "/documents/"). Confirm whether
# `(pdf|doc|...|com)$` was intended before tightening it.
_HREF_PATTERN = re.compile("pdf|doc|docx|xls|xlsx|zip|rar|com$")

# Raw string: the original "[\r\n\s]" relied on the invalid escape "\s", which
# emits a SyntaxWarning on Python 3.12+. `\s` already covers "\r" and "\n".
_WHITESPACE_PATTERN = re.compile(r"\s")


def _is_decomposed(element):
    '''Return True when *element* has already been destroyed by decompose().'''
    # `.decomposed` only exists on newer Beautiful Soup releases. On older
    # ones the attribute lookup falls through to Tag.__getattr__ and yields a
    # non-True value, in which case the element is simply processed as before.
    return getattr(element, "decomposed", False) is True


def removeByRule(inner_html: str) -> str:
    '''
    Prune an HTML fragment according to a fixed set of rules.

    The markup is parsed with the "lxml" parser and then:

    1. every tag listed in ``_REMOVE_TAGS`` is decomposed;
    2. every tag whose text (with all whitespace removed) matches
       ``_REMOVE_PATTERN`` has its children decomposed, leaving the tag itself
       in place but empty;
    3. for every ``<a>`` tag that has an ``href`` attribute which does NOT
       match ``_HREF_PATTERN``, the parent of that ``<a>`` is decomposed.

    @param:
        inner_html: the body content to prune
    @return: the pruned content, serialized with ``str(soup)``
    '''
    soup = BeautifulSoup(inner_html, "lxml")

    # Pass 1: drop unwanted tags by name.
    for element in soup.find_all(_REMOVE_TAGS):
        if not _is_decomposed(element):
            element.decompose()

    # Pass 2: text-based and link-based pruning. find_all() returns a snapshot,
    # so tags destroyed earlier in this loop are still in the list and must be
    # skipped: using a decomposed tag is undefined behaviour in Beautiful Soup.
    for element in soup.find_all(True):
        if _is_decomposed(element):
            continue

        compact_text = _WHITESPACE_PATTERN.sub("", element.get_text())
        if _REMOVE_PATTERN.search(compact_text) is not None:
            # Empties the tag but keeps it, so the link rule below still
            # applies to it.
            element.clear(decompose=True)

        if element.name != "a":
            continue
        href = element.attrs.get("href")
        if href is None:
            continue
        if _HREF_PATTERN.search(href) is None:
            # Remove the whole container of the link, not just the link itself.
            element.parent.decompose()

    return str(soup)