import sys
sys.path.append("../")

import re
import time

import requests
import jieba
from bs4 import BeautifulSoup
from lxml import etree

from module.Utils import findAllIndex
from module.htmlDrawing import getBrowser


def analysis(url):
    '''
    @summary: Analyze a web page and extract its main content, title and time.
              Only intended for pages whose main content makes up most of the
              page text.
    @param:
        url: the page to extract from
    @return: type:dict holding the content, title and time
    '''

    def delStopTags(soup, stopTags):
        '''
        @summary: Remove all stop tags from the page's DOM tree.
        @param:
            soup: the page's DOM tree
            stopTags: tag names to drop
        @return: the pruned DOM tree
        '''
        for item in stopTags:
            for tag in soup.find_all(item):
                tag.decompose()
        return soup

    def recursiveStatistic(soup, stopTags, stopWords_pattern, punctuationWords_pattern, parent_code="ROOT"):
        '''
        @summary: Recursively annotate every tag with its character count,
                  stop-word count and punctuation count.
        @param:
            soup: the page's DOM tree
            stopTags: tag names to skip
            stopWords_pattern: stop-word regex
            punctuationWords_pattern: punctuation regex
            parent_code: code of the parent node
        @return: the DOM tree annotated with these statistics
        '''
        i = 0
        for child in soup.find_all(True, recursive=False):
            if child.name is not None and child.name.strip().lower() not in stopTags:
                i += 1
                child.code = parent_code + ("0" + str(i) if i < 10 else str(i))
                child.words = re.sub(r"\s+", "", child.get_text().strip()) if child.get_text() is not None else ""
                child.num_words = len(child.words)
                child.num_stopwords = len(re.findall(stopWords_pattern, child.words))
                child.num_punctuations = len(re.findall(punctuationWords_pattern, child.words))
                recursiveStatistic(child, stopTags, stopWords_pattern, punctuationWords_pattern, child.code)
        return soup
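
    # Note on the node codes (an illustration, not part of the original file):
    # with this scheme the first child of <html> is coded "ROOT01", and the
    # third child of that child "ROOT0103". Comparing two codes character by
    # character therefore approximates how much DOM path two nodes share,
    # which is what the title/time selection below relies on.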

    def getContent_withWords(soup, all_words, last_percent, limit_percent=0.3):
        '''
        @summary: Walk down from the root and locate the content node by
                  watching how the character count changes.
        @param:
            soup: the page's DOM tree
            all_words: total character count of the page
            last_percent: the parent node's share of the characters
            limit_percent: maximum allowed drop of that share
        @return: the node holding the main content
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_words / all_words
                print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            print(soup.words)
            return soup
        else:
            return getContent_withWords(pass_limit, all_words, pass_percent)
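
    # Worked example of the drop heuristic (illustrative numbers): if <body>
    # holds 95% of the page's characters and one of its children still holds
    # 80%, the 15-point drop is under limit_percent (0.3), so the search
    # descends into that child; once every child loses more than the limit,
    # the current node is returned as the content node.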

    def getContent_withPunctuations(soup, all_punctuations, last_percent, limit_percent=0.2):
        '''
        @summary: Walk down from the root and locate the content node by
                  watching how the punctuation count changes.
        @param:
            soup: the page's DOM tree
            all_punctuations: total punctuation count of the page
            last_percent: the parent node's share of the punctuation marks
            limit_percent: maximum allowed drop of that share
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_punctuations / all_punctuations
                #print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            #print(soup.words)
            return soup
        else:
            return getContent_withPunctuations(pass_limit, all_punctuations, pass_percent)

    def getContent_withStopWords(soup, all_stopwords, last_percent, limit_percent=0.4):
        '''
        @summary: Walk down from the root and locate the content node by
                  watching how the stop-word count changes.
        @param:
            soup: the page's DOM tree
            all_stopwords: total stop-word count of the page
            last_percent: the parent node's share of the stop words
            limit_percent: maximum allowed drop of that share
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_stopwords / all_stopwords
                #print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            #print(soup.words)
            return soup
        else:
            return getContent_withStopWords(pass_limit, all_stopwords, pass_percent)

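    # A note on the three metrics (my interpretation, not from the original
    # file): raw character count favours the largest text block, Chinese
    # punctuation density favours flowing prose over link lists, and the
    # "stop words" counted here are ASCII letters and digits, which tend to
    # concentrate in navigation, scripts and ads rather than in the article.
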
    def getChildsFromTheBeginOfContent(content, content_child, nums, getNums=None, list_childs_title=None, list_childs_time=None, title_len=(6, 30), time_len=40, time_pattern=re.compile(r"\d{2,4}[年/-]\d{1,2}[月/-]\d{1,2}[日\s]?")):
        '''
        @summary: Collect leaf nodes starting from the beginning of the content.
        @param:
            content: the content text
            content_child: the current node
            nums: number of leaf nodes to collect
            getNums: counter of leaf nodes collected so far
            list_childs_title: title candidates collected so far
            list_childs_time: time candidates collected so far
        @return: list of title candidates, list of time candidates
        '''
        # mutable defaults would leak state between calls, so build them here
        if getNums is None:
            getNums = []
        if list_childs_title is None:
            list_childs_title = []
        if list_childs_time is None:
            list_childs_time = []
        if len(content_child.find_all(True)) == 0:
            sum = 0
            appear = 0
            for item in jieba.cut(re.sub("[A-Za-z0-9]", "", content_child.words)):
                if len(findAllIndex(item, content)) > 1:
                    appear += 1
                sum += 1
            if sum >= title_len[0] and sum <= title_len[1]:
                if appear / sum > 0.7:
                    list_childs_title.append([content_child.words, sum, appear, content_child.code])
            if content_child.words is not None:
                if content_child.num_words < time_len:
                    matchs = re.findall(time_pattern, content_child.words)
                    if len(matchs) == 1:
                        list_childs_time.append((matchs[0], content_child.code))
            getNums.append(1)
            if len(getNums) >= nums:
                return list_childs_title, list_childs_time
        for child in content_child.find_all(True, recursive=False):
            if len(getNums) >= nums:
                return list_childs_title, list_childs_time
            getChildsFromTheBeginOfContent(content, child, nums, getNums, list_childs_title, list_childs_time)
        return list_childs_title, list_childs_time

    def getTitleTimeList(soup, child_content, title_list=None, time_list=None, title_len=(6, 30), time_len=40, time_pattern=re.compile(r"\d{2,4}[年/-]\d{1,2}[月/-]\d{1,2}[日\s]?")):
        '''
        @summary: Determine the page title and time relative to the content node.
        @param:
            soup: the page's DOM tree
            child_content: the node holding the main content
            title_list: title candidates found so far
            time_list: time candidates found so far
            title_len: allowed word-count range for a title sentence
            time_len: maximum length of a sentence holding the time
            time_pattern: time regex
        @return: list of title candidates, list of time candidates
        '''
        if title_list is None:
            title_list = []
            time_list = []
        for child in soup.find_all(True, recursive=False):
            if child.words is not None and len(child.words) > 0:
                text = re.sub("[A-Za-z0-9]", "", child.words.strip())
                content = child_content.words.strip()
                sum = 0
                appear = 0
                for item in jieba.cut(text):
                    if str(content).find(item) >= 0:
                        appear += 1
                    sum += 1
                if sum >= title_len[0] and sum <= title_len[1]:
                    if appear / sum > 0.7:
                        title_list.append((child.words, sum, appear, child.code))
            if child.words is not None:
                if child.num_words < time_len:
                    matchs = re.findall(time_pattern, child.words)
                    if len(matchs) == 1:
                        time_list.append((matchs[0], child.code))
            if child != child_content:
                getTitleTimeList(child, child_content, title_list, time_list)
        return title_list, time_list

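    # Illustration of the title heuristic (my reading of the code above): a
    # candidate sentence of 6 to 30 words qualifies when more than 70% of its
    # words reappear in the content; e.g. a 10-word headline with 8 words
    # also present in the body scores 0.8 and is kept as a candidate.
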
    header = {
        "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
        "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.cn%2Fuser%2FsimpleSSOLogin",
        "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
        "Origin": "http://uia.hnist.cn",
        "Upgrade-Insecure-Requests": "1",
    }
    sess = requests.Session()
    sess.headers = header
    data = sess.get(url)
    # requests often guesses ISO-8859-1 from the headers alone; trust the
    # charset detected from the body instead of hard-coding one encoding
    data.encoding = data.apparent_encoding
    data = data.text

    stopTags = ["script", "meta", "link", "style"]
    soup = BeautifulSoup(data, "lxml")
    soup = delStopTags(soup, stopTags)
    stopWords = ["[A-Z]", "[a-z]", "[0-9]"]
    stopWords_pattern = re.compile("|".join(stopWords))
    punctuationWords = "[;,。:、]"
    punctuationWords_pattern = re.compile(punctuationWords)
    soup = recursiveStatistic(soup, stopTags, stopWords_pattern, punctuationWords_pattern)
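
    # Why these patterns (an interpretation, not from the original file): the
    # "stop words" are ASCII letters and digits, which in a Chinese-language
    # page mostly come from URLs, markup leftovers and navigation, while the
    # punctuation class lists the full-width marks typical of Chinese prose.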

    content_child = getContent_withWords(soup, soup.html.num_words, 1)
    #content_child = getContent_withPunctuations(soup, soup.html.num_punctuations, 1)
    #content_child = getContent_withStopWords(soup, soup.html.num_stopwords, 1)

    list_childs_title, list_childs_time = getChildsFromTheBeginOfContent(content_child.words, content_child, 10)
    result = dict()
    title_list, time_list = getTitleTimeList(soup, content_child)
    for item in list_childs_title:
        title_list.append(item)
    for item in list_childs_time:
        time_list.append(item)
    title_list.sort(key=lambda x: x[2] / x[1], reverse=True)
    title_list_max = []

    # keep the candidates with the highest reappearance ratio
    if len(title_list) > 0:
        max_match = title_list[0][2] / title_list[0][1]
        for i in range(len(title_list)):
            if title_list[i][2] / title_list[i][1] == max_match:
                title_list_max.append(title_list[i])
            else:
                break
    route_match = 0
    if len(title_list_max) > 0:
        title = title_list_max[0][0]
        # of those, keep the title closest to the content node
        for i in range(len(title_list_max)):
            match = 0
            for a, b in zip(title_list_max[i][3], content_child.code):
                if a == b:
                    match += 1
            if match > route_match:
                route_match = match
                title = title_list_max[i][0]
        result["title"] = title

    result["content"] = content_child.words
    # keep the time closest to the content node
    if len(time_list) > 0:
        if len(time_list) == 1:
            result["time"] = time_list[0][0]
        else:
            route_match = 0
            the_time = time_list[0][0]
            for i in range(len(time_list)):
                match = 0
                for a, b in zip(time_list[i][1], content_child.code):
                    if a == b:
                        match += 1
                if match > route_match:
                    route_match = match
                    the_time = time_list[i][0]
            result["time"] = the_time
    return result
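
# Minimal usage sketch (the URL is a placeholder; assumes the page is
# reachable and mostly consists of its article text, as analysis() expects):
#
#   result = analysis("http://example.com/news/article.html")
#   print(result.get("title"), result.get("time"))
#   print(result.get("content"))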

if __name__ == "__main__":

    url = "https://www.celap.org.cn/art/2019/6/4/art_563_43889.html"
    '''
    sess = requests.Session()
    data = sess.get(url)
    data = data.text.encode(data.encoding)
    data = data.decode("utf-8")
    '''
    browser = getBrowser()
    browser.get(url)
    data = browser.page_source
    htm = etree.HTML(data)
    htree = etree.ElementTree(htm)
    htm.xpath('//*[@id="zoom"]')
    #print(htm.iter())
    # print every element's text content and xpath in turn
    for t in htm.iter():
        print(t.getparent())
        print(etree.tostring(t, encoding="unicode"))
        print(htree.getpath(t), t.text)
    '''
    b = time.time()
    result = analysis(url)
    print(result)
    '''
    #soup = BeautifulSoup(data, "lxml")
    #print(soup.get_text())
    #print(soup.words)
    #print(soup.body.num_words)
    #print(soup.num_words, soup.num_punctuations, soup.num_stopwords)
    '''
    for child in soup.find_all(True):
        pass
        child.test1 = "1"
        print(child.name, child.words, child.num_words, len(child.find_all(True, recursive=False)), child.string, "---", child.parent.name)
    '''