# htmlAnalysisByCompare.py
import re
import time
import urllib.request

import jieba
import numpy
import requests
from bs4 import BeautifulSoup

import htmlDrawing as hd
from Utils import findAllIndex
  10. def analysis(list_url):
  11. '''
  12. @summary: 分析网页,做正文、标题、时间的抽取,根据多个同源网站的剔除相同的内容,余下的就是正文
  13. @param:
  14. url: 要提取的网页
  15. @return: type:dict 正文、标题、时间的字典
  16. '''
  17. def delStopTags(list_soup,stopTags):
  18. '''
  19. @summary: 从网页DOM树中删除所有的停用标签
  20. @param:
  21. list_soup: 多个同源网页DOM树
  22. stopTags: 停用标签
  23. @return: 网页DOM树
  24. '''
  25. for soup in list_soup:
  26. for item in stopTags:
  27. for tag in soup.find_all(item):
  28. tag.decompose()
  29. return list_soup
  30. def getPath_code_Text(soup,result,code=""):
  31. '''
  32. @summary: 从网页DOM树中拿到路径、标签引用、文本
  33. @param:
  34. soup: 网页DOM
  35. @return: {路径,[[标签引用,文本]]}
  36. '''
  37. for child in soup.find_all(True,recursive=False):
  38. path = code+child.name
  39. if path in result.keys():
  40. result[path].append([child,re.sub("[\s\r\n]*","",child.get_text().strip())])
  41. else:
  42. result[path] = [[child,re.sub("[\s\r\n]*","",child.get_text().strip())]]
  43. getPath_code_Text(child, result, path)
  44. return result
  45. def getTheSameTagsOfSameText(path,text,list_PathCodeText):
  46. '''
  47. @summary: 从多个网页的path-code-text中获取路径相同文本相同的tag
  48. '''
  49. list_child = []
  50. if text=="":
  51. return None
  52. for dict_pct in list_PathCodeText:
  53. if path in dict_pct.keys():
  54. list_TagText = dict_pct[path]
  55. for TagText in list_TagText:
  56. if text==TagText[1] and text!="":
  57. list_child.append(TagText[0])
  58. break
  59. if len(list_child)==len(list_PathCodeText):
  60. return list_child
  61. return None
  62. def removeTheSameTags(list_PathCodeText):
  63. '''
  64. @summary: 剔除路径和文本都一样的标签节点
  65. @param:
  66. list_PathCodeText: type:list,多个网页经过getPath_Code_Text方法得到的结果
  67. '''
  68. if len(list_PathCodeText)>1:
  69. dict_1 = list_PathCodeText[0]
  70. for path in dict_1.keys():
  71. list_TagText = dict_1[path]
  72. #print("--",list_TagText)
  73. for TagText in list_TagText:
  74. Tag = TagText[0]
  75. Text = TagText[1]
  76. sameTags = getTheSameTagsOfSameText(path, Text, list_PathCodeText[1:])
  77. if sameTags is not None:
  78. #print(path)
  79. Tag.decompose()
  80. for tag in sameTags:
  81. tag.decompose()
  82. list_soup = []
  83. for url in list_url:
  84. soup = hd.getSource(url)
  85. list_soup.append(soup)
  86. stopTags = ["script","meta","link","style","head"]
  87. list_soup = delStopTags(list_soup, stopTags)
  88. list_PathCodeText = []
  89. for soup in list_soup:
  90. list_PathCodeText.append(getPath_code_Text(soup,dict()))
  91. #print(list_PathCodeText[0])
  92. removeTheSameTags(list_PathCodeText)
  93. for soup in list_soup:
  94. print(soup.get_text())
  95. if __name__=="__main__":
  96. url = ["http://gtj.taiyuan.gov.cn/doc/2018/08/30/661759.shtml",
  97. "http://gtj.taiyuan.gov.cn/doc/2018/07/09/590197.shtml"]
  98. b = time.time()
  99. result = analysis(url)
  100. print(time.time()-b)