import urllib.request
import re
import time

import requests
import jieba
import numpy
from bs4 import BeautifulSoup

from Utils import findAllIndex
import htmlDrawing as hd

# Compiled once; stripping runs of whitespace is equivalent to the original
# per-node re.sub("[\s\r\n]*", "", ...) since \r and \n are already in \s.
_WHITESPACE_RE = re.compile(r"\s+")


def analysis(list_url):
    """Extract the page-specific text (the article body) from same-source pages.

    Pages from one site share boilerplate (navigation, footer, ads). By
    comparing the DOM trees of several pages and deleting every node whose
    tag path AND text are identical across all of them, only the content
    unique to each page remains.

    @param list_url: list of URLs of pages from the same source site
    @return: list of the remaining (boilerplate-free) text, one entry per URL
    """

    def delStopTags(list_soup, stopTags):
        """Remove every occurrence of the stop tags from each DOM tree."""
        for soup in list_soup:
            for tag_name in stopTags:
                for tag in soup.find_all(tag_name):
                    tag.decompose()
        return list_soup

    def getPath_code_Text(soup, result, code=""):
        """Recursively collect {tag-name path: [[tag, whitespace-free text], ...]}."""
        for child in soup.find_all(True, recursive=False):
            path = code + child.name
            entry = [child, _WHITESPACE_RE.sub("", child.get_text().strip())]
            result.setdefault(path, []).append(entry)
            getPath_code_Text(child, result, path)
        return result

    def getTheSameTagsOfSameText(path, text, list_PathCodeText):
        """Return one matching tag per other page when EVERY other page has a
        node at the same path with identical text; otherwise None.

        Empty text never counts as a match (it would wipe structural nodes).
        """
        if text == "":
            return None
        list_child = []
        for dict_pct in list_PathCodeText:
            if path in dict_pct:
                for tag, other_text in dict_pct[path]:
                    if other_text == text:
                        list_child.append(tag)
                        break  # at most one match per page
        if len(list_child) == len(list_PathCodeText):
            return list_child
        return None

    def removeTheSameTags(list_PathCodeText):
        """Decompose every node whose (path, text) pair repeats on all pages."""
        if len(list_PathCodeText) <= 1:
            return
        dict_1 = list_PathCodeText[0]
        for path, list_TagText in dict_1.items():
            for tag, text in list_TagText:
                sameTags = getTheSameTagsOfSameText(path, text, list_PathCodeText[1:])
                if sameTags is not None:
                    tag.decompose()
                    for other in sameTags:
                        other.decompose()

    list_soup = []
    for url in list_url:
        # NOTE(review): hd.getSource is assumed to return a BeautifulSoup tree;
        # there is no None/failure check here — confirm its error behavior.
        soup = hd.getSource(url)
        list_soup.append(soup)

    stopTags = ["script", "meta", "link", "style", "head"]
    list_soup = delStopTags(list_soup, stopTags)

    list_PathCodeText = [getPath_code_Text(soup, dict()) for soup in list_soup]
    removeTheSameTags(list_PathCodeText)

    # BUG FIX: the original only printed the stripped pages and fell through
    # returning None, although its docstring promised a result; keep the
    # printing (observable behavior) and also return the extracted texts.
    result_texts = []
    for soup in list_soup:
        text = soup.get_text()
        print(text)
        result_texts.append(text)
    return result_texts


if __name__ == "__main__":
    url = ["http://gtj.taiyuan.gov.cn/doc/2018/08/30/661759.shtml",
           "http://gtj.taiyuan.gov.cn/doc/2018/07/09/590197.shtml"]
    b = time.time()
    result = analysis(url)
    print(time.time() - b)