- import urllib.request
- from bs4 import BeautifulSoup
- import re
- import time
- import requests
- import jieba
- import numpy
- from Utils import findAllIndex
- import htmlDrawing as hd
def analysis(list_url):
    '''
    @summary: Compare several pages generated from the same template, delete
        every DOM node whose path and text are identical across all of the
        pages (shared template boilerplate), and print what remains on each
        page - i.e. the page-specific content (body text, title, date).
    @param:
        list_url: list of URLs of pages built from the same template
    @return: list of str - the remaining text of each page. (The docstring of
        the original promised a dict but the function returned None; returning
        the texts is backward compatible since callers ignored the result.)
    '''
    # One compiled raw-string pattern instead of re-compiling "[\s\r\n]*" on
    # every node: \r and \n are already covered by \s, and with an empty
    # replacement, \s+ removes exactly the same characters as the original.
    _ws = re.compile(r"\s+")

    def delStopTags(list_soup, stopTags):
        '''
        @summary: remove every occurrence of the stop tags from each DOM tree
        @param:
            list_soup: list of page DOM trees
            stopTags: tag names to drop entirely (scripts, styles, ...)
        @return: the same list of DOM trees, modified in place
        '''
        for soup in list_soup:
            for name in stopTags:
                for tag in soup.find_all(name):
                    tag.decompose()
        return list_soup

    def getPath_code_Text(soup, result, code=""):
        '''
        @summary: walk the DOM depth-first and collect, per concatenated tag
            path, every (tag reference, whitespace-stripped text) pair
        @param:
            soup: DOM node to walk
            result: dict being filled - {path: [[tag, text], ...]}
            code: path prefix accumulated from the ancestors
        @return: result
        '''
        for child in soup.find_all(True, recursive=False):
            path = code + child.name
            text = _ws.sub("", child.get_text().strip())
            result.setdefault(path, []).append([child, text])
            getPath_code_Text(child, result, path)
        return result

    def getTheSameTagsOfSameText(path, text, list_PathCodeText):
        '''
        @summary: look in every other page for a tag with the same path and
            the same non-empty text
        @return: list of matching tags (one per page) when EVERY page has a
            match, otherwise None
        '''
        if text == "":
            # empty text would match everywhere and carries no information
            return None
        list_child = []
        for dict_pct in list_PathCodeText:
            if path in dict_pct:
                for tag, tag_text in dict_pct[path]:
                    if tag_text == text:
                        list_child.append(tag)
                        break
        if len(list_child) == len(list_PathCodeText):
            return list_child
        return None

    def removeTheSameTags(list_PathCodeText):
        '''
        @summary: decompose every node whose (path, text) pair occurs in ALL
            pages - such nodes are template boilerplate, not content
        @param:
            list_PathCodeText: per-page results of getPath_code_Text
        '''
        if len(list_PathCodeText) > 1:
            dict_1 = list_PathCodeText[0]
            for path in dict_1.keys():
                for Tag, Text in dict_1[path]:
                    sameTags = getTheSameTagsOfSameText(path, Text, list_PathCodeText[1:])
                    if sameTags is not None:
                        Tag.decompose()
                        for tag in sameTags:
                            tag.decompose()

    list_soup = []
    for url in list_url:
        # NOTE(review): hd.getSource presumably fetches the URL and returns a
        # parsed DOM tree - confirm against the htmlDrawing module.
        list_soup.append(hd.getSource(url))
    stopTags = ["script", "meta", "link", "style", "head"]
    list_soup = delStopTags(list_soup, stopTags)
    list_PathCodeText = [getPath_code_Text(soup, dict()) for soup in list_soup]
    removeTheSameTags(list_PathCodeText)
    texts = []
    for soup in list_soup:
        text = soup.get_text()
        print(text)  # keep the original console output
        texts.append(text)
    return texts
-
-
if __name__ == "__main__":
    # Two pages rendered from the same template: everything they share is
    # boilerplate, everything that differs is page-specific content.
    urls = ["http://gtj.taiyuan.gov.cn/doc/2018/08/30/661759.shtml",
            "http://gtj.taiyuan.gov.cn/doc/2018/07/09/590197.shtml"]
    started = time.time()
    result = analysis(urls)
    print(time.time() - started)