import sys
import os
sys.path.append("../")
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import requests
import jieba
import numpy
from module.Utils import findAllIndex
from lxml import etree
from module.htmlDrawing import getBrowser


def analysis(url):
    '''
    @summary: Analyze a web page and extract its main content, title and
              publication time.  Only intended for pages whose main content
              makes up the majority of the page's text.
    @param:
        url: the page to extract from
    @return: dict with keys "content", "title" and "time" (title/time keys
             are only present when a candidate was found)
    '''

    def delStopTags(soup, stopTags):
        '''
        @summary: Remove every occurrence of the stop tags from the DOM tree.
        @param:
            soup: DOM tree of the page
            stopTags: iterable of tag names to drop (e.g. script/style)
        @return: the pruned DOM tree
        '''
        for item in stopTags:
            for tag in soup.find_all(item):
                tag.decompose()
        return soup

    def recursiveStatistic(soup, stopTags, stopWords_pattern,
                           punctuationWords_pattern, parent_code="ROOT"):
        '''
        @summary: Recursively annotate every non-stop tag with its word count,
                  stop-word count and punctuation count.
        @param:
            soup: DOM (sub)tree of the page
            stopTags: tag names to skip
            stopWords_pattern: compiled regex matching stop words
            punctuationWords_pattern: compiled regex matching punctuation
            parent_code: path code of the parent node
        @return: the annotated DOM tree
        '''
        i = 0
        for child in soup.find_all(True, recursive=False):
            if child.name is not None and child.name.strip().lower() not in stopTags:
                i += 1
                # Two-digit zero-padded position appended to the parent's code
                # (e.g. ROOT01, ROOT0102): acts as a path through the DOM and
                # is later compared character-wise to measure node proximity.
                child.code = parent_code + ("0" + str(i) if i < 10 else str(i))
                child.words = re.sub("[\s\r\n]*", "", child.get_text().strip()) \
                    if (child.get_text() is not None) else ""
                child.num_words = len(child.words)
                child.num_stopwords = len(re.findall(stopWords_pattern, child.words))
                child.num_punctuations = len(re.findall(punctuationWords_pattern, child.words))
                recursiveStatistic(child, stopTags, stopWords_pattern,
                                   punctuationWords_pattern, child.code)
        return soup

    def getContent_withWords(soup, all_words, last_percent, limit_percent=0.3):
        '''
        @summary: Starting from the root, follow the child that still holds
                  most of the page text; stop when descending would lose more
                  than limit_percent of the text share.
        @param:
            soup: DOM (sub)tree of the page
            all_words: total number of words in the whole page
            last_percent: text share held by the parent node
            limit_percent: maximum acceptable loss of text share per step
        @return: (content node, its text share)
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_words / all_words
                print(child.name, last_percent, percent)
                # NOTE(review): the comparison was destroyed in the file
                # mangling ("if last_percent-percent1:"); reconstructed as
                # "descend into the child while the share loss stays under the
                # limit" — confirm against project history.
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            # No child retains enough of the text: this node is the content node.
            return soup, last_percent
        return getContent_withWords(pass_limit, all_words, pass_percent, limit_percent)

    def getChildsFromTheBeginOfContent(content, content_child, nums, getNums,
                                       list_childs_title=None, list_childs_time=None,
                                       title_len=(6, 30), time_len=40,
                                       time_pattern=re.compile("\d{2,4}[年/-]\d{1,2}[月/-]\d{1,2}[日\s]?")):
        '''
        @summary: Collect title / time candidates from the first `nums` nodes
                  of the content subtree.
                  NOTE(review): most of this function (including its signature)
                  was destroyed by the file mangling; it has been reconstructed
                  by analogy with getTitleTimeList and from the surviving
                  recursive call — verify against the project history.
        @param:
            content: text of the content node
            content_child: current node being inspected
            nums: maximum number of nodes to visit
            getNums: accumulator tracking how many nodes were visited
            list_childs_title: accumulated title candidates
            list_childs_time: accumulated time candidates
            title_len: allowed token-count range for a title sentence
            time_len: maximum length of a sentence containing the time
            time_pattern: compiled regex matching a date
        @return: (list of title candidates, list of time candidates)
        '''
        if list_childs_title is None:
            list_childs_title = []
            list_childs_time = []
        if content_child.words is not None and len(content_child.words) > 0:
            text = re.sub("[A-Za-z0-9]", "", content_child.words.strip())
            token_count = 0
            appear = 0
            for item in jieba.cut(text):
                if str(content).find(item) >= 0:
                    appear += 1
                token_count += 1
            # A title candidate: sentence of plausible length whose tokens
            # mostly reappear inside the content.
            if token_count >= title_len[0] and token_count <= title_len[1]:
                if appear / token_count > 0.7:
                    list_childs_title.append([content_child.words, token_count,
                                              appear, content_child.code])
            if content_child.words is not None:
                # NOTE(review): reconstructed — a short sentence containing a
                # date pattern is taken as a time candidate.
                if content_child.num_words < time_len:
                    search = re.search(time_pattern, content_child.words)
                    if search is not None:
                        list_childs_time.append([search.group(), content_child.code])
            getNums.append(content_child)
        if len(getNums) >= nums:
            return list_childs_title, list_childs_time
        for child in content_child.find_all(True, recursive=False):
            if len(getNums) >= nums:
                return list_childs_title, list_childs_time
            getChildsFromTheBeginOfContent(content, child, nums, getNums,
                                           list_childs_title, list_childs_time)
        return list_childs_title, list_childs_time

    def getTitleTimeList(soup, child_content, title_list=None, time_list=None,
                         title_len=(6, 30), time_len=40,
                         time_pattern=re.compile("\d{2,4}[年/-]\d{1,2}[月/-]\d{1,2}[日\s]?")):
        '''
        @summary: Determine the page's title and time candidates relative to
                  the content node.
        @param:
            soup: DOM tree of the page
            child_content: node holding the main content
            title_list: accumulated title candidates
            time_list: accumulated time candidates
            title_len: allowed token-count range for a title sentence
            time_len: maximum length of a sentence containing the time
            time_pattern: compiled regex matching a date
        @return: (list of title candidates, list of time candidates)
        '''
        if title_list is None:
            title_list = []
            time_list = []
        for child in soup.find_all(True, recursive=False):
            if child.words is not None and len(child.words) > 0:
                text = re.sub("[A-Za-z0-9]", "", child.words.strip())
                content = child_content.words.strip()
                token_count = 0
                appear = 0
                for item in jieba.cut(text):
                    if str(content).find(item) >= 0:
                        appear += 1
                    token_count += 1
                if token_count >= title_len[0] and token_count <= title_len[1]:
                    if appear / token_count > 0.7:
                        title_list.append((child.words, token_count, appear, child.code))
                if child.words is not None:
                    # NOTE(review): tail of this branch was destroyed by the
                    # file mangling ("if child.num_words..."); reconstructed
                    # the same way as the title branch's time handling.
                    if child.num_words < time_len:
                        search = re.search(time_pattern, child.words)
                        if search is not None:
                            time_list.append((search.group(), child.code))
            getTitleTimeList(child, child_content, title_list, time_list,
                             title_len, time_len, time_pattern)
        return title_list, time_list

    # ------------------------------------------------------------------
    # Driver: fetch, annotate, locate content, then pick title and time.
    # NOTE(review): this whole section (through the surviving
    # "max_match = ..." fragment) was destroyed by the file mangling and has
    # been reconstructed from the surviving pieces — verify carefully.
    # ------------------------------------------------------------------
    stopTags = ["script", "meta", "link", "style"]
    stopWords_pattern = re.compile("[A-Za-z0-9]")
    punctuationWords_pattern = re.compile("[,.:;,。:;]")
    data = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(data, "lxml")
    soup = delStopTags(soup, stopTags)
    soup = recursiveStatistic(soup, stopTags, stopWords_pattern, punctuationWords_pattern)
    all_words = len(re.sub("[\s\r\n]*", "", soup.get_text()))
    content_child, _percent = getContent_withWords(soup, all_words, 1)
    title_list, time_list = getTitleTimeList(soup, content_child)
    result = dict()
    # The surviving "else: break" implies title_list is ordered by match ratio
    # descending, so the leading run shares the maximal ratio.
    title_list.sort(key=lambda item: item[2] / item[1], reverse=True)
    title_list_max = []
    if len(title_list) > 0:
        max_match = title_list[0][2] / title_list[0][1]
        for i in range(len(title_list)):
            if title_list[i][2] / title_list[i][1] == max_match:
                title_list_max.append(title_list[i])
            else:
                break
    route_match = 0
    if len(title_list_max) > 0:
        title = title_list_max[0][0]
        # Among equally good matches, pick the title closest to the content
        # node: longest shared prefix of the DOM path codes wins.
        for i in range(len(title_list_max)):
            match = 0
            for a, b in zip(title_list_max[i][3], content_child.code):
                if a == b:
                    match += 1
            if match > route_match:
                route_match = match
                title = title_list_max[i][0]
        result["title"] = title
    result["content"] = content_child.words
    # Pick the time closest to the content node, by the same path-prefix rule.
    if len(time_list) > 0:
        if len(time_list) == 1:
            result["time"] = time_list[0][0]
        else:
            route_match = 0
            the_time = time_list[0][0]
            for i in range(len(time_list)):
                match = 0
                for a, b in zip(time_list[i][1], content_child.code):
                    if a == b:
                        match += 1
                if match > route_match:
                    route_match = match
                    the_time = time_list[i][0]
            result["time"] = the_time
    return result


if __name__ == "__main__":
    url = "https://www.celap.org.cn/art/2019/6/4/art_563_43889.html"
    '''
    sess = requests.Session()
    data=sess.get(url)
    data = data.text.encode(data.encoding)
    data = data.decode("utf-8")
    '''
    browser = getBrowser()
    browser.get(url)
    data = browser.page_source
    htm = etree.HTML(data)
    htree = etree.ElementTree(htm)
    # BUG fixed: the original called etree.xpath(...) on the lxml.etree
    # *module*, which has no xpath attribute; the query must run on the
    # parsed document.
    htm.xpath('//*[@id="zoom"]')
    # Print each element's parent, serialized form, xpath path and text.
    for t in htm.iter():
        print(t.getparent())
        print(etree.tostring(t, encoding="unicode"))
        print(htree.getpath(t), t.text)
    '''
    b = time.time()
    result = analysis(url)
    print(result)
    '''