import sys
sys.path.append("../")

import re
import time

import requests
import jieba
from bs4 import BeautifulSoup
from lxml import etree

from module.Utils import findAllIndex
from module.htmlDrawing import getBrowser


def analysis(url):
    '''
    @summary: Analyze a web page and extract its main content, title and time.
              Only intended for pages whose main content makes up most of the
              page text.
    @param:
        url: the page to extract from
    @return: type:dict holding the content, title and time
    '''

    def delStopTags(soup, stopTags):
        '''
        @summary: Remove all stop tags from the page's DOM tree.
        @param:
            soup: the page's DOM tree
            stopTags: tag names to drop
        @return: the pruned DOM tree
        '''
        for item in stopTags:
            for tag in soup.find_all(item):
                tag.decompose()
        return soup

    def recursiveStatistic(soup, stopTags, stopWords_pattern, punctuationWords_pattern, parent_code="ROOT"):
        '''
        @summary: Recursively annotate every tag with its character count,
                  stop-word count and punctuation count.
        @param:
            soup: the page's DOM tree
            stopTags: tag names to skip
            stopWords_pattern: stop-word regex
            punctuationWords_pattern: punctuation regex
            parent_code: code of the parent node
        @return: the DOM tree annotated with these statistics
        '''
        i = 0
        for child in soup.find_all(True, recursive=False):
            if child.name is not None and child.name.strip().lower() not in stopTags:
                i += 1
                child.code = parent_code + ("0" + str(i) if i < 10 else str(i))
                child.words = re.sub(r"\s+", "", child.get_text().strip()) if child.get_text() is not None else ""
                child.num_words = len(child.words)
                child.num_stopwords = len(re.findall(stopWords_pattern, child.words))
                child.num_punctuations = len(re.findall(punctuationWords_pattern, child.words))
                recursiveStatistic(child, stopTags, stopWords_pattern, punctuationWords_pattern, child.code)
        return soup
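
    # Note on the node codes (an illustration, not part of the original file):
    # with this scheme the first child of <html> is coded "ROOT01", and the
    # third child of that child "ROOT0103". Comparing two codes character by
    # character therefore approximates how much DOM path two nodes share,
    # which is what the title/time selection below relies on.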

    def getContent_withWords(soup, all_words, last_percent, limit_percent=0.3):
        '''
        @summary: Walk down from the root and locate the content node by
                  watching how the character count changes.
        @param:
            soup: the page's DOM tree
            all_words: total character count of the page
            last_percent: the parent node's share of the characters
            limit_percent: maximum allowed drop of that share
        @return: the node holding the main content
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_words / all_words
                print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            print(soup.words)
            return soup
        else:
            return getContent_withWords(pass_limit, all_words, pass_percent)
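
    # Worked example of the drop heuristic (illustrative numbers): if <body>
    # holds 95% of the page's characters and one of its children still holds
    # 80%, the 15-point drop is under limit_percent (0.3), so the search
    # descends into that child; once every child loses more than the limit,
    # the current node is returned as the content node.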

    def getContent_withPunctuations(soup, all_punctuations, last_percent, limit_percent=0.2):
        '''
        @summary: Walk down from the root and locate the content node by
                  watching how the punctuation count changes.
        @param:
            soup: the page's DOM tree
            all_punctuations: total punctuation count of the page
            last_percent: the parent node's share of the punctuation marks
            limit_percent: maximum allowed drop of that share
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_punctuations / all_punctuations
                #print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            #print(soup.words)
            return soup
        else:
            return getContent_withPunctuations(pass_limit, all_punctuations, pass_percent)

    def getContent_withStopWords(soup, all_stopwords, last_percent, limit_percent=0.4):
        '''
        @summary: Walk down from the root and locate the content node by
                  watching how the stop-word count changes.
        @param:
            soup: the page's DOM tree
            all_stopwords: total stop-word count of the page
            last_percent: the parent node's share of the stop words
            limit_percent: maximum allowed drop of that share
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_stopwords / all_stopwords
                #print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            #print(soup.words)
            return soup
        else:
            return getContent_withStopWords(pass_limit, all_stopwords, pass_percent)

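    # A note on the three metrics (my interpretation, not from the original
    # file): raw character count favours the largest text block, Chinese
    # punctuation density favours flowing prose over link lists, and the
    # "stop words" counted here are ASCII letters and digits, which tend to
    # concentrate in navigation, scripts and ads rather than in the article.
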
    def getChildsFromTheBeginOfContent(content, content_child, nums, getNums=None, list_childs_title=None, list_childs_time=None, title_len=(6, 30), time_len=40, time_pattern=re.compile(r"\d{2,4}[年/-]\d{1,2}[月/-]\d{1,2}[日\s]?")):
        '''
        @summary: Collect leaf nodes starting from the beginning of the content.
        @param:
            content: the content text
            content_child: the current node
            nums: number of leaf nodes to collect
            getNums: counter of leaf nodes collected so far
            list_childs_title: title candidates collected so far
            list_childs_time: time candidates collected so far
        @return: list of title candidates, list of time candidates
        '''
        # mutable defaults would leak state between calls, so build them here
        if getNums is None:
            getNums = []
        if list_childs_title is None:
            list_childs_title = []
        if list_childs_time is None:
            list_childs_time = []
        if len(content_child.find_all(True)) == 0:
            sum = 0
            appear = 0
            for item in jieba.cut(re.sub("[A-Za-z0-9]", "", content_child.words)):
                if len(findAllIndex(item, content)) > 1:
                    appear += 1
                sum += 1
            if sum >= title_len[0] and sum <= title_len[1]:
                if appear / sum > 0.7:
                    list_childs_title.append([content_child.words, sum, appear, content_child.code])
            if content_child.words is not None:
                if content_child.num_words < time_len:
                    matchs = re.findall(time_pattern, content_child.words)
                    if len(matchs) == 1:
                        list_childs_time.append((matchs[0], content_child.code))
            getNums.append(1)
            if len(getNums) >= nums:
                return list_childs_title, list_childs_time
        for child in content_child.find_all(True, recursive=False):
            if len(getNums) >= nums:
                return list_childs_title, list_childs_time
            getChildsFromTheBeginOfContent(content, child, nums, getNums, list_childs_title, list_childs_time)
        return list_childs_title, list_childs_time

    def getTitleTimeList(soup, child_content, title_list=None, time_list=None, title_len=(6, 30), time_len=40, time_pattern=re.compile(r"\d{2,4}[年/-]\d{1,2}[月/-]\d{1,2}[日\s]?")):
        '''
        @summary: Determine the page title and time relative to the content node.
        @param:
            soup: the page's DOM tree
            child_content: the node holding the main content
            title_list: title candidates found so far
            time_list: time candidates found so far
            title_len: allowed word-count range for a title sentence
            time_len: maximum length of a sentence holding the time
            time_pattern: time regex
        @return: list of title candidates, list of time candidates
        '''
        if title_list is None:
            title_list = []
            time_list = []
        for child in soup.find_all(True, recursive=False):
            if child.words is not None and len(child.words) > 0:
                text = re.sub("[A-Za-z0-9]", "", child.words.strip())
                content = child_content.words.strip()
                sum = 0
                appear = 0
                for item in jieba.cut(text):
                    if str(content).find(item) >= 0:
                        appear += 1
                    sum += 1
                if sum >= title_len[0] and sum <= title_len[1]:
                    if appear / sum > 0.7:
                        title_list.append((child.words, sum, appear, child.code))
            if child.words is not None:
                if child.num_words < time_len:
                    matchs = re.findall(time_pattern, child.words)
                    if len(matchs) == 1:
                        time_list.append((matchs[0], child.code))
            if child != child_content:
                getTitleTimeList(child, child_content, title_list, time_list)
        return title_list, time_list

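    # Illustration of the title heuristic (my reading of the code above): a
    # candidate sentence of 6 to 30 words qualifies when more than 70% of its
    # words reappear in the content; e.g. a 10-word headline with 8 words
    # also present in the body scores 0.8 and is kept as a candidate.
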
    header = {
        "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
        "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.cn%2Fuser%2FsimpleSSOLogin",
        "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
        "Origin": "http://uia.hnist.cn",
        "Upgrade-Insecure-Requests": "1",
    }
    sess = requests.Session()
    sess.headers = header
    data = sess.get(url)
    # requests often guesses ISO-8859-1 from the headers alone; trust the
    # charset detected from the body instead of hard-coding one encoding
    data.encoding = data.apparent_encoding
    data = data.text

    stopTags = ["script", "meta", "link", "style"]
    soup = BeautifulSoup(data, "lxml")
    soup = delStopTags(soup, stopTags)
    stopWords = ["[A-Z]", "[a-z]", "[0-9]"]
    stopWords_pattern = re.compile("|".join(stopWords))
    punctuationWords = "[;,。:、]"
    punctuationWords_pattern = re.compile(punctuationWords)
    soup = recursiveStatistic(soup, stopTags, stopWords_pattern, punctuationWords_pattern)
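
    # Why these patterns (an interpretation, not from the original file): the
    # "stop words" are ASCII letters and digits, which in a Chinese-language
    # page mostly come from URLs, markup leftovers and navigation, while the
    # punctuation class lists the full-width marks typical of Chinese prose.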

    content_child = getContent_withWords(soup, soup.html.num_words, 1)
    #content_child = getContent_withPunctuations(soup, soup.html.num_punctuations, 1)
    #content_child = getContent_withStopWords(soup, soup.html.num_stopwords, 1)

    list_childs_title, list_childs_time = getChildsFromTheBeginOfContent(content_child.words, content_child, 10)
    result = dict()
    title_list, time_list = getTitleTimeList(soup, content_child)
    for item in list_childs_title:
        title_list.append(item)
    for item in list_childs_time:
        time_list.append(item)
    title_list.sort(key=lambda x: x[2] / x[1], reverse=True)
    title_list_max = []

    # keep the candidates with the highest reappearance ratio
    if len(title_list) > 0:
        max_match = title_list[0][2] / title_list[0][1]
        for i in range(len(title_list)):
            if title_list[i][2] / title_list[i][1] == max_match:
                title_list_max.append(title_list[i])
            else:
                break
    route_match = 0
    if len(title_list_max) > 0:
        title = title_list_max[0][0]
        # of those, keep the title closest to the content node
        for i in range(len(title_list_max)):
            match = 0
            for a, b in zip(title_list_max[i][3], content_child.code):
                if a == b:
                    match += 1
            if match > route_match:
                route_match = match
                title = title_list_max[i][0]
        result["title"] = title

    result["content"] = content_child.words
    # keep the time closest to the content node
    if len(time_list) > 0:
        if len(time_list) == 1:
            result["time"] = time_list[0][0]
        else:
            route_match = 0
            the_time = time_list[0][0]
            for i in range(len(time_list)):
                match = 0
                for a, b in zip(time_list[i][1], content_child.code):
                    if a == b:
                        match += 1
                if match > route_match:
                    route_match = match
                    the_time = time_list[i][0]
            result["time"] = the_time
    return result
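
# Minimal usage sketch (the URL is a placeholder; assumes the page is
# reachable and mostly consists of its article text, as analysis() expects):
#
#   result = analysis("http://example.com/news/article.html")
#   print(result.get("title"), result.get("time"))
#   print(result.get("content"))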

if __name__ == "__main__":

    url = "https://www.celap.org.cn/art/2019/6/4/art_563_43889.html"
    '''
    sess = requests.Session()
    data = sess.get(url)
    data = data.text.encode(data.encoding)
    data = data.decode("utf-8")
    '''
    browser = getBrowser()
    browser.get(url)
    data = browser.page_source
    htm = etree.HTML(data)
    htree = etree.ElementTree(htm)
    htm.xpath('//*[@id="zoom"]')
    #print(htm.iter())
    # print every element's text content and xpath in turn
    for t in htm.iter():
        print(t.getparent())
        print(etree.tostring(t, encoding="unicode"))
        print(htree.getpath(t), t.text)
    '''
    b = time.time()
    result = analysis(url)
    print(result)
    '''
    #soup = BeautifulSoup(data, "lxml")
    #print(soup.get_text())
    #print(soup.words)
    #print(soup.body.num_words)
    #print(soup.num_words, soup.num_punctuations, soup.num_stopwords)
    '''
    for child in soup.find_all(True):
        pass
        child.test1 = "1"
        print(child.name, child.words, child.num_words, len(child.find_all(True, recursive=False)), child.string, "---", child.parent.name)
    '''