htmlAnalysisWithBS.py
import sys
sys.path.append("../")
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import requests
import jieba
from module.Utils import findAllIndex
from lxml import etree
from module.htmlDrawing import getBrowser
def analysis(url):
    '''
    @summary: Analyze a web page and extract its main text, title and time.
              Only suitable for pages where the main text makes up most of
              the page's visible text.
    @param:
        url: the page to extract from
    @return: type:dict with the main text, title and time
    '''
    def delStopTags(soup, stopTags):
        '''
        @summary: Remove all stop tags from the page's DOM tree
        @param:
            soup: the page's DOM tree
            stopTags: tags to strip
        @return: the DOM tree
        '''
        for item in stopTags:
            for tag in soup.find_all(item):
                tag.decompose()
        return soup
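    # A minimal sketch of what delStopTags does (hypothetical markup, for
    # illustration only):
    #   soup = BeautifulSoup("<div><script>x()</script><p>body text</p></div>", "lxml")
    #   delStopTags(soup, ["script"])   # the <script> node is decomposed, <p> survives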
    def recursiveStatistic(soup, stopTags, stopWords_pattern, punctuationWords_pattern, parent_code="ROOT"):
        '''
        @summary: Recursively count, per tag, the number of characters,
                  stop words and punctuation marks
        @param:
            soup: the page's DOM tree
            stopTags: tags to skip
            stopWords_pattern: stop-word regex
            punctuationWords_pattern: punctuation regex
            parent_code: code of the parent node
        @return: the DOM tree annotated with the statistics
        '''
        i = 0
        for child in soup.find_all(True, recursive=False):
            if child.name is not None and child.name.strip().lower() not in stopTags:
                i += 1
                # The statistics are stashed as plain attributes on each Tag;
                # child.code is a positional code, two digits per nesting level.
                child.code = parent_code + ("0" + str(i) if i < 10 else str(i))
                child.words = re.sub(r"[\s\r\n]+", "", child.get_text().strip()) if (child.get_text() is not None) else ""
                child.num_words = len(child.words)
                child.num_stopwords = len(re.findall(stopWords_pattern, child.words))
                child.num_punctuations = len(re.findall(punctuationWords_pattern, child.words))
                recursiveStatistic(child, stopTags, stopWords_pattern, punctuationWords_pattern, child.code)
        return soup
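    # Sketch of the code scheme produced above (two digits per nesting level):
    #   <html> -> ROOT01, its <head> -> ROOT0101, its <body> -> ROOT0102,
    #   the first tag inside <body> -> ROOT010201, and so on.  Shared prefixes
    #   therefore measure how close two nodes sit in the tree.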
    def getContent_withWords(soup, all_words, last_percent, limit_percent=0.3):
        '''
        @summary: Walk down from the root and locate the node holding the
                  main text by watching how the character count changes
        @param:
            soup: the page's DOM tree
            all_words: total number of characters
            last_percent: share of the characters held by the parent node
            limit_percent: the largest share that may be lost in one step
        @return: the DOM node holding the main text
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_words / all_words
                #print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            #print(soup.words)
            return soup
        else:
            return getContent_withWords(pass_limit, all_words, pass_percent)
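    # Worked example of the descent rule: if <body> holds 90% of all characters
    # and one of its children holds 85%, the loss is 0.05 < limit_percent (0.3),
    # so the walk moves into that child; if every child loses more than the
    # limit, the text is spread out and the current node is returned.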
    def getContent_withPunctuations(soup, all_punctuations, last_percent, limit_percent=0.2):
        '''
        @summary: Walk down from the root and locate the node holding the
                  main text by watching how the punctuation count changes
        @param:
            soup: the page's DOM tree
            all_punctuations: total number of punctuation marks
            last_percent: share of the punctuation held by the parent node
            limit_percent: the largest share that may be lost in one step
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_punctuations / all_punctuations
                #print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            #print(soup.words)
            return soup
        else:
            return getContent_withPunctuations(pass_limit, all_punctuations, pass_percent)
    def getContent_withStopWords(soup, all_stopwords, last_percent, limit_percent=0.4):
        '''
        @summary: Walk down from the root and locate the node holding the
                  main text by watching how the stop-word count changes
        @param:
            soup: the page's DOM tree
            all_stopwords: total number of stop words
            last_percent: share of the stop words held by the parent node
            limit_percent: the largest share that may be lost in one step
        '''
        pass_limit = None
        pass_percent = last_percent
        for child in soup.find_all(True, recursive=False):
            if child.num_words is not None:
                percent = child.num_stopwords / all_stopwords
                #print(child.name, last_percent, percent)
                if last_percent - percent < limit_percent:
                    pass_limit = child
                    pass_percent = percent
                    break
        if pass_limit is None:
            #print(soup.words)
            return soup
        else:
            # The original recursed into getContent_withPunctuations here,
            # which was a copy-paste bug.
            return getContent_withStopWords(pass_limit, all_stopwords, pass_percent)
    def getChildsFromTheBeginOfContent(content, content_child, nums, getNums=None, list_childs_title=None, list_childs_time=None, title_len=(6, 30), time_len=40, time_pattern=re.compile(r"\d{2,4}[年/-]\d{1,2}[月/-]\d{1,2}[日\s]?")):
        '''
        @summary: Collect leaf nodes starting from the beginning of the main text
        @param:
            content: the main text
            content_child: the current node
            nums: number of leaf nodes to collect
            getNums: counter of leaves collected so far
        @return: list of title candidates, list of time candidates
        '''
        # The accumulators are created on the first call; mutable default
        # arguments would leak state between calls to analysis().
        if getNums is None:
            getNums = []
        if list_childs_title is None:
            list_childs_title = []
        if list_childs_time is None:
            list_childs_time = []
        if len(content_child.find_all(True)) == 0:
            total = 0
            appear = 0
            for item in jieba.cut(re.sub("[A-Za-z0-9]", "", content_child.words)):
                if len(findAllIndex(item, content)) > 1:
                    appear += 1
                total += 1
            if total >= title_len[0] and total <= title_len[1]:
                if appear / total > 0.7:
                    list_childs_title.append([content_child.words, total, appear, content_child.code])
            if content_child.words is not None:
                if content_child.num_words < time_len:
                    matchs = re.findall(time_pattern, content_child.words)
                    if len(matchs) == 1:
                        list_childs_time.append((matchs[0], content_child.code))
            getNums.append(1)
            if len(getNums) >= nums:
                return list_childs_title, list_childs_time
        for child in content_child.find_all(True, recursive=False):
            if len(getNums) >= nums:
                return list_childs_title, list_childs_time
            getChildsFromTheBeginOfContent(content, child, nums, getNums, list_childs_title, list_childs_time)
        # The original fell off the end here and returned None, which crashed
        # the tuple unpacking at the call site.
        return list_childs_title, list_childs_time
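    # The 0.7 heuristic above: a short leaf of 6-30 tokens, almost all of which
    # occur more than once in the main text, is treated as a title candidate,
    # since headlines tend to be repeated or paraphrased in the article body.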
    def getTitleTimeList(soup, child_content, title_list=None, time_list=None, title_len=(6, 30), time_len=40, time_pattern=re.compile(r"\d{2,4}[年/-]\d{1,2}[月/-]\d{1,2}[日\s]?")):
        '''
        @summary: Determine the page's title and time from the node that
                  holds the main text
        @param:
            soup: the page's DOM tree
            child_content: the node holding the main text
            title_list: titles matching the criteria
            time_list: times matching the criteria
            title_len: allowed token count of a title sentence
            time_len: maximum length of a sentence holding a time
            time_pattern: time regex
        @return: list of title candidates, list of time candidates
        '''
        if title_list is None:
            title_list = []
            time_list = []
        for child in soup.find_all(True, recursive=False):
            if child.words is not None and len(child.words) > 0:
                text = re.sub("[A-Za-z0-9]", "", child.words.strip())
                content = child_content.words.strip()
                total = 0
                appear = 0
                for item in jieba.cut(text):
                    if str(content).find(item) >= 0:
                        appear += 1
                    total += 1
                if total >= title_len[0] and total <= title_len[1]:
                    if appear / total > 0.7:
                        title_list.append((child.words, total, appear, child.code))
                if child.num_words < time_len:
                    matchs = re.findall(time_pattern, child.words)
                    if len(matchs) == 1:
                        time_list.append((matchs[0], child.code))
            if child != child_content:
                getTitleTimeList(child, child_content, title_list, time_list)
        return title_list, time_list
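    # Candidates carry their positional code (see recursiveStatistic); the code
    # below ranks them and prefers the candidate whose code shares the longest
    # prefix with the content node's code, i.e. the one closest to the main text.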
    header = {
        "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
        "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.cn%2Fuser%2FsimpleSSOLogin",
        "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
        "Origin": "http://uia.hnist.cn",
        "Upgrade-Insecure-Requests": "1",
    }
    sess = requests.Session()
    sess.headers = header
    data = sess.get(url)
    # Let requests sniff the encoding from the body instead of hard-coding
    # "gb2312", which broke on pages served in any other encoding.
    data.encoding = data.apparent_encoding
    data = data.text
    stopTags = ["script", "meta", "link", "style"]
    #data = urllib.request.urlopen(url).read().decode("utf-8")
    soup = BeautifulSoup(data, "lxml")
    soup = delStopTags(soup, stopTags)
    stopWords = ["[A-Z]", "[a-z]", "[0-9]"]
    stopWords_pattern = re.compile("|".join(stopWords))
    punctuationWords = "[;,。:、]"
    punctuationWords_pattern = re.compile(punctuationWords)
    soup = recursiveStatistic(soup, stopTags, stopWords_pattern, punctuationWords_pattern)
    content_child = getContent_withWords(soup, soup.html.num_words, 1)
    #content_child = getContent_withPunctuations(soup, soup.html.num_punctuations, 1)
    #content_child = getContent_withStopWords(soup, soup.html.num_stopwords, 1)
    list_childs_title, list_childs_time = getChildsFromTheBeginOfContent(content_child.words, content_child, 10)
    result = dict()
    title_list, time_list = getTitleTimeList(soup, content_child)
    for item in list_childs_title:
        title_list.append(item)
    for item in list_childs_time:
        time_list.append(item)
    title_list.sort(key=lambda x: x[2] / x[1], reverse=True)
    title_list_max = []
    # Keep only the sentences with the highest reappearance ratio
    if len(title_list) > 0:
        max_match = title_list[0][2] / title_list[0][1]
        for i in range(len(title_list)):
            if title_list[i][2] / title_list[i][1] == max_match:
                title_list_max.append(title_list[i])
            else:
                break
    route_match = 0
    if len(title_list_max) > 0:
        title = title_list_max[0][0]
        # Among those, keep the title closest to the main text
        for i in range(len(title_list_max)):
            match = 0
            for a, b in zip(title_list_max[i][3], content_child.code):
                if a == b:
                    match += 1
            if match > route_match:
                route_match = match
                title = title_list_max[i][0]
        result["title"] = title
    result["content"] = content_child.words
    # Keep the time closest to the main text
    if len(time_list) > 0:
        if len(time_list) == 1:
            result["time"] = time_list[0][0]
        else:
            route_match = 0
            the_time = time_list[0][0]
            for i in range(len(time_list)):
                match = 0
                for a, b in zip(time_list[i][1], content_child.code):
                    if a == b:
                        match += 1
                if match > route_match:
                    route_match = match
                    the_time = time_list[i][0]
            result["time"] = the_time
    return result
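# A minimal usage sketch (network access required; the URL is hypothetical).
# "content" is always set; "title" and "time" only when candidates are found:
#   result = analysis("http://example.com/news/article.html")
#   print(result.get("title"), result.get("time"))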
if __name__ == "__main__":
    url = "https://www.celap.org.cn/art/2019/6/4/art_563_43889.html"
    '''
    sess = requests.Session()
    data = sess.get(url)
    data = data.text.encode(data.encoding)
    data = data.decode("utf-8")
    '''
    browser = getBrowser()
    browser.get(url)
    data = browser.page_source
    htm = etree.HTML(data)
    htree = etree.ElementTree(htm)
    # The original called etree.xpath(...), but xpath is a method of the
    # element, not of the lxml.etree module.
    htm.xpath('//*[@id="zoom"]')
    #print(htm.iter())
    ### Print each element's text content and XPath, one by one
    for t in htm.iter():
        print(t.getparent())
        print(etree.tostring(t, encoding="unicode"))
        print(htree.getpath(t), t.text)
    '''
    b = time.time()
    result = analysis(url)
    print(result)
    '''
    #soup = BeautifulSoup(data, "lxml")
    #print(soup.get_text())
    #print(soup.words)
    #print(soup.body.num_words)
    #print(soup.num_words, soup.num_punctuations, soup.num_stopwords)
    '''
    for child in soup.find_all(True):
        pass
        child.test1 = "1"
        print(child.name, child.words, child.num_words, len(child.find_all(True, recursive=False)), child.string, "---", child.parent.name)
    '''