# htmlAnalysisByCompare.py
import re
import time
import urllib.request

import jieba
import numpy
import requests
from bs4 import BeautifulSoup

import htmlDrawing as hd
from Utils import findAllIndex
  10. def analysis(list_url):
  11. '''
  12. @summary: 分析网页,做正文、标题、时间的抽取,根据多个同源网站的剔除相同的内容,余下的就是正文
  13. @param:
  14. url: 要提取的网页
  15. @return: type:dict 正文、标题、时间的字典
  16. '''
  17. def delStopTags(list_soup,stopTags):
  18. '''
  19. @summary: 从网页DOM树中删除所有的停用标签
  20. @param:
  21. list_soup: 多个同源网页DOM树
  22. stopTags: 停用标签
  23. @return: 网页DOM树
  24. '''
  25. for soup in list_soup:
  26. for item in stopTags:
  27. for tag in soup.find_all(item):
  28. tag.decompose()
  29. return list_soup
  30. def getPath_code_Text(soup,result,code=""):
  31. '''
  32. @summary: 从网页DOM树中拿到路径、标签引用、文本
  33. @param:
  34. soup: 网页DOM
  35. @return: {路径,[[标签引用,文本]]}
  36. '''
  37. for child in soup.find_all(True,recursive=False):
  38. path = code+child.name
  39. if path in result.keys():
  40. result[path].append([child,re.sub("[\s\r\n]*","",child.get_text().strip())])
  41. else:
  42. result[path] = [[child,re.sub("[\s\r\n]*","",child.get_text().strip())]]
  43. getPath_code_Text(child, result, path)
  44. return result
  45. def getTheSameTagsOfSameText(path,text,list_PathCodeText):
  46. '''
  47. @summary: 从多个网页的path-code-text中获取路径相同文本相同的tag
  48. '''
  49. list_child = []
  50. if text=="":
  51. return None
  52. for dict_pct in list_PathCodeText:
  53. if path in dict_pct.keys():
  54. list_TagText = dict_pct[path]
  55. for TagText in list_TagText:
  56. if text==TagText[1] and text!="":
  57. list_child.append(TagText[0])
  58. break
  59. if len(list_child)==len(list_PathCodeText):
  60. return list_child
  61. return None
  62. def removeTheSameTags(list_PathCodeText):
  63. '''
  64. @summary: 剔除路径和文本都一样的标签节点
  65. @param:
  66. list_PathCodeText: type:list,多个网页经过getPath_Code_Text方法得到的结果
  67. '''
  68. if len(list_PathCodeText)>1:
  69. dict_1 = list_PathCodeText[0]
  70. for path in dict_1.keys():
  71. list_TagText = dict_1[path]
  72. #print("--",list_TagText)
  73. for TagText in list_TagText:
  74. Tag = TagText[0]
  75. Text = TagText[1]
  76. sameTags = getTheSameTagsOfSameText(path, Text, list_PathCodeText[1:])
  77. if sameTags is not None:
  78. #print(path)
  79. Tag.decompose()
  80. for tag in sameTags:
  81. tag.decompose()
  82. list_soup = []
  83. for url in list_url:
  84. soup = hd.getSource(url)
  85. list_soup.append(soup)
  86. stopTags = ["script","meta","link","style","head"]
  87. list_soup = delStopTags(list_soup, stopTags)
  88. list_PathCodeText = []
  89. for soup in list_soup:
  90. list_PathCodeText.append(getPath_code_Text(soup,dict()))
  91. #print(list_PathCodeText[0])
  92. removeTheSameTags(list_PathCodeText)
  93. for soup in list_soup:
  94. print(soup.get_text())
  95. if __name__=="__main__":
  96. url = ["http://gtj.taiyuan.gov.cn/doc/2018/08/30/661759.shtml",
  97. "http://gtj.taiyuan.gov.cn/doc/2018/07/09/590197.shtml"]
  98. b = time.time()
  99. result = analysis(url)
  100. print(time.time()-b)