# K-means.py

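'''
K-means based main-content extraction.

Pipeline (summarized from the code below):
    1. fetch the page DOM with htmlDrawing.getSource and drop stop tags
       (script/meta/link/style);
    2. recursiveStatistic annotates every tag with leaf counts, "non-content" leaf
       counts (links, inputs, clickable nodes), word sets and tag sets;
    3. getInputOfKmeans turns each leaf into a 1-D feature: the share of non-content
       leaves under its nearest multi-leaf ancestor;
    4. kmeans splits the leaves into a content cluster and a non-content cluster and
       lets each word vote for or against the content word set;
    5. getMaxIoU picks the tag whose word/tag sets best overlap that content set, and
       removeNcontentTag prunes mostly non-content children before returning the text.
'''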
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import requests
import jieba
import numpy as np
import Utils
import htmlDrawing as hd
from Utils import findAllIndex
import gzip
import io


def analysis(url):
    '''
    @summary: analyze a page and extract its main content; intended for pages whose
              main content accounts for most of the page text
    @param:
        url: the page to extract from
    @return: the main-content text of the page (the title/time extraction survives
             only in the commented-out legacy block below); empty string if nothing
             is found
    '''
    def delStopTags(soup, stopTags):
        '''
        @summary: remove all stop tags from the page DOM tree
        @param:
            soup: DOM tree of the page
            stopTags: tag names to drop
        @return: the DOM tree with those tags removed
        '''
        for item in stopTags:
            for tag in soup.find_all(item):
                tag.decompose()
        return soup
    def recursiveStatistic(soup):
        '''
        @summary: recursively annotate every tag with its leaf count, non-content
                  leaf count, word set and tag set
        @param:
            soup: DOM tree of the page
        @return: (leaf_count, Ncontent_leaf_count, words_set, tag_set) of the subtree
        '''
        def getParent(child):
            # climb up until a parent with more than one direct child is found
            if len(child.parent.find_all(recursive=False)) > 1:
                return child.parent
            else:
                return getParent(child.parent)

        childs = soup.find_all(recursive=False)
        if len(childs) == 0:
            # a leaf counts as "non-content" if it is a link, an input, clickable,
            # or sits directly inside a link
            if (soup.name in ["a"] and "href" in soup.attrs) or soup.name in ["input"] \
                    or "onclick" in soup.attrs or soup.parent.name in ["a"] \
                    or soup.parent.parent.name in ["a"]:
                soup.Ncontent_leaf_count = 1
            else:
                soup.Ncontent_leaf_count = 0
            soup.leaf_count = 1
            soup.leaf_is = True
            text = soup.get_text()
            soup.words_set = set(jieba.cut(text))
            soup.tag_set = set([soup])
            #print(soup.name, soup.parent.name, soup.parent.get_text())
            return soup.leaf_count, soup.Ncontent_leaf_count, soup.words_set, soup.tag_set
        else:
            leaf_count = 0
            Ncontent_leaf_count = 0
            words_set = set()
            tag_set = set()
            for child in childs:
                result = recursiveStatistic(child)
                leaf_count += result[0]
                Ncontent_leaf_count += result[1]
                words_set = words_set | set(jieba.cut(child.get_text()))
                tag_set = tag_set | result[3]
            soup.leaf_count = leaf_count
            soup.Ncontent_leaf_count = Ncontent_leaf_count
            soup.leaf_is = False
            soup.words_set = words_set
            soup.tag_set = tag_set
            return leaf_count, Ncontent_leaf_count, words_set, tag_set
    def getInputOfKmeans(soup):
        def getPatent(child):
            # climb up to the nearest ancestor that contains more than one leaf
            if child.parent.leaf_count > 1:
                return child.parent
            else:
                return getPatent(child.parent)

        prob_content = 0.5
        prob_Ncontent = 1
        node_list = []
        feature_list = []
        for child in soup.find_all(recursive=True):
            if child.leaf_is:
                parent = getPatent(child)
                # feature: share of non-content leaves under the nearest multi-leaf
                # ancestor, scaled down when the leaf itself looks like content
                if child.Ncontent_leaf_count > 0:
                    feature = prob_Ncontent * parent.Ncontent_leaf_count / parent.leaf_count
                else:
                    feature = prob_content * parent.Ncontent_leaf_count / parent.leaf_count
                node_list.append(child)
                feature_list.append(feature)
        # contextual features are built but not used by the current pipeline
        contextFeature_list = []
        for i in range(len(node_list)):
            last_1 = i - 1
            next_1 = (i + 1) % len(node_list)
            contextFeature_list.append([feature_list[last_1], feature_list[i], feature_list[next_1]])
        return node_list, feature_list
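
    # Two-cluster (content vs. non-content) k-means over the 1-D leaf features:
    # centers start at 0.01 and 0.2, are updated with the cluster median, and the L1
    # distance to each center is divided by that cluster's feature sum (the weighting
    # applied by getDistance below). Cluster 0 is treated as the content cluster; each
    # word then votes +1 (content leaf) or -1 (non-content leaf), and words with a
    # positive total form the content word set.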
    def kmeans(node_list, feature_list):
        def getText(child, words_len):
            if child.parent.leaf_count > 1:
                return child.parent.words_set
                #return set(jieba.cut(child.parent.get_text()))
            else:
                return getText(child.parent, words_len)

        def getDistance(feature_list, init_hears, nearst_sum):
            # L1 distance from every feature to each cluster center, scaled by the
            # per-cluster feature sum
            distance = np.repeat(np.array(feature_list), 2, axis=0)
            distance = np.reshape(distance, (-1, len(init_hears), len(init_hears[0])))
            means = np.array(init_hears)
            the_distance = np.zeros_like(distance)
            for i in range(len(the_distance)):
                the_distance[i] = distance[i] - means
            return np.sum(np.abs(the_distance), axis=2) / nearst_sum
            '''
            distance = np.array(feature_list).repeat(2)
            distance = np.reshape(distance,(-1,2))
            means = np.array(init_hears)
            return np.abs(distance-means)/nearst_sum
            '''
        init_hears = [[0.01], [0.2]]
        feature_list = np.array(feature_list)
        last_nearst = np.zeros((len(feature_list),))
        # keep the cluster sums as floats so the assignments below are not truncated
        last_nearst_sum = np.array([1.0, 1.0])
        while True:
            distance = getDistance(feature_list, init_hears, last_nearst_sum)
            current_nearst = np.argmin(distance, axis=1)
            if (last_nearst == current_nearst).all():
                break
            for i in range(len(init_hears)):
                median = np.median(feature_list[current_nearst == i], axis=0)
                if not np.isnan(median):
                    init_hears[i] = [median]
                    last_nearst_sum[i] = np.sum(feature_list[current_nearst == i])
            last_nearst = current_nearst
        content_words_set = set()
        expectation_dict = dict()
        print("nearst", current_nearst)
        # assign every word on the page to a cluster and accumulate its vote
        content_tag_set = set()
        for node, nearst in zip(node_list, current_nearst):
            if nearst == 0:
                #print(node.parent.get_text())
                content_tag_set.add(node)
                node.nearst = nearst
                for word in getText(node, len(node.words_set)):
                    #for word in node.words_set:
                    if word in expectation_dict.keys():
                        expectation_dict[word] += 1
                    else:
                        expectation_dict[word] = 1
            else:
                node.nearst = nearst
                for word in getText(node, len(node.words_set)):
                    #for word in node.words_set:
                    if word in expectation_dict.keys():
                        expectation_dict[word] += -1
                    else:
                        expectation_dict[word] = -1
        for key in expectation_dict.keys():
            if expectation_dict[key] > 0:
                content_words_set.add(key)
        #print(content_words_set)
        return content_words_set, content_tag_set
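
    # Node selection: score every tag by a weighted IoU between its word set / tag set
    # and the content word set / content tag set produced by kmeans (tag overlap
    # weighted 0.7, word overlap 0.3) and keep the best-scoring node as the candidate
    # main-content node.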
    def getMaxIoU(soup, content_words_set, content_tag_set):
        maxIoU = 0
        node_maxIoU = None
        prob_tag = 0.7
        for child in soup.find_all(recursive=True):
            IoU_1 = len(content_words_set & child.words_set) / (len(content_words_set | child.words_set) + 0.0001)
            IoU_2 = len(content_tag_set & child.tag_set) / (len(content_tag_set | child.tag_set) + 0.001)
            #print(IoU_1, IoU_2)
            IoU = IoU_1 * (1 - prob_tag) + IoU_2 * prob_tag
            if IoU >= maxIoU:
                maxIoU = IoU
                node_maxIoU = child
            '''
            if IoU>0.4:
                print(IoU)
                print(child.get_text())
            '''
        #print(maxIoU)
        return node_maxIoU
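
    # Pruning: drop direct children of the selected node whose leaves are mostly
    # non-content (more than 2 leaves, over 70% of them in the non-content cluster).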
    def removeNcontentTag(node):
        def getPercentOfNcontent(soup):
            leaf_count = 0
            NContent_leaf_count = 0
            for child in soup.find_all(recursive=True):
                if child.leaf_is:
                    if child.nearst == 1:
                        NContent_leaf_count += 1
                    leaf_count += 1
            if leaf_count > 0:
                return NContent_leaf_count / leaf_count, leaf_count
            else:
                return 0, leaf_count

        for child in node.find_all(recursive=False):
            if child.leaf_count > 1:
                percent, leaf_count = getPercentOfNcontent(child)
                if leaf_count > 2 and percent > 0.7:
                    #print(child.get_text(), leaf_count, percent)
                    child.decompose()
        return node
    def removeTag_byRule(soup, keyword_pattern="访问量|打印|浏览次数|上一篇|下一篇"):
        # rule-based cleanup: the default keywords match "visit count", "print",
        # "view count", "previous article" and "next article" navigation text
        words_len = 8
        for child in soup.find_all(recursive=True):
            if child.leaf_is:
                parent_text = child.parent.get_text()
                child_text = child.get_text()
                if re.search(keyword_pattern, parent_text) is not None:
                    if re.search(keyword_pattern, child_text) is None:
                        child.parent.decompose()
                    else:
                        if len(parent_text) - len(child_text) > words_len:
                            child.decompose()
                        else:
                            child.parent.decompose()
        return soup
    soup = hd.getSource(url)
    #print(soup)
    stopTags = ["script", "meta", "link", "style"]
    delStopTags(soup, stopTags)
    #print(soup.get_text())
    stopWords = ["[A-Z]", "[a-z]", "[0-9]"]
    stopWords_pattern = re.compile("|".join(stopWords))
    punctuationWords = "[;,。:、]"
    punctuationWords_pattern = re.compile(punctuationWords)
    a = time.time()
    recursiveStatistic(soup)
    result = dict()
    '''
    for child in soup.find_all(recursive=True):
        print(child.name, child.leaf_is, child.Ncontent_leaf_count, child.leaf_count)
    node_list, feature_list = getInputOfKmeans_context(soup)
    node = getMaxIoU(soup, kmeans_context(node_list, feature_list))
    '''
    node_list, feature_list = getInputOfKmeans(soup)
    word_set, tag_set = kmeans(node_list, feature_list)
    node = getMaxIoU(soup, word_set, tag_set)
    node = removeNcontentTag(node)
    #node = removeTag_byRule(node)
    if node:
        return node.get_text()
    else:
        return ""
    '''
    Legacy content/title/time extraction, kept in the original source as a
    commented-out block (unreachable after the returns above):

    content_child = getContent_withWords(soup, soup.html.num_words, 1)
    #content_child = getContent_withPunctuations(soup,soup.html.num_punctuations,1)
    #content_child = getContent_withPunctuations(soup,soup.num_stopwords,1)
    list_childs_title,list_childs_time = getChildsFromTheBeginOfContent(content_child.words,content_child, 10)
    title_list,time_list = getTitleTimeList(soup, content_child)
    for item in list_childs_title:
        title_list.append(item)
    for item in list_childs_time:
        time_list.append(item)
    title_list.sort(key=lambda x:x[2]/x[1],reverse=True)
    title_list_max = []
    # keep the sentences with the highest match rate
    if len(title_list)>0:
        max_match = title_list[0][2]/title_list[0][1]
        for i in range(len(title_list)):
            if title_list[i][2]/title_list[i][1]==max_match:
                title_list_max.append(title_list[i])
            else:
                break
    route_match = 0
    if len(title_list_max)>0:
        title = title_list_max[0][0]
        # take the title closest to the main content
        for i in range(len(title_list_max)):
            match = 0
            for a,b in zip(title_list_max[i][3],content_child.code):
                if a==b:
                    match += 1
            if match > route_match:
                route_match = match
                title = title_list_max[i][0]
        result["title"] = title
    result["content"] = content_child.words
    # take the time closest to the main content
    if len(time_list)>0:
        if len(time_list)==1:
            result["time"] = time_list[0][0]
        else:
            route_match = 0
            the_time = time_list[0][0]
            for i in range(len(time_list)):
                match = 0
                for a,b in zip(time_list[i][1],content_child.code):
                    if a == b:
                        match += 1
                if match>route_match:
                    route_match = match
                    the_time = time_list[i][0]
            result["time"] = the_time
    '''

import psycopg2

conn = psycopg2.connect(dbname="htmlExtract", user="postgres", password="postgres", host="192.168.2.101")
cursor = conn.cursor()
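
# Evaluation harness: getLabelData / getLabelData_withUrl read labeled pages from the
# label_html table of the htmlExtract Postgres database, and test() compares the output
# of analysis() against the labels with getAccAndRecall.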


def getAccAndRecall(label_content, predict_content, whole_same=True):
    label_content = re.sub(r"\r|\n|\s", "", label_content)
    predict_content = re.sub(r"\r|\n|\s", "", predict_content)
    #print(label_content)
    #print(predict_content)
    if whole_same:
        # strict mode: only an exact match counts
        if label_content == predict_content:
            return 1, 1
        else:
            return 0, 0
    else:
        # relaxed mode: precision/recall over the jieba word sets
        content_set1 = set(jieba.cut(label_content))
        content_set2 = set(jieba.cut(predict_content))
        inter_counts = len(content_set1 & content_set2)
        label_counts = len(content_set1)
        predict_counts = len(content_set2)
        print("diff", (content_set1 | content_set2) - (content_set1 & content_set2))
        return inter_counts / (predict_counts + 0.001), inter_counts / (label_counts + 0.001)


def getLabelData():
    sql = " select url,content from label_html where content is not NULL and content!='' limit 300"
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows


def getLabelData_withUrl(url):
    # parameterized query instead of string concatenation to avoid SQL injection
    cursor.execute("select url,content from label_html where url=%s", (url,))
    rows = cursor.fetchall()
    return rows


def test(rows):
    all_acc = 0
    all_recall = 0
    counts = 0
    notgood = []
    for row in rows:
        url = row[0]
        print("url:", url)
        content = row[1]
        content_predict = analysis(url)
        acc, recall = getAccAndRecall(content, content_predict)
        if acc < 0.9:
            notgood.append(url)
        counts += 1
        all_acc += acc
        all_recall += recall
    print("acc:%f,recall:%f" % (all_acc / counts, all_recall / counts))
    for url in notgood:
        print(url)


if __name__ == "__main__":
    url = "https://blog.csdn.net/studysinklc/article/details/78017330"
    result = analysis(url)
    print(result)
    #test(getLabelData_withUrl(url))
    # NOTE: `browser` is not defined in this module; presumably it is the shared
    # browser instance opened by htmlDrawing (hd) for fetching pages
    browser.close()
    '''
    a = time.time()
    test(getLabelData())
    print("takes", time.time() - a)
    '''
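

# The sketch below is not part of the original module: it is a minimal, self-contained
# illustration of the same style of two-cluster 1-D k-means used inside analysis.kmeans
# (median centers, L1 distance scaled by each cluster's feature sum). The feature values
# are invented for demonstration only.
def _kmeans_demo():
    # hypothetical leaf features: small values ~ content leaves, large values ~ non-content
    features = np.array([0.02, 0.01, 0.03, 0.55, 0.6, 0.02, 0.7])
    centers = np.array([0.01, 0.2])   # initial centers, as in analysis.kmeans
    weights = np.array([1.0, 1.0])    # per-cluster feature sums used to scale distances
    last = np.zeros(len(features))
    while True:
        # weighted L1 distance of every feature to each center
        distance = np.abs(features[:, None] - centers[None, :]) / weights
        nearest = np.argmin(distance, axis=1)
        if (nearest == last).all():
            break
        for i in range(len(centers)):
            members = features[nearest == i]
            if len(members) > 0:
                centers[i] = np.median(members)
                weights[i] = np.sum(members)
        last = nearest
    print("centers:", centers, "assignment:", nearest)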