import urllib.request
from bs4 import BeautifulSoup
import re
import time
import requests
import jieba
import numpy as np
import Utils
import htmlDrawing as hd
from Utils import findAllIndex
import gzip
import io


def analysis(url):
    '''
    @summary: analyze a web page and extract its main content; only intended for pages
              where the main content makes up most of the page text (the title/time
              extraction of an earlier version is kept in the commented-out block below)
    @param:
        url: the web page to extract from
    @return: the extracted main-content text as a string, or an empty string if no
             content node is found
    '''

    def delStopTags(soup, stopTags):
        '''
        @summary: remove every occurrence of the stop tags from the page DOM tree
        @param:
            soup: DOM tree of the page
            stopTags: list of tag names to drop
        @return: the DOM tree with the stop tags removed
        '''
        for item in stopTags:
            for tag in soup.find_all(item):
                tag.decompose()
        return soup

    def recursiveStatistic(soup):
        '''
        @summary: recursively annotate every tag with the statistics used by the later
                  steps: leaf_count, Ncontent_leaf_count (leaves that look like
                  navigation, i.e. links, inputs or nodes with onclick handlers),
                  words_set (jieba tokens of the subtree text), tag_set (the leaf tags
                  of the subtree) and leaf_is
        @param:
            soup: DOM tree of the page
        @return: (leaf_count, Ncontent_leaf_count, words_set, tag_set) of the given node
        '''
        def getParent(child):
            if len(child.parent.find_all(recursive=False)) > 1:
                return child.parent
            else:
                return getParent(child.parent)

        childs = soup.find_all(recursive=False)
        if len(childs) == 0:
            # a leaf counts as non-content when it is a link, an input, carries an
            # onclick handler, or sits directly under a link
            if (soup.name in ["a"] and "href" in soup.attrs) or soup.name in ["input"] or "onclick" in soup.attrs or soup.parent.name in ["a"] or soup.parent.parent.name in ["a"]:
                soup.Ncontent_leaf_count = 1
            else:
                soup.Ncontent_leaf_count = 0
            soup.leaf_count = 1
            soup.leaf_is = True
            text = soup.get_text()
            soup.words_set = set(jieba.cut(text))
            soup.tag_set = set([soup])
            #print(soup.name,soup.parent.name,soup.parent.get_text())
            return soup.leaf_count, soup.Ncontent_leaf_count, soup.words_set, soup.tag_set
        else:
            leaf_count = 0
            Ncontent_leaf_count = 0
            words_set = set()
            tag_set = set()
            for child in childs:
                result = recursiveStatistic(child)
                leaf_count += result[0]
                Ncontent_leaf_count += result[1]
                words_set = words_set | set(jieba.cut(child.get_text()))
                tag_set = tag_set | result[3]
            soup.leaf_count = leaf_count
            soup.Ncontent_leaf_count = Ncontent_leaf_count
            soup.leaf_is = False
            soup.words_set = words_set
            soup.tag_set = tag_set
            return leaf_count, Ncontent_leaf_count, words_set, tag_set
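
    # Illustrative example (assumed markup, not taken from a real page): for a fragment
    # like <div><p>hello</p><a href="#">more</a></div> sitting somewhere inside a page,
    # recursiveStatistic would annotate the <div> with leaf_count=2 and
    # Ncontent_leaf_count=1, because the <a href> leaf counts as navigation while the
    # plain <p> leaf does not.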

    def getInputOfKmeans(soup):

        def getPatent(child):
            if child.parent.leaf_count > 1:
                return child.parent
            else:
                return getPatent(child.parent)

        prob_content = 0.5
        prob_Ncontent = 1
        node_list = []
        feature_list = []

        for child in soup.find_all(recursive=True):
            if child.leaf_is:
                parent = getPatent(child)
                if child.Ncontent_leaf_count > 0:
                    feature = prob_Ncontent * parent.Ncontent_leaf_count / parent.leaf_count
                else:
                    feature = prob_content * parent.Ncontent_leaf_count / parent.leaf_count
                node_list.append(child)
                feature_list.append(feature)

        contextFeature_list = []
        for i in range(len(node_list)):
            last_1 = i - 1
            next_1 = (i + 1) % len(node_list)
            contextFeature_list.append([feature_list[last_1], feature_list[i], feature_list[next_1]])
        return node_list, feature_list
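
    # Note on the feature built by getInputOfKmeans (a sketch of the intent inferred
    # from the code): every leaf is scored by the non-content ratio of its nearest
    # ancestor owning more than one leaf, weighted by prob_Ncontent=1 for leaves that
    # themselves look like navigation and prob_content=0.5 for plain-text leaves.
    # For example, under an ancestor with 10 leaves of which 8 are links, a link leaf
    # scores 1 * 8/10 = 0.8 while a text leaf scores 0.5 * 8/10 = 0.4, so boilerplate
    # regions drift towards larger feature values. contextFeature_list is assembled but
    # not returned; only the per-leaf features reach the clustering step.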

    def kmeans(node_list, feature_list):

        def getText(child, words_len):
            if child.parent.leaf_count > 1:
                return child.parent.words_set
                #return set(jieba.cut(child.parent.get_text()))
            else:
                return getText(child.parent, words_len)

        def getDistance(feature_list, init_hears, nearst_sum):

            distance = np.repeat(np.array(feature_list), 2, axis=0)
            distance = np.reshape(distance, (-1, len(init_hears), len(init_hears[0])))
            means = np.array(init_hears)
            the_distance = np.zeros_like(distance)
            for i in range(len(the_distance)):
                the_distance[i] = distance[i] - means
            return np.sum(np.abs(the_distance), axis=(2)) / nearst_sum

            '''
            distance = np.array(feature_list).repeat(2)
            distance = np.reshape(distance,(-1,2))
            means = np.array(init_hears)
            return np.abs(distance-means)/nearst_sum
            '''

        init_hears = [[0.01], [0.2]]
        feature_list = np.array(feature_list)
        last_nearst = np.zeros(len(feature_list))
        # float array holding each cluster's feature sum, used to normalise the distances
        last_nearst_sum = np.array([1.0, 1.0])
        while True:
            distance = getDistance(feature_list, init_hears, last_nearst_sum)
            current_nearst = np.argmin(distance, axis=1)

            if (last_nearst == current_nearst).all():
                break
            for i in range(len(init_hears)):
                median = np.median(feature_list[current_nearst == i], axis=0)
                if not np.isnan(median):
                    init_hears[i] = [median]
                last_nearst_sum[i] = np.sum(feature_list[current_nearst == i])
            last_nearst = current_nearst
        content_words_set = set()
        expectation_dict = dict()

        print("nearst", current_nearst)
        # assign every word of the page a vote and accumulate its net expectation
        content_tag_set = set()
        for node, nearst in zip(node_list, current_nearst):
            if nearst == 0:
                #print(node.parent.get_text())
                content_tag_set.add(node)
                node.nearst = nearst
                for word in getText(node, len(node.words_set)):
                    #for word in node.words_set:
                    if word in expectation_dict.keys():
                        expectation_dict[word] += 1
                    else:
                        expectation_dict[word] = 1
            else:
                node.nearst = nearst
                for word in getText(node, len(node.words_set)):
                    #for word in node.words_set:
                    if word in expectation_dict.keys():
                        expectation_dict[word] += -1
                    else:
                        expectation_dict[word] = -1
        for key in expectation_dict.keys():
            if expectation_dict[key] > 0:
                content_words_set.add(key)
        #print(content_words_set)
        return content_words_set, content_tag_set
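
    # The routine above is a seeded two-cluster k-means on the one-dimensional leaf
    # features: cluster 0 starts at 0.01 (taken as "content") and cluster 1 at 0.2
    # (taken as "boilerplate"), the centres are updated with the median of their
    # members, and the L1 distance to each centre is divided by that cluster's running
    # feature sum. Afterwards the word set of each leaf's nearest multi-leaf ancestor
    # votes +1 when the leaf lands in cluster 0 and -1 otherwise, and only words with a
    # positive net vote make it into content_words_set; e.g. a word collecting three +1
    # votes and one -1 vote ends at +2 and is kept (an illustrative, made-up tally).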

    def getMaxIoU(soup, content_words_set, content_tag_set):
        maxIoU = 0
        node_maxIoU = None
        prob_tag = 0.7
        for child in soup.find_all(recursive=True):
            IoU_1 = len(content_words_set & child.words_set) / (len(content_words_set | child.words_set) + 0.0001)
            IoU_2 = len(content_tag_set & child.tag_set) / (len(content_tag_set | child.tag_set) + 0.001)
            #print(IoU_1,IoU_2)
            IoU = IoU_1 * (1 - prob_tag) + IoU_2 * prob_tag
            if IoU >= maxIoU:
                maxIoU = IoU
                node_maxIoU = child
            '''
            if IoU>0.4:
                print(IoU)
                print(child.get_text())
            '''
        #print(maxIoU)
        return node_maxIoU
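
    # getMaxIoU scores every subtree by a blend of two Jaccard overlaps with the
    # clustering result: 30% on the word sets and 70% on the leaf-tag sets
    # (prob_tag = 0.7), with small constants guarding against empty unions. For
    # instance, a subtree with a word IoU of 0.6 and a tag IoU of 0.9 scores
    # 0.3*0.6 + 0.7*0.9 = 0.81; the best-scoring node (ties going to the later, deeper
    # node because of the >= comparison) is returned as the content root.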

    def removeNcontentTag(node):
        def getPercentOfNcontent(soup):
            leaf_count = 0
            NContent_leaf_count = 0
            for child in soup.find_all(recursive=True):
                if child.leaf_is:
                    if child.nearst == 1:
                        NContent_leaf_count += 1
                    leaf_count += 1
            if leaf_count > 0:
                return NContent_leaf_count / leaf_count, leaf_count
            else:
                return 0, leaf_count

        for child in node.find_all(recursive=False):
            if child.leaf_count > 1:
                percent, leaf_count = getPercentOfNcontent(child)
                if leaf_count > 2 and percent > 0.7:
                    #print(child.get_text(),leaf_count,percent)
                    child.decompose()
        return node

    def removeTag_byRule(soup, keyword_pattern="访问量|打印|浏览次数|上一篇|下一篇"):
        # keyword_pattern matches common boilerplate strings:
        # visit count / print / view count / previous post / next post
        words_len = 8
        for child in soup.find_all(recursive=True):
            if child.leaf_is:
                parent_text = child.parent.get_text()
                child_text = child.get_text()
                if re.search(keyword_pattern, parent_text) is not None:
                    if re.search(keyword_pattern, child_text) is None:
                        child.parent.decompose()
                    else:
                        if len(parent_text) - len(child_text) > words_len:
                            child.decompose()
                        else:
                            child.parent.decompose()
        # return the pruned tree, mirroring removeNcontentTag, so the call site
        # `node = removeTag_byRule(node)` keeps a usable node
        return soup

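    # The driver code below wires the steps together: fetch the page through
    # htmlDrawing.getSource, drop script/meta/link/style tags, annotate the DOM with
    # recursiveStatistic, turn the leaves into features with getInputOfKmeans, cluster
    # them with kmeans, pick the best-matching subtree with getMaxIoU, and prune
    # obviously non-content children with removeNcontentTag before returning the text.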
    soup = hd.getSource(url)
    #print(soup)
    stopTags = ["script", "meta", "link", "style"]
    delStopTags(soup, stopTags)
    #print(soup.get_text())

    # the regexes and the result dict below are prepared but not used by the current pipeline
    stopWords = ["[A-Z]", "[a-z]", "[0-9]"]
    stopWords_pattern = re.compile("|".join(stopWords))
    punctuationWords = "[;,。:、]"
    punctuationWords_pattern = re.compile(punctuationWords)
    a = time.time()
    recursiveStatistic(soup)
    result = dict()
    '''
    for child in soup.find_all(recursive=True):
        print(child.name,child.leaf_is,child.Ncontent_leaf_count,child.leaf_count)

    node_list,feature_list = getInputOfKmeans_context(soup)
    node = getMaxIoU(soup, kmeans_context(node_list,feature_list))
    '''
    node_list, feature_list = getInputOfKmeans(soup)
    word_set, tag_set = kmeans(node_list, feature_list)
    node = getMaxIoU(soup, word_set, tag_set)
    node = removeNcontentTag(node)
    #node = removeTag_byRule(node)
    if node:
        return node.get_text()
    else:
        return ""
    '''
    content_child = getContent_withWords(soup, soup.html.num_words, 1)
    #content_child = getContent_withPunctuations(soup,soup.html.num_punctuations,1)
    #content_child = getContent_withPunctuations(soup,soup.num_stopwords,1)

    list_childs_title,list_childs_time = getChildsFromTheBeginOfContent(content_child.words,content_child, 10)

    title_list,time_list = getTitleTimeList(soup, content_child)
    for item in list_childs_title:
        title_list.append(item)
    for item in list_childs_time:
        time_list.append(item)
    title_list.sort(key=lambda x:x[2]/x[1],reverse=True)
    title_list_max = []

    #keep the candidate sentences with the highest match rate
    if len(title_list)>0:
        max_match = title_list[0][2]/title_list[0][1]
        for i in range(len(title_list)):
            if title_list[i][2]/title_list[i][1]==max_match:
                title_list_max.append(title_list[i])
            else:
                break
    route_match = 0
    if len(title_list_max)>0:
        title = title_list_max[0][0]
        #pick the title whose DOM path is closest to the content
        for i in range(len(title_list_max)):
            match = 0
            for a,b in zip(title_list_max[i][3],content_child.code):
                if a==b:
                    match += 1
            if match > route_match:
                route_match = match
                title = title_list_max[i][0]
        result["title"] = title


    result["content"] = content_child.words
    #pick the time whose DOM path is closest to the content
    if len(time_list)>0:
        if len(time_list)==1:
            result["time"] = time_list[0][0]
        else:
            route_match = 0
            the_time = time_list[0][0]
            for i in range(len(time_list)):
                match = 0
                for a,b in zip(time_list[i][1],content_child.code):
                    if a == b:
                        match += 1
                if match>route_match:
                    route_match = match
                    the_time = time_list[i][0]
            result["time"] = the_time
    '''


import psycopg2
conn = psycopg2.connect(dbname="htmlExtract", user="postgres", password="postgres", host="192.168.2.101")
cursor = conn.cursor()


def getAccAndRecall(label_content, predict_content, whole_same=True):
    # compare the labelled content with the predicted content, either as whole strings
    # or as jieba word sets, and return (accuracy, recall)
    label_content = re.sub(r"\r|\n|\s", "", label_content)
    predict_content = re.sub(r"\r|\n|\s", "", predict_content)
    #print(label_content)
    #print(predict_content)
    if whole_same:
        if label_content == predict_content:
            return 1, 1
        else:
            return 0, 0
    else:
        content_set1 = set(jieba.cut(label_content))
        content_set2 = set(jieba.cut(predict_content))
        inter_counts = len(content_set1 & content_set2)
        label_counts = len(content_set1)
        predict_counts = len(content_set2)
        print("diff", (content_set1 | content_set2) - (content_set1 & content_set2))
        return inter_counts / (predict_counts + 0.001), inter_counts / (label_counts + 0.001)

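# Illustrative behaviour of getAccAndRecall with whole_same=False (made-up numbers):
# both texts are segmented by jieba and compared as word sets, so if the labelled text
# segments into three words and the prediction contains two of them, the function
# returns an accuracy of roughly 2/2 = 1.0 (a precision-style score over the predicted
# words) and a recall of roughly 2/3 = 0.67; the +0.001 terms only guard against empty
# word sets.
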
def getLabelData():
    # fetch up to 300 labelled pages (url + ground-truth content) from the database
    sql = " select url,content from label_html where content is not NULL and content!='' limit 300"
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows


def getLabelData_withUrl(url):
    # fetch the labelled row for a single url
    sql = " select url,content from label_html where url='" + url + "' "
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows


def test(rows):
    # run analysis over the labelled rows, report the average accuracy/recall and list
    # the urls whose accuracy falls below 0.9
    all_acc = 0
    all_recall = 0
    counts = 0
    notgood = []
    for row in rows:
        url = row[0]
        print("url:", url)
        content = row[1]
        content_predict = analysis(url)
        acc, recall = getAccAndRecall(content, content_predict)
        if acc < 0.9:
            notgood.append(url)
        counts += 1
        all_acc += acc
        all_recall += recall
    print("acc:%f,recall:%f" % (all_acc / counts, all_recall / counts))
    for url in notgood:
        print(url)


if __name__ == "__main__":

    url = "https://blog.csdn.net/studysinklc/article/details/78017330"
    result = analysis(url)
    print(result)
    #test(getLabelData_withUrl(url))
    # `browser` is not defined in this module (the browser session presumably lives in
    # htmlDrawing), so calling browser.close() here raises a NameError
    #browser.close()
    '''
    a = time.time()
    test(getLabelData())
    print("takes",time.time()-a)
    '''