import urllib.request
from bs4 import BeautifulSoup
import re
import time
import requests
import jieba
import numpy as np
import Utils
import htmlDrawing as hd
from Utils import findAllIndex
import gzip
import io


def analysis(url):
    '''
    @summary: analyse a web page and extract its main text, title and publish time;
              only meant for pages whose main text makes up most of the page text
    @param:
        url: the page to extract from
    @return: type:dict holding the main text, title and time
    '''
    def delStopTags(soup,stopTags):
        '''
        @summary: remove every stop tag from the DOM tree of the page
        @param:
            soup: DOM tree of the page
            stopTags: tags to remove
        @return: DOM tree of the page
        '''
        for item in stopTags:
            for tag in soup.find_all(item):
                tag.decompose()
        return soup

    def recursiveStatistic(soup):
        '''
        @summary: recursively annotate every tag with the number of leaves below it,
                  the number of "non-content" leaves (links, inputs, onclick handlers),
                  and the word set and tag set of its subtree
        @param:
            soup: DOM tree of the page
        @return: (leaf_count, Ncontent_leaf_count, words_set, tag_set) of the subtree
        '''
        def getParent(child):
            if len(child.parent.find_all(recursive=False))>1:
                return child.parent
            else:
                return getParent(child.parent)
        childs = soup.find_all(recursive=False)
        if len(childs)==0:
            # a leaf counts as non-content if it is (or sits inside) a link, an input,
            # or anything carrying an onclick handler
            if (soup.name in ["a"] and "href" in soup.attrs) or soup.name in ["input"] or "onclick" in soup.attrs or soup.parent.name in ["a"] or soup.parent.parent.name in ["a"]:
                soup.Ncontent_leaf_count = 1
            else:
                soup.Ncontent_leaf_count = 0
            soup.leaf_count = 1
            soup.leaf_is = True
            text = soup.get_text()
            soup.words_set = set(jieba.cut(text))
            soup.tag_set = set([soup])
            #print(soup.name,soup.parent.name,soup.parent.get_text())
            return soup.leaf_count,soup.Ncontent_leaf_count,soup.words_set,soup.tag_set
        else:
            leaf_count = 0
            Ncontent_leaf_count = 0
            words_set = set()
            tag_set = set()
            for child in childs:
                result = recursiveStatistic(child)
                leaf_count += result[0]
                Ncontent_leaf_count += result[1]
                words_set = words_set | set(jieba.cut(child.get_text()))
                tag_set = tag_set | result[3]
            soup.leaf_count = leaf_count
            soup.Ncontent_leaf_count = Ncontent_leaf_count
            soup.leaf_is = False
            soup.words_set = words_set
            soup.tag_set = tag_set
            return leaf_count,Ncontent_leaf_count,words_set,tag_set

    def getInputOfKmeans(soup):
        '''
        @summary: build one feature per leaf: the share of non-content leaves under its
                  nearest ancestor that holds more than one leaf, damped for plain-text leaves
        @return: (node_list, feature_list)
        '''
        def getPatent(child):
            if child.parent.leaf_count>1:
                return child.parent
            else:
                return getPatent(child.parent)
        prob_content = 0.5
        prob_Ncontent = 1
        node_list = []
        feature_list = []
        for child in soup.find_all(recursive=True):
            if child.leaf_is:
                parent = getPatent(child)
                if child.Ncontent_leaf_count>0:
                    feature = prob_Ncontent*parent.Ncontent_leaf_count/parent.leaf_count
                else:
                    feature = prob_content*parent.Ncontent_leaf_count/parent.leaf_count
                node_list.append(child)
                feature_list.append(feature)
        # context features (previous/current/next) are built but currently unused
        contextFeature_list = []
        for i in range(len(node_list)):
            last_1 = i - 1
            next_1 = (i+1)%len(node_list)
            contextFeature_list.append([feature_list[last_1],feature_list[i],feature_list[next_1]])
        return node_list,feature_list
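
    # Worked example of the leaf feature above (illustrative numbers only):
    # for a leaf whose nearest multi-leaf ancestor holds 10 leaves, 8 of them non-content,
    #   feature = prob_Ncontent * 8/10 = 0.8   if the leaf itself is non-content
    #   feature = prob_content  * 8/10 = 0.4   if the leaf is plain text
    # so plain text sitting inside link-heavy blocks scores higher than text in clean blocks,
    # but still lower than the links around it.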

    def kmeans(node_list,feature_list):
        '''
        @summary: two-cluster 1-D clustering of the leaf features (median centres,
                  distances scaled by each cluster's feature sum); cluster 0 gathers
                  content-like leaves, cluster 1 boilerplate-like leaves, and every
                  word votes +1/-1 according to its cluster
        @return: (content_words_set, content_tag_set)
        '''
        def getText(child,words_len):
            if child.parent.leaf_count>1:
                return child.parent.words_set
                #return set(jieba.cut(child.parent.get_text()))
            else:
                return getText(child.parent,words_len)
        def getDistance(feature_list,init_hears,nearst_sum):
            distance = np.repeat(np.array(feature_list),2,axis=0)
            distance = np.reshape(distance,(-1,len(init_hears),len(init_hears[0])))
            means = np.array(init_hears)
            the_distance = np.zeros_like(distance)
            for i in range(len(the_distance)):
                the_distance[i] = distance[i]-means
            # L1 distance to each centre, scaled by the feature sum of that cluster
            return np.sum(np.abs(the_distance),axis=(2))/nearst_sum
            '''
            distance = np.array(feature_list).repeat(2)
            distance = np.reshape(distance,(-1,2))
            means = np.array(init_hears)
            return np.abs(distance-means)/nearst_sum
            '''
        init_hears = [[0.01],[0.2]]
        feature_list = np.array(feature_list)
        last_nearst = np.zeros(len(feature_list))
        last_nearst_sum = np.array([1.0,1.0])  # float, so the per-cluster feature sums are not truncated
        while True:
            distance = getDistance(feature_list,init_hears,last_nearst_sum)
            current_nearst = np.argmin(distance,axis=1)
            if (last_nearst==current_nearst).all():
                break
            for i in range(len(init_hears)):
                median = np.median(feature_list[current_nearst==i],axis=0)
                if not np.isnan(median):
                    init_hears[i] = [median]
                last_nearst_sum[i] = np.sum(feature_list[current_nearst==i])
            last_nearst = current_nearst
        content_words_set = set()
        expectation_dict = dict()
        print("nearst",current_nearst)
        # let every word of the page vote and accumulate its total expectation
        content_tag_set = set()
        for node,nearst in zip(node_list,current_nearst):
            if nearst==0:
                #print(node.parent.get_text())
                content_tag_set.add(node)
                node.nearst = nearst
                for word in getText(node,len(node.words_set)):
                #for word in node.words_set:
                    if word in expectation_dict.keys():
                        expectation_dict[word] += 1
                    else:
                        expectation_dict[word] = 1
            else:
                node.nearst = nearst
                for word in getText(node,len(node.words_set)):
                #for word in node.words_set:
                    if word in expectation_dict.keys():
                        expectation_dict[word] += -1
                    else:
                        expectation_dict[word] = -1
        for key in expectation_dict.keys():
            if expectation_dict[key]>0:
                content_words_set.add(key)
        #print(content_words_set)
        return content_words_set,content_tag_set

    def getMaxIoU(soup,content_words_set,content_tag_set):
        '''
        @summary: pick the tag whose word set and tag set overlap best with the
                  content cluster, blending the two IoU scores with prob_tag
        '''
        maxIoU = 0
        node_maxIoU = None
        prob_tag = 0.7
        for child in soup.find_all(recursive=True):
            IoU_1 = len(content_words_set & child.words_set)/(len(content_words_set | child.words_set)+0.0001)
            IoU_2 = len(content_tag_set & child.tag_set)/(len(content_tag_set | child.tag_set)+0.001)
            #print(IoU_1,IoU_2)
            IoU = IoU_1*(1-prob_tag)+IoU_2*prob_tag
            if IoU>=maxIoU:
                maxIoU = IoU
                node_maxIoU = child
            '''
            if IoU>0.4:
                print(IoU)
                print(child.get_text())
            '''
        #print(maxIoU)
        return node_maxIoU

    def removeNcontentTag(node):
        '''
        @summary: drop direct children of the chosen node whose leaves are mostly boilerplate
        '''
        def getPercentOfNcontent(soup):
            leaf_count = 0
            NContent_leaf_count = 0
            for child in soup.find_all(recursive=True):
                if child.leaf_is:
                    if child.nearst==1:
                        NContent_leaf_count += 1
                    leaf_count += 1
            if leaf_count>0:
                return NContent_leaf_count/leaf_count,leaf_count
            else:
                return 0,leaf_count
        for child in node.find_all(recursive=False):
            if child.leaf_count>1:
                percent,leaf_count = getPercentOfNcontent(child)
                if leaf_count>2 and percent>0.7:
                    #print(child.get_text(),leaf_count,percent)
                    child.decompose()
        return node

    def removeTag_byRule(soup,keyword_pattern="访问量|打印|浏览次数|上一篇|下一篇"):
        '''
        @summary: rule-based cleanup of leaves whose parent text hits typical boilerplate
                  keywords (visit counter, print, view count, previous/next article links)
        '''
        words_len = 8
        for child in soup.find_all(recursive=True):
            if child.leaf_is:
                parent_text = child.parent.get_text()
                child_text = child.get_text()
                if re.search(keyword_pattern,parent_text) is not None:
                    if re.search(keyword_pattern,child_text) is None:
                        child.parent.decompose()
                    else:
                        if len(parent_text)-len(child_text)>words_len:
                            child.decompose()
                        else:
                            child.parent.decompose()
        return soup
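
    # Pipeline of the extraction below:
    #   1. fetch the rendered page and drop script/meta/link/style tags
    #   2. recursiveStatistic: annotate every tag with leaf counts, non-content leaf counts,
    #      word set and tag set
    #   3. getInputOfKmeans: one boilerplate-likelihood feature per leaf
    #   4. kmeans: split the leaves into a content cluster and a boilerplate cluster and
    #      vote for content words
    #   5. getMaxIoU: choose the tag that best matches the content cluster, with
    #      IoU = 0.3*word_IoU + 0.7*tag_IoU  (prob_tag = 0.7)
    #   6. removeNcontentTag: prune children of that tag that are mostly boilerplate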
    soup = hd.getSource(url)
    #print(soup)
    stopTags = ["script","meta","link","style"]
    delStopTags(soup, stopTags)
    #print(soup.get_text())
    stopWords = ["[A-Z]","[a-z]","[0-9]"]
    stopWords_pattern = re.compile("|".join(stopWords))
    punctuationWords = "[;,。:、]"
    punctuationWords_pattern = re.compile(punctuationWords)
    a = time.time()
    recursiveStatistic(soup)
    result = dict()
    '''
    for child in soup.find_all(recursive=True):
        print(child.name,child.leaf_is,child.Ncontent_leaf_count,child.leaf_count)
    node_list,feature_list = getInputOfKmeans_context(soup)
    node = getMaxIoU(soup, kmeans_context(node_list,feature_list))
    '''
    node_list,feature_list = getInputOfKmeans(soup)
    word_set,tag_set = kmeans(node_list,feature_list)
    node = getMaxIoU(soup,word_set,tag_set)
    node = removeNcontentTag(node)
    #node = removeTag_byRule(node)
    if node:
        return node.get_text()
    else:
        return ""
    '''
    content_child = getContent_withWords(soup, soup.html.num_words, 1)
    #content_child = getContent_withPunctuations(soup,soup.html.num_punctuations,1)
    #content_child = getContent_withPunctuations(soup,soup.num_stopwords,1)
    list_childs_title,list_childs_time = getChildsFromTheBeginOfContent(content_child.words,content_child, 10)
    title_list,time_list = getTitleTimeList(soup, content_child)
    for item in list_childs_title:
        title_list.append(item)
    for item in list_childs_time:
        time_list.append(item)
    title_list.sort(key=lambda x:x[2]/x[1],reverse=True)
    title_list_max = []
    # keep the sentences with the highest occurrence ratio
    if len(title_list)>0:
        max_match = title_list[0][2]/title_list[0][1]
        for i in range(len(title_list)):
            if title_list[i][2]/title_list[i][1]==max_match:
                title_list_max.append(title_list[i])
            else:
                break
    route_match = 0
    if len(title_list_max)>0:
        title = title_list_max[0][0]
        # pick the title closest to the main text
        for i in range(len(title_list_max)):
            match = 0
            for a,b in zip(title_list_max[i][3],content_child.code):
                if a==b:
                    match += 1
            if match > route_match:
                route_match = match
                title = title_list_max[i][0]
        result["title"] = title
    result["content"] = content_child.words
    # pick the time closest to the main text
    if len(time_list)>0:
        if len(time_list)==1:
            result["time"] = time_list[0][0]
        else:
            route_match = 0
            the_time = time_list[0][0]
            for i in range(len(time_list)):
                match = 0
                for a,b in zip(time_list[i][1],content_child.code):
                    if a == b:
                        match += 1
                if match>route_match:
                    route_match = match
                    the_time = time_list[i][0]
            result["time"] = the_time
    '''


import psycopg2

conn = psycopg2.connect(dbname="htmlExtract",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()


def getAccAndRecall(label_content,predict_content,whole_same=True):
    '''
    @summary: compare the labelled content with the predicted content, either as an
              exact match (whole_same=True) or as word-set precision/recall
    @return: (accuracy, recall)
    '''
    label_content = re.sub(r"\r|\n|\s","",label_content)
    predict_content = re.sub(r"\r|\n|\s","",predict_content)
    #print(label_content)
    #print(predict_content)
    if whole_same:
        if label_content==predict_content:
            return 1,1
        else:
            return 0,0
    else:
        content_set1 = set(jieba.cut(label_content))
        content_set2 = set(jieba.cut(predict_content))
        inter_counts = len(content_set1 & content_set2)
        label_counts = len(content_set1)
        predict_counts = len(content_set2)
        print("diff",(content_set1|content_set2)-(content_set1&content_set2))
        return inter_counts/(predict_counts+0.001),inter_counts/(label_counts+0.001)


def getLabelData():
    sql = " select url,content from label_html where content is not NULL and content!='' limit 300"
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows


def getLabelData_withUrl(url):
    # parameterised query instead of string concatenation
    sql = " select url,content from label_html where url=%s "
    cursor.execute(sql, (url,))
    rows = cursor.fetchall()
    return rows


def test(rows):
    all_acc = 0
    all_recall = 0
    counts = 0
    notgood = []
    for row in rows:
        url = row[0]
        print("url:",url)
        content = row[1]
        content_predict = analysis(url)
        acc,recall = getAccAndRecall(content, content_predict)
        if acc<0.9:
            notgood.append(url)
        counts += 1
        all_acc += acc
        all_recall += recall
    print("acc:%f,recall:%f"%(all_acc/counts,all_recall/counts))
    for url in notgood:
        print(url)


if __name__=="__main__":
    url = "https://blog.csdn.net/studysinklc/article/details/78017330"
    result = analysis(url)
    print(result)
    #test(getLabelData_withUrl(url))
    # 'browser' is not defined in this module; it is presumably the browser instance
    # opened by htmlDrawing
    browser.close()
    '''
    a = time.time()
    test(getLabelData())
    print("takes",time.time()-a)
    '''
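

# --- Illustrative sketch only (not used by the pipeline above) ---------------------
# A compact, standalone version of the two-cluster routine in analysis.kmeans, run on
# hypothetical toy numbers: features near 0 behave like plain-text leaves, features
# near 1 like link/button boilerplate. The function name and the example features are
# made up for illustration.
def _demo_weighted_kmeans(features=(0.05, 0.02, 0.9, 0.85, 0.08, 1.0)):
    features = np.array(features, dtype=float)
    centers = np.array([0.01, 0.2])        # same initial centres as init_hears in kmeans
    cluster_sums = np.array([1.0, 1.0])    # per-cluster feature sums used to scale distances
    assignment = np.zeros(len(features))
    while True:
        # |feature - centre| scaled by the feature sum of that centre's cluster
        distance = np.abs(features[:, None] - centers[None, :]) / cluster_sums
        new_assignment = np.argmin(distance, axis=1)
        if (new_assignment == assignment).all():
            break
        for i in range(len(centers)):
            members = features[new_assignment == i]
            if len(members) > 0:           # keep the old centre if the cluster went empty
                centers[i] = np.median(members)
                cluster_sums[i] = np.sum(members)
        assignment = new_assignment
    # cluster 0 ~ content-like leaves, cluster 1 ~ boilerplate-like leaves
    return assignment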