import urllib.request
from bs4 import BeautifulSoup
import re
import time
import requests
import jieba
import numpy as np
import Utils
import htmlDrawing as hd
from Utils import findAllIndex
import gzip
import io


def analysis(url):
    '''
    @summary: analyze a web page and extract its main content; only intended for pages
              where the main content makes up most of the page text (the title/time
              extraction of an earlier version is kept in the commented-out block below)
    @param:
        url: the web page to extract from
    @return: the extracted main-content text as a string, or an empty string if no
             content node is found
    '''

    def delStopTags(soup, stopTags):
        '''
        @summary: remove every occurrence of the stop tags from the page DOM tree
        @param:
            soup: DOM tree of the page
            stopTags: list of tag names to drop
        @return: the DOM tree with the stop tags removed
        '''
        for item in stopTags:
            for tag in soup.find_all(item):
                tag.decompose()
        return soup

    def recursiveStatistic(soup):
        '''
        @summary: recursively annotate every tag with the statistics used by the later
                  steps: leaf_count, Ncontent_leaf_count (leaves that look like
                  navigation, i.e. links, inputs or nodes with onclick handlers),
                  words_set (jieba tokens of the subtree text), tag_set (the leaf tags
                  of the subtree) and leaf_is
        @param:
            soup: DOM tree of the page
        @return: (leaf_count, Ncontent_leaf_count, words_set, tag_set) of the given node
        '''
        def getParent(child):
            if len(child.parent.find_all(recursive=False)) > 1:
                return child.parent
            else:
                return getParent(child.parent)

        childs = soup.find_all(recursive=False)
        if len(childs) == 0:
            # a leaf counts as non-content when it is a link, an input, carries an
            # onclick handler, or sits directly under a link
            if (soup.name in ["a"] and "href" in soup.attrs) or soup.name in ["input"] or "onclick" in soup.attrs or soup.parent.name in ["a"] or soup.parent.parent.name in ["a"]:
                soup.Ncontent_leaf_count = 1
            else:
                soup.Ncontent_leaf_count = 0
            soup.leaf_count = 1
            soup.leaf_is = True
            text = soup.get_text()
            soup.words_set = set(jieba.cut(text))
            soup.tag_set = set([soup])
            #print(soup.name,soup.parent.name,soup.parent.get_text())
            return soup.leaf_count, soup.Ncontent_leaf_count, soup.words_set, soup.tag_set
        else:
            leaf_count = 0
            Ncontent_leaf_count = 0
            words_set = set()
            tag_set = set()
            for child in childs:
                result = recursiveStatistic(child)
                leaf_count += result[0]
                Ncontent_leaf_count += result[1]
                words_set = words_set | set(jieba.cut(child.get_text()))
                tag_set = tag_set | result[3]
            soup.leaf_count = leaf_count
            soup.Ncontent_leaf_count = Ncontent_leaf_count
            soup.leaf_is = False
            soup.words_set = words_set
            soup.tag_set = tag_set
            return leaf_count, Ncontent_leaf_count, words_set, tag_set
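
    # Illustrative example (assumed markup, not taken from a real page): for a fragment
    # like <div><p>hello</p><a href="#">more</a></div> sitting somewhere inside a page,
    # recursiveStatistic would annotate the <div> with leaf_count=2 and
    # Ncontent_leaf_count=1, because the <a href> leaf counts as navigation while the
    # plain <p> leaf does not.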

    def getInputOfKmeans(soup):

        def getPatent(child):
            if child.parent.leaf_count > 1:
                return child.parent
            else:
                return getPatent(child.parent)

        prob_content = 0.5
        prob_Ncontent = 1
        node_list = []
        feature_list = []

        for child in soup.find_all(recursive=True):
            if child.leaf_is:
                parent = getPatent(child)
                if child.Ncontent_leaf_count > 0:
                    feature = prob_Ncontent * parent.Ncontent_leaf_count / parent.leaf_count
                else:
                    feature = prob_content * parent.Ncontent_leaf_count / parent.leaf_count
                node_list.append(child)
                feature_list.append(feature)

        contextFeature_list = []
        for i in range(len(node_list)):
            last_1 = i - 1
            next_1 = (i + 1) % len(node_list)
            contextFeature_list.append([feature_list[last_1], feature_list[i], feature_list[next_1]])
        return node_list, feature_list
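
    # Note on the feature built by getInputOfKmeans (a sketch of the intent inferred
    # from the code): every leaf is scored by the non-content ratio of its nearest
    # ancestor owning more than one leaf, weighted by prob_Ncontent=1 for leaves that
    # themselves look like navigation and prob_content=0.5 for plain-text leaves.
    # For example, under an ancestor with 10 leaves of which 8 are links, a link leaf
    # scores 1 * 8/10 = 0.8 while a text leaf scores 0.5 * 8/10 = 0.4, so boilerplate
    # regions drift towards larger feature values. contextFeature_list is assembled but
    # not returned; only the per-leaf features reach the clustering step.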

    def kmeans(node_list, feature_list):

        def getText(child, words_len):
            if child.parent.leaf_count > 1:
                return child.parent.words_set
                #return set(jieba.cut(child.parent.get_text()))
            else:
                return getText(child.parent, words_len)

        def getDistance(feature_list, init_hears, nearst_sum):

            distance = np.repeat(np.array(feature_list), 2, axis=0)
            distance = np.reshape(distance, (-1, len(init_hears), len(init_hears[0])))
            means = np.array(init_hears)
            the_distance = np.zeros_like(distance)
            for i in range(len(the_distance)):
                the_distance[i] = distance[i] - means
            return np.sum(np.abs(the_distance), axis=(2)) / nearst_sum

            '''
            distance = np.array(feature_list).repeat(2)
            distance = np.reshape(distance,(-1,2))
            means = np.array(init_hears)
            return np.abs(distance-means)/nearst_sum
            '''

        init_hears = [[0.01], [0.2]]
        feature_list = np.array(feature_list)
        last_nearst = np.zeros(len(feature_list))
        # float array holding each cluster's feature sum, used to normalise the distances
        last_nearst_sum = np.array([1.0, 1.0])
        while True:
            distance = getDistance(feature_list, init_hears, last_nearst_sum)
            current_nearst = np.argmin(distance, axis=1)

            if (last_nearst == current_nearst).all():
                break
            for i in range(len(init_hears)):
                median = np.median(feature_list[current_nearst == i], axis=0)
                if not np.isnan(median):
                    init_hears[i] = [median]
                last_nearst_sum[i] = np.sum(feature_list[current_nearst == i])
            last_nearst = current_nearst
        content_words_set = set()
        expectation_dict = dict()

        print("nearst", current_nearst)
        # assign every word of the page a vote and accumulate its net expectation
        content_tag_set = set()
        for node, nearst in zip(node_list, current_nearst):
            if nearst == 0:
                #print(node.parent.get_text())
                content_tag_set.add(node)
                node.nearst = nearst
                for word in getText(node, len(node.words_set)):
                    #for word in node.words_set:
                    if word in expectation_dict.keys():
                        expectation_dict[word] += 1
                    else:
                        expectation_dict[word] = 1
            else:
                node.nearst = nearst
                for word in getText(node, len(node.words_set)):
                    #for word in node.words_set:
                    if word in expectation_dict.keys():
                        expectation_dict[word] += -1
                    else:
                        expectation_dict[word] = -1
        for key in expectation_dict.keys():
            if expectation_dict[key] > 0:
                content_words_set.add(key)
        #print(content_words_set)
        return content_words_set, content_tag_set
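
    # The routine above is a seeded two-cluster k-means on the one-dimensional leaf
    # features: cluster 0 starts at 0.01 (taken as "content") and cluster 1 at 0.2
    # (taken as "boilerplate"), the centres are updated with the median of their
    # members, and the L1 distance to each centre is divided by that cluster's running
    # feature sum. Afterwards the word set of each leaf's nearest multi-leaf ancestor
    # votes +1 when the leaf lands in cluster 0 and -1 otherwise, and only words with a
    # positive net vote make it into content_words_set; e.g. a word collecting three +1
    # votes and one -1 vote ends at +2 and is kept (an illustrative, made-up tally).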

    def getMaxIoU(soup, content_words_set, content_tag_set):
        maxIoU = 0
        node_maxIoU = None
        prob_tag = 0.7
        for child in soup.find_all(recursive=True):
            IoU_1 = len(content_words_set & child.words_set) / (len(content_words_set | child.words_set) + 0.0001)
            IoU_2 = len(content_tag_set & child.tag_set) / (len(content_tag_set | child.tag_set) + 0.001)
            #print(IoU_1,IoU_2)
            IoU = IoU_1 * (1 - prob_tag) + IoU_2 * prob_tag
            if IoU >= maxIoU:
                maxIoU = IoU
                node_maxIoU = child
            '''
            if IoU>0.4:
                print(IoU)
                print(child.get_text())
            '''
        #print(maxIoU)
        return node_maxIoU
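
    # getMaxIoU scores every subtree by a blend of two Jaccard overlaps with the
    # clustering result: 30% on the word sets and 70% on the leaf-tag sets
    # (prob_tag = 0.7), with small constants guarding against empty unions. For
    # instance, a subtree with a word IoU of 0.6 and a tag IoU of 0.9 scores
    # 0.3*0.6 + 0.7*0.9 = 0.81; the best-scoring node (ties going to the later, deeper
    # node because of the >= comparison) is returned as the content root.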

    def removeNcontentTag(node):
        def getPercentOfNcontent(soup):
            leaf_count = 0
            NContent_leaf_count = 0
            for child in soup.find_all(recursive=True):
                if child.leaf_is:
                    if child.nearst == 1:
                        NContent_leaf_count += 1
                    leaf_count += 1
            if leaf_count > 0:
                return NContent_leaf_count / leaf_count, leaf_count
            else:
                return 0, leaf_count

        for child in node.find_all(recursive=False):
            if child.leaf_count > 1:
                percent, leaf_count = getPercentOfNcontent(child)
                if leaf_count > 2 and percent > 0.7:
                    #print(child.get_text(),leaf_count,percent)
                    child.decompose()
        return node

    def removeTag_byRule(soup, keyword_pattern="访问量|打印|浏览次数|上一篇|下一篇"):
        # keyword_pattern matches common boilerplate strings:
        # visit count / print / view count / previous post / next post
        words_len = 8
        for child in soup.find_all(recursive=True):
            if child.leaf_is:
                parent_text = child.parent.get_text()
                child_text = child.get_text()
                if re.search(keyword_pattern, parent_text) is not None:
                    if re.search(keyword_pattern, child_text) is None:
                        child.parent.decompose()
                    else:
                        if len(parent_text) - len(child_text) > words_len:
                            child.decompose()
                        else:
                            child.parent.decompose()
        # return the pruned tree, mirroring removeNcontentTag, so the call site
        # `node = removeTag_byRule(node)` keeps a usable node
        return soup

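    # The driver code below wires the steps together: fetch the page through
    # htmlDrawing.getSource, drop script/meta/link/style tags, annotate the DOM with
    # recursiveStatistic, turn the leaves into features with getInputOfKmeans, cluster
    # them with kmeans, pick the best-matching subtree with getMaxIoU, and prune
    # obviously non-content children with removeNcontentTag before returning the text.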
    soup = hd.getSource(url)
    #print(soup)
    stopTags = ["script", "meta", "link", "style"]
    delStopTags(soup, stopTags)
    #print(soup.get_text())

    # the regexes and the result dict below are prepared but not used by the current pipeline
    stopWords = ["[A-Z]", "[a-z]", "[0-9]"]
    stopWords_pattern = re.compile("|".join(stopWords))
    punctuationWords = "[;,。:、]"
    punctuationWords_pattern = re.compile(punctuationWords)
    a = time.time()
    recursiveStatistic(soup)
    result = dict()
    '''
    for child in soup.find_all(recursive=True):
        print(child.name,child.leaf_is,child.Ncontent_leaf_count,child.leaf_count)

    node_list,feature_list = getInputOfKmeans_context(soup)
    node = getMaxIoU(soup, kmeans_context(node_list,feature_list))
    '''
    node_list, feature_list = getInputOfKmeans(soup)
    word_set, tag_set = kmeans(node_list, feature_list)
    node = getMaxIoU(soup, word_set, tag_set)
    node = removeNcontentTag(node)
    #node = removeTag_byRule(node)
    if node:
        return node.get_text()
    else:
        return ""
    '''
    content_child = getContent_withWords(soup, soup.html.num_words, 1)
    #content_child = getContent_withPunctuations(soup,soup.html.num_punctuations,1)
    #content_child = getContent_withPunctuations(soup,soup.num_stopwords,1)

    list_childs_title,list_childs_time = getChildsFromTheBeginOfContent(content_child.words,content_child, 10)

    title_list,time_list = getTitleTimeList(soup, content_child)
    for item in list_childs_title:
        title_list.append(item)
    for item in list_childs_time:
        time_list.append(item)
    title_list.sort(key=lambda x:x[2]/x[1],reverse=True)
    title_list_max = []

    #keep the candidate sentences with the highest match rate
    if len(title_list)>0:
        max_match = title_list[0][2]/title_list[0][1]
        for i in range(len(title_list)):
            if title_list[i][2]/title_list[i][1]==max_match:
                title_list_max.append(title_list[i])
            else:
                break
    route_match = 0
    if len(title_list_max)>0:
        title = title_list_max[0][0]
        #pick the title whose DOM path is closest to the content
        for i in range(len(title_list_max)):
            match = 0
            for a,b in zip(title_list_max[i][3],content_child.code):
                if a==b:
                    match += 1
            if match > route_match:
                route_match = match
                title = title_list_max[i][0]
        result["title"] = title


    result["content"] = content_child.words
    #pick the time whose DOM path is closest to the content
    if len(time_list)>0:
        if len(time_list)==1:
            result["time"] = time_list[0][0]
        else:
            route_match = 0
            the_time = time_list[0][0]
            for i in range(len(time_list)):
                match = 0
                for a,b in zip(time_list[i][1],content_child.code):
                    if a == b:
                        match += 1
                if match>route_match:
                    route_match = match
                    the_time = time_list[i][0]
            result["time"] = the_time
    '''


import psycopg2
conn = psycopg2.connect(dbname="htmlExtract", user="postgres", password="postgres", host="192.168.2.101")
cursor = conn.cursor()


def getAccAndRecall(label_content, predict_content, whole_same=True):
    # compare the labelled content with the predicted content, either as whole strings
    # or as jieba word sets, and return (accuracy, recall)
    label_content = re.sub(r"\r|\n|\s", "", label_content)
    predict_content = re.sub(r"\r|\n|\s", "", predict_content)
    #print(label_content)
    #print(predict_content)
    if whole_same:
        if label_content == predict_content:
            return 1, 1
        else:
            return 0, 0
    else:
        content_set1 = set(jieba.cut(label_content))
        content_set2 = set(jieba.cut(predict_content))
        inter_counts = len(content_set1 & content_set2)
        label_counts = len(content_set1)
        predict_counts = len(content_set2)
        print("diff", (content_set1 | content_set2) - (content_set1 & content_set2))
        return inter_counts / (predict_counts + 0.001), inter_counts / (label_counts + 0.001)

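# Illustrative behaviour of getAccAndRecall with whole_same=False (made-up numbers):
# both texts are segmented by jieba and compared as word sets, so if the labelled text
# segments into three words and the prediction contains two of them, the function
# returns an accuracy of roughly 2/2 = 1.0 (a precision-style score over the predicted
# words) and a recall of roughly 2/3 = 0.67; the +0.001 terms only guard against empty
# word sets.
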
def getLabelData():
    # fetch up to 300 labelled pages (url + ground-truth content) from the database
    sql = " select url,content from label_html where content is not NULL and content!='' limit 300"
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows


def getLabelData_withUrl(url):
    # fetch the labelled row for a single url
    sql = " select url,content from label_html where url='" + url + "' "
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows


def test(rows):
    # run analysis over the labelled rows, report the average accuracy/recall and list
    # the urls whose accuracy falls below 0.9
    all_acc = 0
    all_recall = 0
    counts = 0
    notgood = []
    for row in rows:
        url = row[0]
        print("url:", url)
        content = row[1]
        content_predict = analysis(url)
        acc, recall = getAccAndRecall(content, content_predict)
        if acc < 0.9:
            notgood.append(url)
        counts += 1
        all_acc += acc
        all_recall += recall
    print("acc:%f,recall:%f" % (all_acc / counts, all_recall / counts))
    for url in notgood:
        print(url)


if __name__ == "__main__":

    url = "https://blog.csdn.net/studysinklc/article/details/78017330"
    result = analysis(url)
    print(result)
    #test(getLabelData_withUrl(url))
    # `browser` is not defined in this module (the browser session presumably lives in
    # htmlDrawing), so calling browser.close() here raises a NameError
    #browser.close()
    '''
    a = time.time()
    test(getLabelData())
    print("takes",time.time()-a)
    '''