'''
Repository: luojiehua / ContentExtract

Created on 2019-08-08

@author: User
'''
import re
import time
from keras.preprocessing.sequence import pad_sequences

# JavaScript source that is run inside the loaded page (the Python code below
# passes ``scripts_common + scripts_title`` to ``get_js_rs``) to collect
# title candidates.
#
#   statisticIframe(nodes)
#       Counts <a>/<input>/<select> elements and elements with an onclick
#       handler, descending into iframe documents. It is only referenced from
#       a commented-out part of statistic(), so it is currently unused.
#   statistic(node, deepth)
#       Recursively annotates every node with counts_communicateTags,
#       counts_text, counts_lines, counts_punctuations and deepth.
#   recursive_candidate_title(node, list_candidate_title, maxWidth, maxHeight)
#       Walks the DOM comparing computed font sizes; for an element whose
#       subtree is not uniform, every uniform child wider than 100px with a
#       positive top offset is pushed as
#       [features, innerHTML, getListXpath(...)], where features is
#       [maxWidth, maxHeight, offsetLeft, offsetTop, offsetWidth,
#        offsetHeight, fontSize, fontWeight, counts_text, counts_lines,
#        counts_punctuations, counts_communicateTags].
#
# The script returns list_candidate_title.
# NOTE(review): getOffsetTop, getOffsetLeft and getListXpath are not defined
# in this string - presumably they come from scripts_common; confirm.
scripts_title = '''
        
function statisticIframe(nodes){
    var counts_communicateTags = 0;
    for(var i=0;i<nodes.length;i++){
        child = nodes[i]
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                counts_communicateTags += 1;
            }
            if(child.tagName.toLowerCase()=="iframe"){
                if(child.contentWindow.document!=null){
                    counts_communicateTags += statisticIframe(child.contentWindow.document.all);
                }
            }
        }
    }
    return counts_communicateTags;
}
function statistic(node,deepth){
    if(node.childNodes==null){
        node.counts_communicateTags = 0;
        return node.counts_communicateTags;
    }
    node.counts_communicateTags = 0;
    for(var i=0;i<node.childNodes.length;i++){
        child = node.childNodes[i];
        //删除标签
        /*
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
                node.removeChild(child);
                continue;
            }
        }
        */
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                node.counts_communicateTags += 1;
            }
        }
        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
            node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
        }else{
            node.counts_communicateTags += statistic(child,deepth+1);
        }*/
        node.counts_communicateTags += statistic(child,deepth+1);                
    }
    var innertext = node.innerText;
    if(innertext){
        var text = innertext.replace(/\s/g,'');
        //var text = innertext;
        node.counts_text = text.length;
        var punc = text.match(/；|，|。|：|、/g);
        var lines = innertext.match(/.{10}\\n/g);
        if(lines){
            node.counts_lines = lines.length;
        }else{
            node.counts_lines = 0;
        }
        if(punc){
            node['counts_punctuations']= punc.length;
        }else{
            node.counts_punctuations = 0;
        }
        
    }else{
        node.counts_lines = 0;
        node.counts_text = 0;
        node.counts_punctuations=0;
    }
    node.deepth = deepth;
    return node.counts_communicateTags;
}
function recursive_candidate_title(node,list_candidate_title,maxWidth,maxHeight){
    if(node==document){
        var _flag = true;
        var list_node_true = new Array();
        for(var i=0;i<node.childNodes.length;i++){
            child = node.childNodes[i];
            if(child.offsetWidth>maxWidth){
                maxWidth = child.offsetWidth;
            }
            if(child.offsetHeight>maxHeight){
                maxHeight = child.offsetHeight;
            }
        }
        for(var i=0;i<node.childNodes.length;i++){
            var child = node.childNodes[i];
            var _result = recursive_candidate_title(child,list_candidate_title,maxWidth,maxHeight);
            if(_result!=null){
                if(!_result[1]){
                    _flag = false;
                }else{
                    list_node_true.push(child);
                }
            }

        }
        
        if(_flag){
            
        }else{
            for(var i=0;i<list_node_true.length;i++){
                list_candidate_title.push([node,node.innerHTML]);
            }
            
        }
    }else{
        if(node.nodeType!=1){
            return null;
        }
        if(node.innerText==null || node.innerText==""){
            return null;
        }
        var _node_fontSize = window.getComputedStyle(node).fontSize;
        if(node.childNodes==null){
            return [_node_fontSize,true];
        }else{
            for(var i=0;i<node.childNodes.length;i++){
                child = node.childNodes[i];
                if(child.offsetWidth>maxWidth){
                    maxWidth = child.offsetWidth;
                }
                if(child.offsetHeight>maxHeight){
                    maxHeight = child.offsetHeight;
                }
            }
        
            var _flag = true;
            var list_node_true = new Array();
            for(var i=0;i<node.childNodes.length;i++){
                var child = node.childNodes[i];
                var _result = recursive_candidate_title(child,list_candidate_title,maxWidth,maxHeight);
                if(_result!=null){
                    if(!_result[1]){
                        _flag = false;
                    }else{
                        list_node_true.push(child);
                    }
                    if(_node_fontSize!=_result[0]){
                        _flag = false;
                    }
                }

            }
            
            if(_flag){
                return [_node_fontSize,true];
            }else{
                for(var i=0;i<list_node_true.length;i++){
                    var child_true = list_node_true[i]
                    if(child_true.offsetWidth>100 && getOffsetTop(child_true)>0){
                        var _fontWeight = window.getComputedStyle(child_true).fontWeight
                        var _weight = 400;
                        if(_fontWeight=="normal"){
                            _weight = 400;
                        }else if(_fontWeight=="bold"){
                            _weight = 700;
                        }else if(_fontWeight=="lighter"){
                            _weight = 200;
                        }else if(_fontWeight=="bolder"){
                            _weight = 600;
                        }else{
                            _weight = parseInt(_fontWeight)
                        }
                        var _fontSize = parseInt(window.getComputedStyle(child_true).fontSize.match(/\d+/)[0])    
                            
                        list_candidate_title.push([[maxWidth,maxHeight,getOffsetLeft(child_true),getOffsetTop(child_true),child_true.offsetWidth,child_true.offsetHeight,_fontSize,_weight,child_true.counts_text,child_true.counts_lines,child_true.counts_punctuations,child_true.counts_communicateTags],child_true.innerHTML,getListXpath(child_true,new Array())]);
                    }
                    
                }
                return [_node_fontSize,false];
            }
        }
    }
}


var list_candidate_title = new Array();
statistic(document,1);
recursive_candidate_title(document,list_candidate_title,0,0);
return list_candidate_title;
'''

import module.htmlDrawing as hd
import numpy as np
import math
from module.Utils import *

def dealWithScriptOut(data,sort_index=3):
    """Convert the candidate list returned by the page script into model input.

    Every entry of ``data`` is indexed as ``[features, inner_html, xpath]``.
    ``data`` is modified in place: feature values that are ``None`` or NaN
    are replaced by -1, and the entries are sorted by
    ``features[sort_index]``.

    Returns ``None`` for an empty ``data``; otherwise the tuple
    ``(input_x, list_inner, list_xpath, list_top)`` where ``input_x`` is the
    feature matrix divided column-wise by a scale vector and ``list_top``
    holds ``features[3]`` of every entry.
    """
    # Replace values the model cannot consume.
    for entry in data:
        features = entry[0]
        for pos, value in enumerate(features):
            if value is None or math.isnan(value):
                features[pos] = -1

    data.sort(key=lambda entry: entry[0][sort_index])
    if not data:
        return None

    list_input = [entry[0] for entry in data]
    list_inner = [entry[1] for entry in data]
    list_xpath = [entry[2] for entry in data]
    list_top = [features[3] for features in list_input]

    # Column maxima, with non-positive maxima clamped to 1 so they can be
    # used as divisors.
    clamped = np.array([m if m>0 else 1 for m in np.max(list_input,axis=0)])
    # Scale vector: the maxima of the first two columns repeated three times
    # (six columns), followed by fixed constants for the remaining six.
    scale = np.array(list(clamped[0:2])*3+[16,400,20,20,20,20])
    input_x = np.array(list_input)/scale
    return input_x,list_inner,list_xpath,list_top

def getInput_byJS(browser,url):
    """Run the title script in ``browser`` and package the result for the model.

    The body never acquires, loads or releases a driver, so ``browser`` must
    be supplied ready to use by the caller. ``url`` is not read here.

    Returns one of:
      * ``(True, [[input_x with a leading batch axis], list_inner,
        list_xpath, list_height])`` when candidates were found;
      * ``(False, "")`` when the script yielded no candidates;
      * ``(None, err_msg)`` when anything raised; ``err_msg`` is
        ``"#iframe#"`` if the exception text contains "frame", else ``""``.
    """
    try:
        script_out = get_js_rs(browser, scripts_common+scripts_title)
        prepared = dealWithScriptOut(script_out)
        if prepared is not None:
            input_x,list_inner,list_xpath,list_height = prepared
            return True,[[np.expand_dims(input_x,0)],list_inner,list_xpath,list_height]
        return False,""
    except Exception as e:
        # Best effort: log the failure and report it through the return value.
        error(str(e))
        if re.search("frame",str(e)) is None:
            return None,""
        return None,"#iframe#"

def encodeInput_byJS(url,targethtml):
    """Load ``url`` in a browser and build one labelled training sample.

    A driver is taken from ``hd.getdriver()`` and always handed back with
    ``hd.adddriver()`` once it has been obtained. The title script is run on
    the page, and each candidate's innerHTML is compared (all whitespace
    removed) with the content between the first ``>`` and the last ``<`` of
    ``targethtml``.

    Parameters:
        url: page to load.
        targethtml: markup of the known title element.

    Returns:
        ``(input_x, labels)`` where ``labels`` is a 0/1 array with exactly one
        1, or ``None`` when the page has no candidates, when not exactly one
        candidate matches, when ``targethtml`` has no inner content, or when
        any step raises (the exception is printed, not propagated).
    """
    def strip_whitespace(markup):
        # Raw string: "\s" in a normal string literal is an invalid escape.
        return re.sub(r"[\r\n\s]","",str(markup))

    # Bound before the try so that ``finally`` is safe even when acquiring
    # the driver itself fails.
    browser = None
    try:
        # The target is the same for every candidate, so normalise it once,
        # before paying for a page load.
        found = re.findall(re.compile(">(.*)<"), strip_whitespace(targethtml))
        if not found:
            print("no inner html found in target:",str(targethtml)[0:40])
            return None
        target_inner = found[0]

        browser = hd.getdriver()
        debug("get driver")
        start = time.time()
        hd.loadPage(browser, url)
        print("get",time.time()-start)
        browser.maximize_window()
        start = time.time()

        data = get_js_rs(browser, scripts_common+scripts_title)
        deal_data = dealWithScriptOut(data)
        if deal_data is None:
            # No candidates on this page; nothing to label.
            return None
        input_x,list_inner,_,_ = deal_data
        list_label = [1 if strip_whitespace(inner)==target_inner else 0
                      for inner in list_inner]
        print("cost",time.time()-start)
        # A sample is only usable when exactly one candidate is the title.
        if len(list_label)>0 and np.sum(list_label)==1:
            return input_x,np.array(list_label)
        return None
    except Exception as e:
        # Best effort: a failing page must not stop a batch run.
        print(e)
        return None
    finally:
        if browser is not None:
            hd.adddriver(browser)
            debug("release driver")

def dumpLinkTitle():
    """Dump ``(page_link, page_title)`` rows from Oracle into ``Link_Title.pk``.

    Rows are fetched ten at a time from DETAIL_CONTENT_HTML. A row is kept
    only if ``save(row, "1.pk")`` succeeds, which filters out rows that
    cannot be serialised. A failed fetch is printed and retried, but the loop
    stops after several consecutive failures instead of spinning forever.
    Whatever was collected is saved, and the cursor and connection are always
    closed.
    """
    def trytosave(d):
        # Probe whether a single row can be saved at all.
        try:
            save(d,"1.pk")
            return 1
        except Exception:
            return 0
    import cx_Oracle as cx_Oracle
    # NOTE(review): credentials and host are hard-coded in the connect
    # string; consider moving them to configuration.
    conn=cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')    # connect to the database
    max_consecutive_failures = 5
    try:
        cursor=conn.cursor()
        try:
            sql = " select page_link,page_title from DETAIL_CONTENT_HTML where page_link is not null and page_type=1 and page_title like '<%' and page_title not like '<a%' "
            cursor.execute(sql)
            data = []
            failures = 0
            while True:
                try:
                    rows = cursor.fetchmany(10)
                except Exception as e:
                    print(e)
                    failures += 1
                    if failures>=max_consecutive_failures:
                        # A persistent error would otherwise loop forever.
                        break
                    continue
                failures = 0
                if not rows:
                    break
                data.extend(row for row in rows if trytosave(row)==1)
            # Saved while the connection is still open, as before.
            save(data,"Link_Title.pk")
        finally:
            cursor.close()
    finally:
        conn.close()
    
def getAllData():
    """Encode every (link, title) row of ``Link_Title.pk`` into a sample.

    For each row a progress line ``<labelled>/<processed>/<total> <link>`` is
    printed and ``encodeInput_byJS(link, title)`` is called. Successful
    results are collected as ``[x, y, link]``. On every 100th row the samples
    gathered so far are checkpointed to ``temp_data.pk``, before that row is
    encoded. The full list is saved to ``data_done.pk`` and returned.
    """
    temp_file ="temp_data.pk"
    all_data = load("Link_Title.pk")
    data_len = len(all_data)
    data = []
    labelled = 0
    for count, row in enumerate(all_data, 1):
        link = row[0]
        print("{}/{}/{}".format(labelled,count,data_len),link)

        if count%100==0:
            save(data,temp_file)
        encode = encodeInput_byJS(link, row[1])
        if not encode:
            print("None")
            continue
        labelled += 1
        x,y = encode
        data.append([x,y,link])
    save(data,"data_done.pk")
    return data

def filter():
    """Keep the short samples of ``temp_data.pk`` and save them.

    Loads the checkpoint, prints its first sample, sorts the samples by
    their third element, and keeps those whose first element has a length
    below 100. Prints the largest such length, the total count and the kept
    count, then writes the kept samples to ``source_12input.pk``.

    NOTE(review): the name shadows the built-in ``filter`` for the rest of
    this module; it is kept because callers may import it.
    """
    data = load("temp_data.pk")
    print(data[0])
    data.sort(key = lambda sample:sample[2])
    list_length = [len(sample[0]) for sample in data]
    new_data = [sample for sample in data if len(sample[0])<100]
    print(max(list_length))
    print(len(data))
    print(len(new_data))
    save(new_data,"source_12input.pk")
    
def paddinig(all_data,pad=True):
    """Stack samples into arrays, optionally padding them to a common length.

    Parameters:
        all_data: sequence of samples indexed as ``[input_x, label_y, url]``.
        pad: when true, ``input_x`` and ``label_y`` are padded or truncated
            at the end to the longest ``label_y`` in ``all_data``
            (features with 0, labels with -1). When false, ``input_x`` is
            wrapped in a one-element list and nothing is padded.

    Returns:
        ``[np.array(list_x), np.array(list_y), list_url]``. Labels are
        one-hot encoded over two classes; a padding label of -1 becomes
        ``[0, 0]``. An empty ``all_data`` yields two empty arrays and an
        empty url list.
    """
    def one_hot(labels):
        # np.int_ is the concrete dtype; using the abstract np.integer as a
        # dtype is deprecated in NumPy.
        return [(np.arange(2)==i).astype(np.int_) for i in labels]

    # default=0 keeps an empty input from raising.
    max_len = max((len(sample[1]) for sample in all_data), default=0)
    print("max_len",max_len)
    list_x = []
    list_y = []
    list_url = []
    for sample in all_data:
        input_x = sample[0]
        label_y = sample[1]
        if pad:
            # pad_sequences pads along the last axis, so the feature matrix
            # is transposed before padding and transposed back afterwards.
            input_x = np.transpose(pad_sequences(np.transpose(input_x,(1,0)), max_len,padding="post", truncating="post", value=0,dtype="float32"),(1,0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y],max_len,padding="post", truncating="post", value=-1)[0]
        else:
            list_x.append([input_x])
        list_y.append(one_hot(label_y))
        list_url.append(sample[2])
    return [np.array(list_x),np.array(list_y),list_url]


if __name__=="__main__":
    # Script entry point. The commented-out calls below are other steps of
    # this module that can be re-enabled one at a time.
    #data = getInput_byJS("http://www.tonghua.gov.cn/cjj/zbtb/201908/t20190802_360119.html")
    #dumpLinkTitle()
    #getAllData()
    #filter()
    # Active step: pad the samples stored in source_12input.pk and save the
    # resulting [x, y, urls] list.
    data = paddinig(load("source_12input.pk"))
    save(data,"source_12input_padding.pk")