123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700 |
'''
Created on 2019-08-12
@author: User
'''
- import module.htmlDrawing as hd
- import math
- import numpy as np
- from module.Utils import *
- from keras.preprocessing.sequence import pad_sequences
- import re
# JavaScript payload executed in the page (prepended with scripts_common, which
# supplies helpers such as check(), getOffsetLeft/Top(), getListXpath() and a
# custom Set implementation).  It walks the DOM computing per-node statistics
# (interactive-tag counts, width/height variance, text/punctuation/date counts,
# depth) and returns, for every element larger than 100x100 px, an 11-value
# feature vector, a 0/1 label derived from the URL set passed in arguments[0]
# (URLs joined by the separator "比地"), the node's innerHTML and its xpath list.
script_content = '''
function label(node,set_url){
    var node_flag = check(node,set_url);
    var child_flag = false;
    if(node.childNodes!=null){
        for(var i=0;i<node.childNodes.length;i++){
            var child = node.childNodes[i];
            if(check(child,set_url)){
                child_flag = true;
            }
        }
    }
    if(node_flag && !child_flag){
        return 1;
    }else{
        return 0;
    }
}
function statisticIframe(nodes){
    var counts_communicateTags = 0;
    for(var i=0;i<nodes.length;i++){
        child = nodes[i]
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                counts_communicateTags += 1;
            }
            if(child.tagName.toLowerCase()=="iframe"){
                if(child.contentWindow.document!=null){
                    counts_communicateTags += statisticIframe(child.contentWindow.document.all);
                }
            }
        }
    }
    return counts_communicateTags;
}
function statistic(node,deepth){
    if(node.childNodes==null){
        node.counts_communicateTags = 0;
        node.counts_tag = 0;
        node.entropy_width = 0;
        node.entropy_height = 0;
        return node.counts_communicateTags;
    }
    node.counts_communicateTags = 0;
    node.counts_tag = 0;
    var set_tag = new Set();
    var list_width = []
    var list_height = []
    for(var i=0;i<node.childNodes.length;i++){
        child = node.childNodes[i];
        //删除标签
        /*
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
                node.removeChild(child);
                continue;
            }
        }
        */
        if(child.offsetWidth){
            list_width.push(child.offsetWidth);
        }
        if(child.offsetHeight){
            list_height.push(child.offsetHeight);
        }
        node.counts_tag += 1;
        if (child.tagName!=null){
            set_tag.add(child.tagName.toLowerCase())
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                node.counts_communicateTags += 1;
            }
        }
        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
            node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
        }else{
            node.counts_communicateTags += statistic(child,deepth+1);
        }*/
        node.counts_communicateTags += statistic(child,deepth+1);
    }
    node.counts_tagType = set_tag.size();
    var sum_width = 0;
    var sum_height = 0;
    var avg_width = 0;
    var avg_height = 0;
    var entropy_width = 0;
    var entropy_height = 0;
    if(list_width.length>0){
        for(var i=0;i<list_width.length;i++){
            sum_width += list_width[i];
        }
        for(var i=0;i<list_height.length;i++){
            sum_height += list_height[i];
        }

        avg_width = sum_width/list_width.length;
        avg_height = sum_height/list_height.length;

        for(var i=0;i<list_width.length;i++){
            entropy_width += Math.pow(list_width[i]-avg_width,2);
        }
        for(var i=0;i<list_height.length;i++){
            entropy_height += Math.pow(list_height[i]-avg_height,2);
        }

        entropy_width /= list_width.length;
        entropy_height /= list_height.length;
    }

    entropy_width = entropy_width>1000?1000:entropy_width;
    entropy_height = entropy_height>1000?1000:entropy_height;
    node.entropy_width = entropy_width;
    node.entropy_height = entropy_height;

    var innertext = node.innerText;
    if(innertext){
        var pattern_time = /([^\d]?(\d{4}|\d{2})\s*[-\/::年.]\s*\d{1,2}\s*[-\/::月.]\s*\d{1,2}[^\d]?)|([^\d]?\d{2,4}\s*[-\/月年]\s*\d{1,2}[^\d]?)/
        var text = innertext.replace(/\s/g,'');
        //var text = innertext;
        node.counts_text = text.length;
        var punc = text.match(/;|,|。|:|、/g);
        var lines = innertext.match(/.{10}\\n/g);
        var times = innertext.match(pattern_time);
        if(lines){
            node.counts_lines = lines.length;
        }else{
            node.counts_lines = 0;
        }
        if(punc){
            node['counts_punctuations']= punc.length;
        }else{
            node.counts_punctuations = 0;
        }
        if(times){
            node.counts_times = times.length;
        }else{
            node.counts_times = 0;
        }

    }else{
        node.counts_lines = 0;
        node.counts_text = 0;
        node.counts_punctuations=0;
        node.counts_times = 0;
    }
    node.deepth = deepth;
    return node.counts_communicateTags;
}
function search(str_url){
    statistic(document,1);
    var objs = document.all;
    var set_url = new Set();
    list_url = str_url.split("比地");
    for(var i=0;i<list_url.length;i++){
        if(list_url[i]!=""){
            set_url.add(list_url[i]);
        }
    }
    var data = new Array();
    for(var i=0;i<objs.length;i++){
        obj = objs[i];
        if (obj.offsetWidth>100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
            _item = new Array();
            _item.push(getOffsetLeft(obj),getOffsetTop(obj),obj.offsetWidth,obj.offsetHeight,obj.deepth,obj.counts_text,obj.counts_times,obj.counts_tagType,obj.counts_tag,obj.entropy_width,obj.entropy_height)
            data.push([_item,label(obj,set_url),obj.innerHTML,getListXpath(obj,new Array())])
        }
    }
    return(data);
}
return search(arguments[0])
'''
# JavaScript payload executed in the page (prepended with scripts_common, which
# supplies findElements_byXpath(), getXpath(), clustering_turnPage() and a
# custom Set with add()/contains()/dataStore).  Given a content-area xpath in
# arguments[0], it collects the xpaths of non-pagination <a> links and of
# date-bearing leaf nodes, clusters each xpath list by single-numeric-index
# similarity, and returns [clustered_a, clustered_date, hrefs] or null.
script_get_A_Date = '''
function is_similar(source,target){
    var diff_index = -1;
    var source_split = source.split(/(\d+)/)
    var target_split = target.split(/(\d+)/)
    if(source_split.length==target_split.length){
        var diff_count = 0;
        for(var i=0;i<source_split.length;i++){
            if(source_split[i]!=target_split[i]){
                if(diff_index==-1){
                    if(source_split[i].search(/^\d+$/)>=0 && target_split[i].search(/^\d+$/)>=0){
                        diff_index = i;
                    }else{
                        //不同的部分一定要是数字
                        return -1;
                    }
                }
                diff_count += 1;
            }
        }
        if(diff_count==1){
            return diff_index;
        }else{
            return -1;
        }
    }else{
        return -1;
    }
}
function getNode_listContent(xpath){
    /*
    var objs = document.all;
    for(var i=0;i<objs.length;i++){
        var obj = objs[i];
        if(obj!=null && getXpath(obj,[])==xpath){
            return obj;
        }
    }
    return null;
    */
    var objs = findElements_byXpath(xpath);
    if(objs.length>0){
        return objs[0];
    }
    return null;
}
function statistic_time(node,_array){
    var pattern_time = /([^\d]?(\d{4}|\d{2})\s*[-\/::年.]\s*\d{1,2}\s*[-\/::月.]\s*\d{1,2}[^\d]?)|([^\d]?\d{2,4}\s*[-\/月年]\s*\d{1,2}[^\d]?)/
    var _find_flag = false;
    if (node.childNodes==null){
    }else{
        for(var i=0;i<node.childNodes.length;i++){
            var childNode = node.childNodes[i];
            var _innerText = childNode.innerText;
            if(childNode!=null && childNode.tagName!=null && childNode.tagName.toLowerCase()=="script"){
                continue;
            }
            if (_innerText!=null && _innerText.search(pattern_time)>=0){
                statistic_time(childNode,_array);
                _find_flag = true;
            }
        }
    }
    if (!_find_flag){
        _array.push(getXpath(node,["tr","li"],true));
    }
    return _array;
}
function padding_href(href){
    var baseUrl = window.location.href;
    var baseUrl_split = baseUrl.split("/");
    var join_flag = true;
    var href_padded = "";
    var level_nums = 1;
    var filename = "";
    if(href==null){
        join_flag = false;
    }else if(href.indexOf("javascript")>-1){
        join_flag = false;
    }else if(href.indexOf("http")>-1){
        join_flag = false;
        href_padded = href;
    }else if(href.indexOf("./")==0){
        filename = href.substring(2);
    }else if(href.indexOf("../")==0){
        level_nums ++;
        _substr = href.substring(3)
        while(true){
            if(_substr.indexOf("../")==0){
                level_nums ++;
                _substr = _substr.substring(3);
            }else{
                filename = _substr;
                break;
            }
        }
    }else if(href.indexOf("./")==0){
        level_nums = baseUrl_split.length-3;
        filename = href.substring(1);
    }else if(href.indexOf("?")==0){
        _href = baseUrl.split("?")[0]+href;
        return _href;
    }else{
        filename = href;
    }
    if(join_flag){
        for(var i=0;i<baseUrl_split.length-level_nums;i++){
            href_padded += baseUrl_split[i]+"/";
        }
        href_padded += filename;
    }
    return href_padded;
}
function statistic_A(node){
    var list_a = node.getElementsByTagName("a");
    var clustered_turnPage = clustering_turnPage();
    var array_xpath_turnPage = new Set();
    for(var i=0;i<clustered_turnPage.length;i++){
        array_xpath_turnPage.add(padding_href(clustered_turnPage[i][0].href));
    }
    var set_aXpath = new Set();
    var set_href = new Set();
    for(var i=0;i<list_a.length;i++){
        _href = padding_href(list_a[i].href);
        var is_turnPage = false;
        _xpath = getXpath(list_a[i],["tr","li"],true);
        if(array_xpath_turnPage.contains(_href)){
            is_turnPage = true;
        }
        if(!is_turnPage){
            set_aXpath.add(_xpath);
            if(_href!=""){
                set_href.add(_href);
            }

        }
    }
    return [set_aXpath.dataStore,set_href.dataStore];
}
function similar_all(_xpath,array_xpath){
    var similar_index = -1;
    for(var h=0;h<array_xpath.length;h++){
        diff_index = is_similar(_xpath,array_xpath[h]);
        if( similar_index>-1 && similar_index!=diff_index){
            return -1;
        }
        similar_index = diff_index;
        if(diff_index<=-1){
            return -1;
        }
    }
    return similar_index;
}
function clustering_xpath(array_xpath){
    var array_class = new Array();
    for(var i=0;i<array_xpath.length;i++){
        for(var j=0;j<array_class.length;j++){
            //与此类中所有xpath都要一样
            var diff_index = similar_all(array_xpath[i],array_class[j][1])
            if(diff_index>-1){
                if(array_class[j][0].indexOf(diff_index)==-1){
                    array_class[j][0].push(diff_index);
                }
                if(array_class[j][1].indexOf(array_xpath[i])<0){
                    array_class[j][1].push(array_xpath[i]);
                }

            }
        }
        array_class.push([[],[array_xpath[i]]]);
    }
    var _max_length = 0;
    var _max_index = -1;
    for(var i=0;i<array_class.length;i++){
        if(array_class[i][1].length>_max_length){
            _max_length = array_class[i][1].length;
            _max_index = i;
        }
    }
    return array_class[_max_index];
}
function search(content_xpath){
    try{
        content_node = getNode_listContent(content_xpath) //获取列表页标签节点
        if(content_node!=null){
            var array_a_href = statistic_A(content_node);
            var array_a = array_a_href[0];
            var array_href = new Array();
            var array_date = new Array();
            statistic_time(content_node,array_date);
            var _clustered_a = clustering_xpath(array_a);
            var _clustered_date = clustering_xpath(array_date);
            for(var i=0;i<array_a.length;i++){
                if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
                    array_href.push(array_a_href[1][i]);
                }
            }
            return [_clustered_a,_clustered_date,array_href]
        }
        return null;
    }
    catch(e){
        return null
    }
}
return search(arguments[0]);
'''
def dealWithScriptOut(data):
    """
    Normalize the raw rows returned by ``script_content``.

    Each row of *data* is ``[features, label, innerHTML, xpath]`` where
    *features* is an 11-value numeric list (some entries may be None/NaN).

    Cleans None/NaN features to -1, sorts rows by element area
    (width*height, features[2]*features[3]) descending, and scales each
    feature column by a per-column divisor built from the column maxima
    (width/height maxima reused for the left/top offsets, fixed divisors
    10 and 20 for the counts_tagType/counts_tag columns).

    Returns ``(input_x, list_label, list_inner, list_xpath)`` or ``None``
    when *data* is empty.
    """
    # Replace missing feature values in place so max/scaling below are safe.
    for row in data:
        features = row[0]
        for i, value in enumerate(features):
            if value is None or math.isnan(value):
                features[i] = -1
    # Largest (by rendered area) elements first.
    data.sort(key=lambda x: x[0][2] * x[0][3], reverse=True)
    list_input = [row[0] for row in data]
    list_label = [row[1] for row in data]
    list_inner = [row[2] for row in data]
    list_xpath = [row[3] for row in data]
    if not list_input:
        return None
    the_max = np.max(list_input, axis=0)
    # Avoid division by zero/negative maxima.
    the_max = np.array([x if x > 0 else 1 for x in the_max])
    # Per-column divisors: [w,h] maxima reused for the offset columns,
    # fixed 10/20 for the two count columns at indices 7 and 8.
    the_max = np.array(list(the_max)[2:4] * 2 + list(the_max)[4:7] + [10, 20] + list(the_max)[9:])
    input_x = np.array(list_input / the_max)
    return input_x, list_label, list_inner, list_xpath
-
def encodeInput_byJS(url, str_href):
    """
    Load *url* in a pooled browser, run the feature-extraction script with
    *str_href* (candidate URLs joined by "比地") and return
    ``(input_x, labels)`` when exactly one element was labelled positive,
    otherwise ``None``.  Errors are logged and swallowed (best-effort).
    """
    browser = None
    try:
        browser = hd.getdriver()
        debug("get driver")
        hd.loadPage(browser, url)
        data = get_js_rs(browser, scripts_common + script_content, str_href)
        deal_data = dealWithScriptOut(data)
        if deal_data is None:
            return None
        input_x, list_label, list_inner, list_xpath = deal_data
        # A usable training sample has exactly one positively-labelled node.
        if np.sum(list_label) == 1:
            return input_x, np.array(list_label)
        return None
    except Exception as e:
        log(str(e))
    finally:
        # Bug fix: if getdriver() itself raised, `browser` was unbound and the
        # old finally-block raised NameError, masking the original error.
        if browser is not None:
            hd.adddriver(browser)
            debug("release driver")
    return None
-
def getInput_byJS(browser, url, str_href):
    """
    Run the feature-extraction script on an already-loaded *browser* page and
    return ``([input_x with batch dim], innerHTMLs, xpaths)``, or ``None``
    when the script yields no usable rows or an error occurs (logged).
    """
    try:
        script_result = get_js_rs(browser, scripts_common + script_content, str_href)
        processed = dealWithScriptOut(script_result)
        if processed is None:
            return None
        feature_matrix, _, inner_htmls, xpaths = processed
        return [np.expand_dims(feature_matrix, 0)], inner_htmls, xpaths
    except Exception as e:
        error(str(e))
        return None
-
def getRule_A_Date(browser, url, content_xpath):
    """
    Derive list-page extraction rules (link xpaths and date xpaths) for *url*.

    Runs ``script_get_A_Date`` first with *content_xpath*, then falls back to
    "/html".  The script returns ``[clustered_a, clustered_date, hrefs]``
    where each cluster is ``[diff_indices, xpath_list]``.

    Returns ``None`` when nothing is found, otherwise
    ``(dict_Rule_A_Date, list_hrefs)``; ``dict_Rule_A_Date["flag"]`` is set
    False (with an error message) when the A/Date counts disagree or an
    xpath cannot be split on its numeric indices.
    """
    def appendXpath(list_xpath, _xpath):
        # First fragment is kept whole; later fragments are prefixed with the
        # tail segment of the previous one so fragments chain together.
        if len(list_xpath)==0:
            list_xpath.append(_xpath)
        else:
            list_xpath.append(list_xpath[-1].split("/")[-1]+"/"+_xpath)

    dict_Rule_A_Date = {"listpage_A":None,
                        "listpage_Date":None,
                        "flag":True,
                        "hasDrew":False}

    list_a = None
    # Try the supplied content xpath first, then the whole document.
    for _content_xpath in [content_xpath,"/html"]:
        data = get_js_rs(browser, scripts_common+script_get_A_Date,_content_xpath)
        if data is None:
            log("A_Date not found with xpath:"+_content_xpath)
            continue
        # For the fallback xpath only accept a result whose A and Date
        # cluster sizes already match.
        if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
            list_a = data[0]
            list_date = data[1]
            list_hrefs = data[2]
        if list_a is not None and len(list_a[1])==len(list_date[1]):
            log('list_a is not None and len(list_a[1])==len(list_date[1])')
            break
        else:
            log("different length of A and Date:with xpath:"+_content_xpath)
    if list_a is None:
        log("A_Date not found with all xpath")
        return None;
    log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
    log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
    log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
    if len(list_a[1])!=len(list_date[1]):
        dict_Rule_A_Date["flag"] = False
        add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
        return dict_Rule_A_Date,list_hrefs
    else:
        # ---- build the link (A) rule from the first clustered xpath ----
        list_diffindex = list_a[0]
        _xpath = list_a[1][0]
        listpage_a = []
        begin = 0
        list_diffindex.sort(key=lambda x:x)
        _jump_flag = False
        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
        # Split the xpath on numeric runs; diff indices point at the numbers
        # that vary between list items (e.g. the [n] predicates).
        _xpath_split = re.split("(\d+)",_xpath)
        for i in range(len(list_diffindex)):
            _index = list_diffindex[i]
            # A varying number must sit inside a [...] predicate.
            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
                dict_Rule_A_Date["flag"] = False
                return dict_Rule_A_Date,list_hrefs
            else:
                if i==0:
                    appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
                    begin = _index+1
                elif i<len(list_diffindex):
                    # NOTE(review): this condition is always true here, so the
                    # branch below is unreachable — kept as in the original.
                    appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
                    begin = _index+1
                else:
                    appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
                if i==len(list_diffindex)-1:
                    # Append the fixed tail after the last varying index.
                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
                    if _group is not None:
                        appendXpath(listpage_a,_group.group(1))
        for i in range(len(listpage_a)):
            # Overly deep fragments are generalized in the browser.
            if len(listpage_a[i].split("/"))>6:
                listpage_a[i] = get_js_rs(browser, scripts_replaceXpath,listpage_a[i])
        dict_Rule_A_Date["listpage_A"] = listpage_a
        # ---- build the date rule: same procedure applied to list_date ----
        list_diffindex = list_date[0]
        _xpath = list_date[1][0]
        listpage_date = []
        begin = 0
        list_diffindex.sort(key=lambda x:x)
        _jump_flag = False
        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
        _xpath_split = re.split("(\d+)",_xpath)
        for i in range(len(list_diffindex)):
            _index = list_diffindex[i]
            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
                dict_Rule_A_Date["flag"] = False
                return dict_Rule_A_Date,list_hrefs
            else:
                if i==0:
                    appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
                    begin = _index+1
                elif i<len(list_diffindex):
                    appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
                    begin = _index+1
                else:
                    appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
                if i==len(list_diffindex)-1:
                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
                    if _group is not None:
                        appendXpath(listpage_date,_group.group(1))
        for i in range(len(listpage_date)):
            if len(listpage_date[i].split("/"))>6:
                listpage_date[i] = get_js_rs(browser, scripts_replaceXpath,listpage_date[i])
        dict_Rule_A_Date["listpage_Date"] = listpage_date
        return dict_Rule_A_Date,list_hrefs

    # NOTE(review): unreachable — both branches above return; kept as-is.
    return None
-
def dumpLinkContent():
    """
    Dump (page_link, page_content) rows from BXKC.DETAIL_CONTENT_HTML
    (page_type=0) into ``Link_Content.pk``, keeping only rows that can be
    pickled.
    """
    def trytosave(d):
        # Probe that the row is picklable; unpicklable rows are skipped.
        try:
            save(d, "1.pk")
            return 1
        except Exception:
            return 0
    import cx_Oracle
    # NOTE(review): hard-coded DB credentials — should come from config/env.
    conn = cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')
    cursor = conn.cursor()
    try:
        sql = " select page_link,page_content from BXKC.DETAIL_CONTENT_HTML where page_type=0 "
        cursor.execute(sql)
        data = []
        while True:
            try:
                rows = cursor.fetchmany(10)
            except Exception as e:
                # Bug fix: a failing fetch used to print and retry forever
                # (infinite loop); now we stop and keep what we have.
                print(e)
                break
            if not rows:
                break
            for row in rows:
                if trytosave(row) == 1:
                    data.append(row)
        save(data, "Link_Content.pk")
    finally:
        # Bug fix: cursor/connection were never closed (resource leak).
        cursor.close()
        conn.close()
-
def getAllData():
    """
    Encode every (link, content) pair previously dumped to Link_Content.pk,
    periodically checkpointing to temp_data.pk, and save the final list of
    ``[x, y, link]`` samples to data_done.pk.  Returns the sample list.
    """
    all_data = load("Link_Content.pk")
    temp_file = "temp_data.pk"
    encoded_rows = []
    positives = 0
    total = len(all_data)
    for index, row in enumerate(all_data, start=1):
        print(str(positives) + "/" + str(index) + "/" + str(total), row[0])
        # Periodic checkpoint so a crash does not lose all progress.
        if index % 100 == 0:
            save(encoded_rows, temp_file)
        encoded = encodeInput_byJS(row[0], row[1])
        if encoded:
            positives += 1
            x, y = encoded
            encoded_rows.append([x, y, row[0]])
        else:
            print("None")
    save(encoded_rows, "data_done.pk")
    return encoded_rows
def filter():
    """
    Load checkpointed samples from temp_data.pk, keep only those with fewer
    than 100 feature rows, print simple statistics and save the filtered
    list to source_11input.pk.  (Name shadows the builtin; kept for
    compatibility with existing callers.)
    """
    samples = load("temp_data.pk")
    print(samples[0])
    samples.sort(key=lambda item: item[2])
    lengths = [len(sample[0]) for sample in samples]
    kept = [sample for sample in samples if len(sample[0]) < 100]
    print(max(lengths))
    print(len(samples))
    print(len(kept))
    save(kept, "source_11input.pk")
-
def padding(all_data, pad=True):
    """
    Convert loaded ``[input_x, label_y, url]`` samples into model-ready arrays.

    When *pad* is True, feature matrices and label sequences are padded /
    truncated ("post") to the longest label sequence; labels are one-hot
    encoded over 2 classes, so padded positions (value -1) become [0, 0].
    When *pad* is False, samples are wrapped unpadded (assumes equal lengths
    if the result is to be a dense array).

    Returns ``[np.array(list_x), np.array(list_y), list_url]``.
    """
    max_len = np.max([len(data[1]) for data in all_data])
    print("max_len", max_len)
    list_x = []
    list_y = []
    list_url = []
    for data in all_data:
        input_x = data[0]
        label_y = data[1]
        url = data[2]
        if pad:
            # pad_sequences works on the first axis, so transpose features
            # to (features, time), pad, and transpose back.
            input_x = np.transpose(pad_sequences(np.transpose(input_x, (1, 0)), max_len, padding="post", truncating="post", value=0, dtype="float32"), (1, 0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y], max_len, padding="post", truncating="post", value=-1)[0]
            # Bug fix: np.integer is an abstract scalar class and is not a
            # valid dtype in recent NumPy; use a concrete dtype instead.
            list_y.append([(np.arange(2) == i).astype(np.int64) for i in label_y])
        else:
            list_x.append([input_x])
            list_y.append([(np.arange(2) == i).astype(np.int64) for i in label_y])
        list_url.append(url)
    return [np.array(list_x), np.array(list_y), list_url]
-
- if __name__=="__main__":
- #dumpLinkContent()
- #getAllData()
- #filter()
- #data = padding(load("source_11input.pk"))
- #save(data,"source_11input_padding.pk")
- getRule_A_Date(url="http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm",content_xpath='//*[@class="yaowen_list"]')
|