import re
import module.htmlDrawing as hd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import time
from bs4 import BeautifulSoup
from module.Utils import *
import math
import json
from collections import OrderedDict
import os
scripts = '''
function statisticIframe(nodes){
    var counts_communicateTags = 0;
    for(var i=0;i<nodes.length;i++){
        child = nodes[i];
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                counts_communicateTags += 1;
            }
            if(child.tagName.toLowerCase()=="iframe"){
                if(child.contentWindow.document!=null){
                    counts_communicateTags += statisticIframe(child.contentWindow.document.all);
                }
            }
        }
    }
    return counts_communicateTags;
}
function statistic(node,deepth){
    if(node.childNodes==null){
        node.counts_communicateTags = 0;
        return node.counts_communicateTags;
    }
    node.counts_communicateTags = 0;
    for(var i=0;i<node.childNodes.length;i++){
        child = node.childNodes[i];
        // remove tags (disabled)
        /*
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
                node.removeChild(child);
                continue;
            }
        }
        */
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                node.counts_communicateTags += 1;
            }
        }
        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
            node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
        }else{
            node.counts_communicateTags += statistic(child,deepth+1);
        }*/
        node.counts_communicateTags += statistic(child,deepth+1);
    }
    var innertext = node.innerText;
    if(innertext){
        var text = innertext.replace(/\s/g,'');
        node.counts_text = text.length;
        var punc = text.match(/;|,|。|:|、/g);
        var lines = innertext.match(/.{10}\\n/g);
        if(lines){
            node.counts_lines = lines.length;
        }else{
            node.counts_lines = 0;
        }
        if(punc){
            node.counts_punctuations = punc.length;
        }else{
            node.counts_punctuations = 0;
        }
    }else{
        node.counts_lines = 0;
        node.counts_text = 0;
        node.counts_punctuations = 0;
    }
    node.deepth = deepth;
    return node.counts_communicateTags;
}
function label(node,targethtml){
    var innerhtml = node.innerHTML;
    if(innerhtml){
        innerhtml = innerhtml.replace(/\s/g,'');
        sub_innerhtml = innerhtml.substring(0,40);
        if (sub_innerhtml==targethtml.substring(0,40)){
            return 1;
        }else{
            return 0;
        }
    }else{
        return 0;
    }
}
function getListFontSize(node,_array){
    if(node!=null && node.nodeType==1){
        _fontSize = parseInt(window.getComputedStyle(node).fontSize.match(/\d+/)[0]);
        // parseInt yields NaN (not null) on failure; filter it out so it
        // cannot poison the Math.min below
        if(!isNaN(_fontSize)){
            _array.push(_fontSize);
        }
        if(node.childNodes!=null){
            for(var i=0;i<node.childNodes.length;i++){
                var child = node.childNodes[i];
                getListFontSize(child,_array);
            }
        }
    }
}
function statistic_time(node,_array){
    var pattern_time = /\d{4}[\-\/::年.]\d{1,2}[\-\/::月.]\d{1,2}/g;
    var _find_flag = false;
    if (node.childNodes!=null){
        for(var i=0;i<node.childNodes.length;i++){
            var childNode = node.childNodes[i];
            var _innerText = childNode.innerText;
            if (_innerText!=null && _innerText.search(pattern_time)>=0){
                statistic_time(childNode,_array);
                _find_flag = true;
            }
        }
    }
    if (!_find_flag && node!=document){
        _array_fontSize = new Array();
        getListFontSize(node,_array_fontSize);
        // Math.min over an array needs apply; Math.min(_array_fontSize) returns NaN
        _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min.apply(null,_array_fontSize)]);
    }
    return _array;
}
function search(){
    statistic(document,1);
    var objs = document.all;
    var data = new Array();
    for(var i=0;i<objs.length;i++){
        obj = objs[i];
        if (obj.offsetWidth>100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
            maxArea = 0;
            child_maxArea = null;
            secondmaxArea = 0;
            child_secondmaxArea = null;
            for(var j=0;j<obj.childNodes.length;j++){
                if(obj.childNodes[j].offsetWidth!=null && obj.childNodes[j].offsetHeight!=null){
                    if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>maxArea){
                        maxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
                        child_maxArea = obj.childNodes[j];
                    }
                    if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>secondmaxArea && obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight<maxArea){
                        secondmaxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
                        child_secondmaxArea = obj.childNodes[j];
                    }
                }
            }
            _item = new Array();
            _item.push(getOffsetLeft(obj),getOffsetTop(obj),obj.offsetWidth,obj.offsetHeight,obj.deepth,obj.counts_communicateTags,obj.counts_lines,obj.counts_text,obj.counts_punctuations,
                       getOffsetLeft(obj.parentNode),getOffsetTop(obj.parentNode),obj.parentNode.offsetWidth,obj.parentNode.offsetHeight,obj.parentNode.deepth,obj.parentNode.counts_communicateTags,obj.parentNode.counts_lines,obj.parentNode.counts_text,obj.parentNode.counts_punctuations);
            if(child_maxArea!=null){
                _item.push(getOffsetLeft(child_maxArea),getOffsetTop(child_maxArea),child_maxArea.offsetWidth,child_maxArea.offsetHeight,child_maxArea.deepth,child_maxArea.counts_communicateTags,child_maxArea.counts_lines,child_maxArea.counts_text,child_maxArea.counts_punctuations);
            }else{
                _item.push(-1,-1,-1,-1,-1,-1,-1,-1,-1);
            }
            if(child_secondmaxArea!=null){
                _item.push(getOffsetLeft(child_secondmaxArea),getOffsetTop(child_secondmaxArea),child_secondmaxArea.offsetWidth,child_secondmaxArea.offsetHeight,child_secondmaxArea.deepth,child_secondmaxArea.counts_communicateTags,child_secondmaxArea.counts_lines,child_secondmaxArea.counts_text,child_secondmaxArea.counts_punctuations);
            }else{
                _item.push(-1,-1,-1,-1,-1,-1,-1,-1,-1);
            }
            data.push([_item,obj.innerHTML,getListXpath(obj,new Array(),true)]);
        }
    }
    var data_time = statistic_time(document,new Array());
    return([data,data_time]);
}
return (search());
'''
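# The script above returns [data, data_time]. Each entry of `data` is
# [feature_vector, innerHTML, xpath], where the 36-dim feature vector packs
# 9 values (left, top, width, height, deepth, counts_communicateTags,
# counts_lines, counts_text, counts_punctuations) for the node itself, its
# parent, its largest child and its second-largest child (-1 placeholders
# when a child is missing). data_time rows are [left, top, xpath, min font
# size] for the innermost nodes whose text matches a date pattern.
# getOffsetLeft/getOffsetTop/getListXpath are expected to come from
# scripts_common.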
def statisticCommunicateTags(element):
    count = 0
    soup = BeautifulSoup(element.get_attribute("innerHTML"), "lxml")
    childs = soup.find_all(recursive=True)
    for child in childs:
        # interactive tags, or anything carrying an onclick handler
        if child.name in ["a", "input", "select"] or "onclick" in child.attrs:
            count += 1
    return count

def statisticPunctuationAndWords(element, punctuationWords_pattern=re.compile("[;,。:、]")):
    text = element.text
    text = re.sub("\r|\n|\s", "", text)
    words_len = len(text)
    punctuation_len = len(re.findall(punctuationWords_pattern, text))
    return punctuation_len, words_len

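# Example (hypothetical element): if element.text == "a;b,c。", the
# whitespace-stripped length is 6 and three characters match the punctuation
# class [;,。:、], so statisticPunctuationAndWords returns (3, 6).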
def encodeInput(url, target_source):
    def _method(args):
        try:
            url = args["url"]
            target_source = args["target_source"]
            browser = args["browser"]
            start_time = time.time()
            browser.get(url)
            print("get", time.time() - start_time)
            start_time = time.time()
            browser.maximize_window()
            findTags = ["div", "table", "tbody", "tr", "td", "form", "li", "span"]
            MIN_WIDTH = 400
            MIN_HEIGHT = 400
            list_input = []
            input_x = []
            label_y = []
            for tag in findTags:
                for element in browser.find_elements_by_tag_name(tag):
                    rect = element.rect
                    if rect["width"] >= MIN_WIDTH and rect["height"] >= MIN_HEIGHT:
                        list_input.append(element)
            print("search", time.time() - start_time)
            start_time = time.time()
            for element in list_input:
                communicateTags = statisticCommunicateTags(element)
                punctuation, words = statisticPunctuationAndWords(element)
                input_x.append([element.rect["x"], element.rect["y"], element.rect["width"], element.rect["height"], communicateTags, punctuation, words])
                label_y.append(labelElement(element, target_source))
            print("encode", time.time() - start_time)
            # scale x,y by the maximum width/height, the other features by their own maxima
            the_max = np.max(input_x, axis=0)
            the_max = np.array(list(the_max)[2:4] + list(the_max)[2:])
            input_x = np.array(input_x / the_max)
            if len(label_y) > 0 and np.max(label_y) == 1:
                return input_x, np.array(label_y)
            else:
                return None
        except Exception as e:
            print(e)
            return None
    args = {"url": url, "target_source": target_source}
    # hd.executeMethod is assumed to inject args["browser"] and to pass
    # _method's return value back to the caller
    return hd.executeMethod(_method, args)

def getInput(url):
    def _method(args):
        try:
            url = args["url"]
            browser = args["browser"]
            start_time = time.time()
            browser.get(url)
            print("get", time.time() - start_time)
            start_time = time.time()
            browser.maximize_window()
            findTags = ["div", "table", "tbody", "tr", "td", "form", "li", "span"]
            MIN_WIDTH = 400
            MIN_HEIGHT = 400
            list_input = []
            input_x = []
            for tag in findTags:
                for element in browser.find_elements_by_tag_name(tag):
                    rect = element.rect
                    if rect["width"] >= MIN_WIDTH and rect["height"] >= MIN_HEIGHT:
                        list_input.append(element)
            print("search", time.time() - start_time)
            start_time = time.time()
            for element in list_input:
                communicateTags = statisticCommunicateTags(element)
                punctuation, words = statisticPunctuationAndWords(element)
                input_x.append([element.rect["x"], element.rect["y"], element.rect["width"], element.rect["height"], communicateTags, punctuation, words])
            print("encode", time.time() - start_time)
            the_max = np.max(input_x, axis=0)
            the_max = np.array(list(the_max)[2:4] + list(the_max)[2:])
            input_x = np.array(input_x / the_max)
            return [np.expand_dims(input_x, 0)]
        except Exception as e:
            print(e)
            return None
    args = {"url": url}
    return hd.executeMethod(_method, args)

def encodeInput_byJS(url, targethtml):
    def label(innerhtml, target_source):
        # compare the first 60 whitespace-stripped characters of the candidate
        # innerHTML against the labeled target source
        target_source = re.sub("[\r\n\s]", "", str(target_source))
        pattern = ">(.*)<"
        target_source = re.findall(re.compile(pattern), target_source)[0]
        innerhtml = re.sub("[\r\n\s]", "", str(innerhtml))
        if target_source[0:60] == innerhtml[0:60]:
            return 1
        return 0

    def _method(args):
        try:
            url = args["url"]
            targethtml = args["targethtml"]
            browser = args["browser"]
            start = time.time()
            browser.get(url)
            _log = CLog()
            _log.write("get" + str(time.time() - start))
            browser.maximize_window()
            start = time.time()
            data, data_time = get_js_rs(browser, scripts_common + scripts)
            input_x, list_inner, list_xpath = dealWithScriptOut(data)
            list_label = []
            for item in list_inner:
                list_label.append(label(item, targethtml))
            print("cost", time.time() - start)
            if len(list_label) > 0 and np.max(list_label) == 1:
                return input_x, np.array(list_label)
        except Exception as e:
            print(e)
        return None
    args = {"url": url, "targethtml": targethtml}
    return hd.executeMethod(_method, args)

def getInput_byJS(browser, url):
    # `url` is kept for API compatibility: the original page-loading calls
    # (hd.getdriver()/hd.loadPage(browser, url)) are disabled, so the page is
    # assumed to be already loaded in `browser`
    try:
        data, data_time = get_js_rs(browser, scripts_common + scripts)
        log("content/time extraction script finished")
        input_x, list_inner, list_xpath = dealWithScriptOut(data)
        if input_x is not None:
            return True, [[np.expand_dims(input_x, 0)], list_inner, list_xpath, data_time]
        else:
            return False, ""
    except Exception as e:
        error(str(e))
        err_msg = ""
        if re.search("frame", str(e)) is not None:
            err_msg = "#iframe#"
        return None, err_msg

def dealWithScriptOut(data):
    list_input = []
    list_inner = []
    list_xpath = []
    # replace None/NaN features with -1
    for index in range(len(data)):
        for i in range(len(data[index][0])):
            if data[index][0][i] is None or math.isnan(data[index][0][i]):
                data[index][0][i] = -1
    # order by area (width*height), largest first
    data.sort(key=lambda x: x[0][2] * x[0][3], reverse=True)
    for item in data:
        list_input.append(item[0])
        list_inner.append(item[1])
        list_xpath.append(item[2])
    if len(list_input) > 0:
        the_max = np.max(list_input, axis=0)
        the_max = np.array([x if x > 0 else 1 for x in the_max])
        the_max = np.array((list(the_max)[2:4] + list(the_max)[2:9]) * 4)
        input_x = np.array(list_input / the_max)
        return input_x, list_inner, list_xpath
    else:
        return None, None, None

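# Note on the normalization above: the_max repeats the 9-value pattern
# [max_width, max_height, max_width, max_height, max_deepth, max_comm,
# max_lines, max_text, max_punc] four times (node, parent, largest child,
# second-largest child), so left/top are scaled by the page-wide maximum
# width/height and every other statistic by its own maximum.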
def getResponseHeaders(browser):
    har = json.loads(browser.get_log('har')[0]['message'])
    print(har['log']['entries'])
    return OrderedDict(sorted([(header["name"], header["value"]) for header in har['log']['entries'][0]['General']], key=lambda x: x[0]))

def getHttpStatus(browser):
    for responseReceived in browser.get_log('performance'):
        try:
            response = json.loads(responseReceived['message'])['message']['params']['response']
            if response['url'] == browser.current_url:
                return (response['status'], response['statusText'])
        except:
            pass
    return None

def getHttpResponseHeader(browser):
    for responseReceived in browser.get_log('performance'):
        try:
            response = json.loads(responseReceived['message'])['message']['params']['response']
            if response['url'] == browser.current_url:
                return response['headers']
        except:
            pass
    return None

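# Note: browser.get_log('performance') only yields entries when the driver was
# created with performance logging enabled. A minimal sketch (an assumption,
# for Chrome with Selenium 3; not part of this module):
#
#     from selenium import webdriver
#     caps = webdriver.DesiredCapabilities.CHROME.copy()
#     caps["goog:loggingPrefs"] = {"performance": "ALL"}
#     browser = webdriver.Chrome(desired_capabilities=caps)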
def labelElement(element, target_source):
    # strip whitespace, keep the content between the first '>' and the last
    # '<', then compare the first 60 characters
    target_source = re.sub("[\r\n\s]", "", str(target_source))
    pattern = ">(.*)<"
    target_source = re.findall(re.compile(pattern), target_source)[0]
    element_source = element.get_attribute("innerHTML")
    element_source = re.sub("[\r\n\s]", "", str(element_source))
    if target_source[0:60] == element_source[0:60]:
        return 1
    return 0

def padding(all_data, pad=True):
    max_len = np.max([len(data[1]) for data in all_data])
    print("max_len", max_len)
    list_x = []
    list_y = []
    list_url = []
    for data in all_data:
        input_x = data[0]
        label_y = data[1]
        url = data[2]
        if pad:
            input_x = np.transpose(pad_sequences(np.transpose(input_x, (1, 0)), max_len, padding="post", truncating="post", value=0, dtype="float32"), (1, 0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y], max_len, padding="post", truncating="post", value=-1)[0]
            # one-hot encode; the -1 padding positions map to [0, 0]
            list_y.append([(np.arange(2) == i).astype(int) for i in label_y])
        else:
            list_x.append([input_x])
            list_y.append([(np.arange(2) == i).astype(int) for i in label_y])
        list_url.append(url)
    return [np.array(list_x), np.array(list_y), list_url]
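# Minimal usage sketch (assumes entries shaped like [input_x, label_y, url],
# as produced in getAllData below):
#     list_x, list_y, list_url = padding(all_data)
# list_x then has shape (n_pages, max_len, n_features) and list_y is one-hot
# with the -1 padding rows mapped to [0, 0].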
def getAllData():
    all_data = load("Link_Content.pk")
    data = []
    temp_file = "temp_data.pk"
    count = 0
    label = 0
    data_len = len(all_data)
    for row in all_data:
        count += 1
        print(str(label) + "/" + str(count) + "/" + str(data_len), row[0])
        if count % 100 == 0:
            save(data, temp_file)
        encode = encodeInput_byJS(row[0], row[1])
        if encode:
            label += 1
            x, y = encode
            data.append([x, y, row[0]])
        else:
            print("None")
    data = padding(data)
    return data

def augmentation(data, times=100):
    aug_data = []
    for item in data:
        x, y = item[0], item[1]
        new_item = []
        for i_x, i_y in zip(list(x), list(y)):
            new_item.append([i_x, i_y])
        aug_data.append(item)
        for _ in range(times):
            new_x = []
            new_y = []
            np.random.shuffle(new_item)
            for new_i in new_item:
                new_x.append(new_i[0])
                new_y.append(new_i[1])
            aug_data.append([new_x, new_y])
    return aug_data

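# Usage sketch: augmentation(data, times=10) keeps each original sample and
# adds 10 shuffled copies of its (box, label) pairs, presumably to make the
# downstream model less sensitive to candidate-box order within a page.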
def dumpLinkContent():
    def trytosave(d):
        # keep only rows that can actually be pickled
        try:
            save(d, "1.pk")
            return 1
        except Exception as e:
            return 0
    import cx_Oracle
    conn = cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')  # connect to the database
    cursor = conn.cursor()
    sql = " select page_link,page_content from detail_content "
    cursor.execute(sql)
    rows = cursor.fetchall()
    data = []
    for row in rows:
        if trytosave(row) == 1:
            data.append(row)
    save(data, "Link_Content.pk")

def relabel(file_data="sourceData_36Input_28849_sort.pk"):
    '''
    @summary: adjust the labeled data to fix the roll-up problem: the label
    sits on an enclosing box instead of the deepest box that still covers
    the labeled content
    '''
    data = load(file_data)
    set_1 = set()
    for page in data:
        _feature = page[0]
        _label = page[1]
        _url = page[2]
        _label_index = np.argmax(_label)
        _label_left = _feature[_label_index][0]
        _label_top = _feature[_label_index][1]
        _label_width = _feature[_label_index][2]
        _label_height = _feature[_label_index][3]
        _label_deepth = _feature[_label_index][4]
        _label_text = _feature[_label_index][7]
        _index = 0
        _re_deepth = 0
        _re_index = -1
        for _box in _feature:
            _left = _box[0]
            _top = _box[1]
            _width = _box[2]
            _height = _box[3]
            _deepth = _box[4]
            _text = _box[7]
            if _deepth > _label_deepth:
                # a deeper box nested inside the labeled one that keeps most of
                # its area, or more than half the area and nearly all its text
                if (_left >= _label_left and _top >= _label_top
                        and (_left + _width) <= (_label_left + _label_width)
                        and (_top + _height) <= (_label_top + _label_height)
                        and (_width * _height / (_label_width * _label_height) > 0.7
                             or (_width * _height / (_label_width * _label_height) > 0.5
                                 and _text / _label_text > 0.9))):
                    set_1.add(_url)
                    if _deepth > _re_deepth:
                        _re_deepth = _deepth
                        _re_index = _index
            _index += 1
        if _re_index > -1:
            _label[_label_index] = 0
            _label[_re_index] = 1
            print(_url)
            print(_label_index, _re_index)
    data.sort(key=lambda x: x[2])
    print(len(list(set_1)))
    save(data, "sourceData_36Input_" + str(len(data)) + "_relabel.pk")
    data = padding(data)
    save(data, "data_" + str(len(data[1])) + "_relabel.pk")
    return data

if __name__ == "__main__":
    # dumpLinkContent()
    # relabel()
    # getInput_byJS needs a live driver; hd.getdriver() (taken from the
    # disabled code in getInput_byJS) is assumed to supply one
    browser = hd.getdriver()
    _flag, result = getInput_byJS(browser, "http://hailing.taizhou.gov.cn/art/2019/5/23/art_50810_2498758.html")
    if _flag:
        # result = [[input_x], list_inner, list_xpath, data_time]
        for item in result[3]:
            print(item)