# -*- coding: utf-8 -*-
# Origin: luojiehua/ContentExtract
'''
Created on 2018-12-20

@author: User
'''

import numpy as np
import re
import gensim
from keras import backend as K
import ctypes
import inspect
# Path of the word2vec vector file that getModel_w2v() loads (binary format).
w2vfile = "../wiki_128_word_embedding_new.vector"
# Module-level cache for the loaded vectors; filled by the first getModel_w2v() call.
model_w2v = None

from decimal import Decimal
import logging
#logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

import pickle
import tensorflow as tf
from keras import losses
import threading

__author__ = 'baniu.yao'

class MyThread(threading.Thread):
    """Thread that keeps the return value of the callable it executes."""

    def __init__(self, func, args=()):
        """Remember ``func`` and the positional ``args`` it will be called with."""
        super().__init__()
        self.func = func
        self.args = args

    def run(self):
        """Invoke ``func(*args)`` and store the outcome on ``self.result``."""
        outcome = self.func(*self.args)
        self.result = outcome

    def get_result(self):
        """Return the stored result, or ``None`` if it cannot be read.

        ``self.result`` is only assigned once ``run`` has completed without
        raising; reading it earlier fails, in which case the error is printed
        and ``None`` is returned.
        """
        try:
            value = self.result
        except Exception as e:
            print('执行js抛出异常：', e)
            return None
        return value

def get_js_rs(browser, script, *arg, timeout=20):
    '''
    Run a script in the browser on a worker thread and return its result,
    giving up after ``timeout`` seconds.

    :param browser: browser object exposing ``execute_script(script, *args)``
    :param script: script text to execute
    :param arg: positional arguments forwarded to the script
    :param timeout: seconds to wait for the script before aborting it
    :return: the value returned by ``execute_script``, or None on timeout
             (or when the worker produced no result)
    '''
    def execute_js():
        return browser.execute_script(script, *arg)

    t = MyThread(func=execute_js, args=())
    # ``setDaemon`` is deprecated; the ``daemon`` attribute is the supported form.
    t.daemon = True
    t.start()
    t.join(timeout)
    # ``Thread.isAlive`` was removed in Python 3.9; ``is_alive`` works everywhere.
    if t.is_alive():
        print('执行js超时')
        stop_thread(t)
        return None
    return t.get_result()

import time
def thread_run(func, *arg, timeout=30):
    """Run ``func(*arg)`` on a daemon worker thread and wait up to ``timeout`` seconds.

    :param func: callable to execute
    :param arg: positional arguments passed to ``func``
    :param timeout: seconds to wait for the worker to finish
    :return: the value returned by ``func``; None when the worker has not
             produced a result yet (for example after a timeout). The worker
             is not stopped on timeout, only reported.
    """
    t = MyThread(func=func, args=arg)
    # ``setDaemon`` is deprecated; the ``daemon`` attribute is the supported form.
    t.daemon = True
    t.start()
    t.join(timeout)
    # ``Thread.isAlive`` was removed in Python 3.9; ``is_alive`` works everywhere.
    if t.is_alive():
        print('thread_run time out')
    return t.get_result()

def xpath2css(xpath):
    '''
    Convert an xpath expression into a css selector path.

    ``//`` and ``@`` are dropped, ``/`` becomes the child combinator ``>`` and
    every positional predicate ``[N]`` becomes ``:nth-child(N)``.

    :param xpath: xpath string
    :return: css selector string (empty string for empty input)
    '''
    css = xpath.replace('//', '').replace('@', '').replace('/', '>')
    # ``\d+`` so that indices of 10 and above are converted as well.
    css = re.sub(r'\[(\d+)\]', r':nth-child(\1)', css)
    # ``startswith`` is safe on an empty string, unlike indexing ``css[0]``.
    if css.startswith('>'):
        css = css[1:]
    return css

def get_class_from_frame(fr):
    """Return the class of the ``self`` argument bound in frame ``fr``.

    :param fr: frame object to inspect
    :return: the instance's ``__class__`` when the frame's first argument is
             named ``self``; otherwise None
    """
    args, _, _, value_dict = inspect.getargvalues(fr)
    if args and args[0] == 'self':
        instance = value_dict.get('self')
        # Compare against None explicitly: a falsy instance (e.g. an empty
        # container subclass) is still a valid ``self``.
        if instance is not None:
            return getattr(instance, '__class__', None)
    return None


class CLog(object):
    """Logger wrapper that writes to a file and tags each record with its call chain.

    Records go to the ``single_server`` logger; the attached file handler
    formats them with a ``chain`` field that ``info``/``error``/``debug``
    supply through ``extra``.
    """

    def __init__(self, log_file_path='./test.log'):
        """Configure logging and attach a UTF-8 file handler for ``log_file_path``."""
        import os

        logging.basicConfig(level=logging.INFO, filemode='a', format='%(asctime)s %(message)s')
        self.logger = logging.getLogger("single_server")
        self.logger.setLevel(logging.DEBUG)

        # ``getLogger`` returns the same logger every time, so creating a
        # second CLog for the same file must not add a second handler
        # (every record would then be written twice).
        target = os.path.abspath(log_file_path)
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and getattr(handler, 'baseFilename', None) == target
            for handler in self.logger.handlers
        )
        if not already_attached:
            file_handler = logging.FileHandler(log_file_path, encoding="UTF8")
            formatter = logging.Formatter(
                '%(asctime)s %(levelname)s %(message)s [%(chain)s] %(thread)s %(threadName)s')
            file_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)

    def get_file_name_in_full_path(self, file_path):
        """Return the last path component, accepting both ``/`` and ``\\`` separators."""
        return re.split(r'[\\/]', file_path)[-1]

    def get_meta_data(self):
        """Describe the current call stack as ``outer(args) --> inner(args)``.

        For each frame the function name is paired with the text found between
        parentheses on its caller's source line. The two innermost entries
        (this method and the logging method that called it) are dropped.
        """
        frames = inspect.stack()
        chain_list = []
        for i in range(len(frames) - 1):
            func_name = frames[i][3]
            # Source line of the caller; None when the source is unavailable.
            caller_context = frames[i + 1][-2]
            args = ""
            if caller_context:
                found = re.search(r'\((.*)\)', caller_context[0])
                if found:
                    args = found.group(1)
            chain_list.append('%s(%s)' % (func_name, args))
        chain_list.reverse()
        return ' --> '.join(chain_list[:-2])

    def info(self, message):
        """Log ``message`` at INFO level together with the call chain."""
        chain = self.get_meta_data()
        self.logger.info(message, extra={"chain": chain})

    def error(self, message):
        """Log ``message`` at ERROR level together with the call chain."""
        chain = self.get_meta_data()
        self.logger.error(message, extra={"chain": chain})

    def debug(self, message):
        """Log ``message`` at DEBUG level together with the call chain."""
        chain = self.get_meta_data()
        self.logger.debug(message, extra={"chain": chain})
        

def add_err_msg(_dict, msg):
    """Append ``msg`` to ``_dict["err_msg"]`` unless it is already contained in it.

    :param _dict: dictionary updated in place
    :param msg: error message text
    """
    _key = "err_msg"
    if _key in _dict:
        # Plain substring test: ``msg`` is text, not a regular expression, so
        # characters such as "(" or "[" must not be interpreted as a pattern.
        if msg not in _dict[_key]:
            _dict[_key] = _dict[_key] + msg
    else:
        _dict[_key] = msg

def _async_raise(tid, exctype):
    """Ask the interpreter to raise ``exctype`` inside the thread with id ``tid``.

    :param tid: identifier of the target thread
    :param exctype: exception class, or an instance whose class is used
    :raises ValueError: when the call reports that no thread was affected
    :raises SystemError: when the call reports more than one affected thread;
        the request is cancelled before raising
    """
    set_async_exc = ctypes.pythonapi.PyThreadState_SetAsyncExc
    thread_id = ctypes.c_long(tid)
    exc_class = exctype if inspect.isclass(exctype) else type(exctype)
    affected = set_async_exc(thread_id, ctypes.py_object(exc_class))
    if affected == 0:
        raise ValueError("invalid thread id")
    if affected != 1:
        # Undo the pending request by passing no exception.
        set_async_exc(thread_id, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")

def stop_thread(thread):
    """Request termination of ``thread`` by raising ``SystemExit`` inside it."""
    target_id = thread.ident
    _async_raise(target_id, SystemExit)

# Shared logger used by log()/error()/debug(); created at import time with
# CLog's default log file path.
_log = CLog()

def log(msg):
    '''
    @summary: write ``msg`` to the shared logger at INFO level
    '''
    _log.info(msg)

def error(msg):
    '''
    @summary: write ``msg`` to the shared logger at ERROR level
    '''
    _log.error(msg)
    
def debug(msg):
    '''
    @summary: write ``msg`` to the shared logger at DEBUG level
    '''
    _log.debug(msg)

def save(object_to_save, path):
    '''
    Pickle an object to a file.

    @Args:
        object_to_save: object to serialize
        path: destination file path (overwritten if it exists)

    @Return:
        the path the object was written to
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)
    # The documented contract is to hand back the saved path; previously
    # nothing was returned.
    return path

def load(path):
    '''
    Read back a pickled object.

    @Args:
        path: file path to read from

    @Return:
        the unpickled object
    '''
    with open(path, 'rb') as source:
        return pickle.load(source)


def find_index(list_tofind, text):
    '''
    @summary: locate the first occurrence of each word inside a string
    @param:
        list_tofind: words to look for
        text: string to search in
    @return: list with one position per word; -1 when the word is absent
    '''
    # str.find already yields -1 for a missing word.
    return [text.find(word) for word in list_tofind]


def combine(list1, list2):
    '''
    @summary: concatenate every item of list1 with every item of list2
    @param:
        list1: list of strings (items are converted with str)
        list2: list of strings (items are converted with str)
    @return: list of concatenations, ordered by list1 first and list2 second
    '''
    return [str(left) + str(right) for left in list1 for right in list2]


def getDigitsDic(unit):
    '''
    @summary: map a Chinese numeral character to its integer value
    @return: int 0-9, or None when ``unit`` is not a known numeral
    '''
    # Position in each string is the numeric value of the character.
    formal_numerals = "零壹贰叁肆伍陆柒捌玖"
    common_numerals = "〇一二三四五六七八九"
    lookup = {}
    for numerals in (formal_numerals, common_numerals):
        for value, char in enumerate(numerals):
            lookup[char] = value
    return lookup.get(unit)

def getMultipleFactor(unit):
    '''
    @summary: map a Chinese money unit to its multiplier
    @return: Decimal multiplier, or None when ``unit` is not a known unit
    '''
    factors = {
        "兆": Decimal(1000000000000),
        "亿": Decimal(100000000),
        "万": Decimal(10000),
        "仟": Decimal(1000),
        "千": Decimal(1000),
        "佰": Decimal(100),
        "百": Decimal(100),
        "拾": Decimal(10),
        "十": Decimal(10),
        "元": Decimal(1),
        "角": Decimal("0.1"),
        "分": Decimal("0.01"),
    }
    return factors.get(unit)

def getUnifyMoney(money):
    '''
    @summary: convert a money string (digits and/or Chinese units) to a number
    @param:
        money: money string, e.g. plain digits or digits/formal numerals
               combined with the units 兆 亿 万 仟 佰 拾 元 角 分
    @return: Decimal amount; Decimal(0) when nothing can be parsed
    '''
    # Drop thousands separators (full-width and ASCII commas).
    money = re.sub("[，,]", "", money)
    chnDigits = ("零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖")
    # Ordered from the largest unit to the smallest; the first unit that
    # occurs in the string decides where it is split.
    chnFactorUnits = ("兆", "亿", "万", "仟", "佰", "拾", "元", "角", "分")
    # Commas are already removed, so a plain number is digits with an
    # optional fraction.
    numeric = re.compile(r"^\d+(\.\d+)?$")

    def part_value(part):
        # Value of one side of a split: plain number, single formal numeral,
        # or a nested expression handled recursively.
        if numeric.search(part) is not None:
            return Decimal(part)
        if len(part) == 1:
            if part in chnDigits:
                return Decimal(getDigitsDic(part))
            # A lone character that is not a formal numeral contributes nothing.
            return Decimal(0)
        return Decimal(getUnifyMoney(part))

    if numeric.search(money) is not None:
        return Decimal(money)
    if money in chnDigits:
        # Decimal like every other path (an int was returned here before).
        return Decimal(getDigitsDic(money))

    for factorUnit in chnFactorUnits:
        if factorUnit not in money:
            continue
        # Split on the last occurrence of the unit: head is multiplied by the
        # unit, tail is added as-is.
        subMoneys = re.split("%s(?!.*%s.*)" % (factorUnit, factorUnit), money)
        result = part_value(subMoneys[0]) * getMultipleFactor(factorUnit)
        if len(subMoneys) > 1:
            # The tail goes through the same strict numeric test as the head;
            # a tail such as "5万" or "5元" is parsed recursively instead of
            # being handed to Decimal(), which cannot parse it.
            result += part_value(subMoneys[1])
        return result
    return Decimal(0)

def mergeDict(list_dict):
    """Merge a list of result dictionaries into one.

    - ``flag`` becomes falsy if any entry is None or carries a falsy ``flag``.
    - ``hasDrew`` is the ``or`` of all ``hasDrew`` values.
    - ``err_msg`` is the concatenation of all ``err_msg`` values.
    - Every other key is copied; later dictionaries overwrite earlier ones.
    - ``count_rules`` counts the merged keys (excluding ``flag``, ``success``
      and ``count_rules``) whose value is neither ``""`` nor None.

    :param list_dict: iterable of dictionaries (None entries allowed)
    :return: the merged dictionary
    """
    merged = {}
    overall_flag = True
    drew = False
    messages = ""
    for item in list_dict:
        if item is None:
            overall_flag = False
            continue
        for name in item:
            value = item[name]
            if name == "flag":
                if not value:
                    overall_flag = value
            else:
                if name == "err_msg":
                    messages += value
                merged[name] = value
            if name == "hasDrew":
                drew = drew or value
    merged["flag"] = overall_flag
    merged["hasDrew"] = drew
    merged["err_msg"] = messages
    excluded = ("flag", "success", "count_rules")
    merged["count_rules"] = sum(
        1
        for name, value in merged.items()
        if name not in excluded and value != "" and value is not None
    )
    return merged

def getCommonXpath(list_xpaths, on_value=0.6):
    """Pick the deepest path element shared by enough of the given paths.

    For every position, the element of the first path that reaches that
    position is the candidate; it is accepted when the share of all paths
    carrying the same element at that position is at least ``on_value``.
    The candidate accepted at the greatest position is returned.

    :param list_xpaths: list of sequences of path elements
    :param on_value: minimum share (0-1) of paths that must agree
    :return: the selected element, or None when no position qualifies
    """
    common = None
    total = len(list_xpaths)
    if total == 0:
        return common
    depth = max(len(path) for path in list_xpaths)
    for level in range(depth):
        candidate = None
        votes = 0
        for path in list_xpaths:
            if level >= len(path):
                continue
            if candidate is None:
                candidate = path[level]
            if path[level] == candidate:
                votes += 1
        if votes / total >= on_value:
            common = candidate
    return common


def getModel_w2v():
    '''
    @summary: return the word vectors, loading them from ``w2vfile`` on first use
    '''
    global model_w2v
    if model_w2v is not None:
        return model_w2v
    model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2vfile, binary=True)
    return model_w2v


def findAllIndex(substr, wholestr):
    '''
    @summary: find the start index of every non-overlapping occurrence of a substring
    @param:
        substr: substring to look for
        wholestr: string to search in
    @return: list of start indexes in ascending order; empty when ``substr``
             is empty or does not occur
    '''
    # An empty needle matches everywhere and would never advance the search.
    if not substr:
        return []
    result = []
    step = len(substr)
    # Search with a moving start offset instead of re-slicing the string.
    start = wholestr.find(substr)
    while start >= 0:
        result.append(start)
        start = wholestr.find(substr, start + step)
    return result
    
  
def spanWindow(tokens, begin_index, end_index, size):
    '''
    @summary: collect the context tokens around an entity
    @param:
        tokens: tokenized sentence (list)
        begin_index: index of the entity's first token
        end_index: index of the entity's last token
        size: number of tokens to take on each side
    @return: two-element list [tokens before the entity, tokens after the entity]
    '''
    # Clamp the window to the sentence boundaries.
    left_start = max(0, begin_index - size)
    right_stop = min(end_index + size + 1, len(tokens))
    before = tokens[left_start:begin_index]
    after = tokens[end_index + 1:right_stop]
    return [before, after]

def embedding(datas, shape):
    '''
    @summary: look up the word vector of every token
    @param:
        datas: list of token lists
        shape: shape of the result; ``shape[1]`` limits the number of tokens
               used from each token list
    @return: array of the given shape; tokens missing from the vocabulary
             keep an all-zero vector
    '''
    w2v = getModel_w2v()
    # gensim 4 replaced ``KeyedVectors.vocab`` with ``key_to_index``; support
    # both so the lookup works on either version.
    vocab = w2v.key_to_index if hasattr(w2v, "key_to_index") else w2v.vocab
    embed = np.zeros(shape)
    length = shape[1]
    whitespace = re.compile(r"\s+")
    for out_index, data in enumerate(datas):
        for index, item in enumerate(data):
            if index >= length:
                break
            token = whitespace.sub("", item)
            if token in vocab:
                embed[out_index][index] = w2v[token]
    return embed

def partMoney(entity_text, input2_shape=(7,)):
    '''
    @summary: bucket an amount by order of magnitude
    @param:
        entity_text: amount (anything accepted by ``float``)
        input2_shape: shape of the one-hot result; needs at least 7 slots
    @return: array with a single 1 at the bucket index:
             0: <100, 1: <1000, 2: <10000, 3: <100000,
             4: <1000000, 5: <10000000, 6: everything else
    '''
    money = float(entity_text)
    parts = np.zeros(input2_shape)
    # Exclusive upper bounds of buckets 0..5; anything beyond falls in bucket 6.
    upper_bounds = (100, 1000, 10000, 100000, 1000000, 10000000)
    bucket = len(upper_bounds)
    for position, bound in enumerate(upper_bounds):
        if money < bound:
            bucket = position
            break
    parts[bucket] = 1
    return parts

def recall(y_true, y_pred):
    '''
    Recall metric.

    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model

    @Return
        recall: rounded true positives divided by rounded actual positives
        (0 when there are no actual positives)
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # A Python ``== 0`` test cannot guard the division when the operands are
    # symbolic tensors, so the denominator is offset by the backend epsilon
    # instead; with no positives the result is 0 / epsilon = 0.
    return true_positives / (actual_positives + K.epsilon())


def f1_score(y_true, y_pred):
    '''
    F1 metric.

    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model

    @Return
        F1: harmonic mean of precision and recall
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # A Python ``== 0`` test cannot guard divisions on symbolic tensors, so
    # every denominator is offset by the backend epsilon instead.
    precision_value = true_positives / (predicted_positives + K.epsilon())
    recall_value = true_positives / (actual_positives + K.epsilon())
    return 2 * (precision_value * recall_value) / (precision_value + recall_value + K.epsilon())


def precision(y_true, y_pred):
    '''
    Precision metric.

    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model

    @Return
        precision: rounded true positives divided by rounded predicted
        positives (0 when nothing is predicted positive)
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # Offset the denominator by the backend epsilon so a batch without
    # predicted positives yields 0 instead of a division by zero.
    return true_positives / (predicted_positives + K.epsilon())

def acc(y_true, y_pred):
    '''
    Accuracy metric: mean agreement between the projected argmax of the
    labels and of the predictions.
    '''
    def project(labels):
        # argmax along axis 1, cast to float64, then multiplied by the column
        # [[0], [1]].
        # NOTE(review): the matmul requires the argmax result to have 2 as its
        # last dimension - confirm the expected input shape with the caller.
        selector = tf.constant([[0], [1]], dtype=tf.float64)
        picked = tf.cast(tf.argmax(labels, 1), tf.float64)
        return tf.matmul(picked, selector)

    expected = project(y_true)
    predicted = project(y_pred)
    agreement = tf.cast(tf.equal(expected, predicted), tf.float32)
    return tf.reduce_mean(agreement)

def my_loss(y_true, y_pred):
    """Loss: negative mean of ``y_true * log(y_pred)`` over all elements."""
    weighted_log = y_true * tf.log(y_pred)
    return -tf.reduce_mean(weighted_log)

def print_metrics(history):
    '''
    Plot how each metric evolved over the training epochs.

    Draws a 2x2 grid (loss, f1, precision, recall) with the training curve as
    dots and the validation curve as a line, then shows the figure.

    @Args:
        history: training history object; ``history.history`` must contain
                 'loss', 'f1_score', 'precision', 'recall' and their
                 'val_'-prefixed counterparts
    '''
    import matplotlib.pyplot as plt

    # (history key, label used in legend/title, y-axis label)
    panels = (
        ('loss', 'loss', 'Loss'),
        ('f1_score', 'f1', 'F1'),
        ('precision', 'precision', 'Precision'),
        ('recall', 'recall', 'Recall'),
    )
    epochs = range(1, len(history.history['loss']) + 1)
    for position, (key, label, axis_name) in enumerate(panels, start=1):
        train_values = history.history[key]
        val_values = history.history['val_' + key]
        plt.subplot(2, 2, position)
        plt.plot(epochs, train_values, 'bo', label='Training ' + label)
        plt.plot(epochs, val_values, 'b', label='Validation ' + label)
        plt.title('Training and validation ' + label)
        plt.xlabel('Epochs')
        plt.ylabel(axis_name)
        plt.legend()

    plt.show()

# JavaScript helper library kept as text (presumably injected into a page
# through the browser's execute_script - confirm with the callers). It holds
# DOM lookup helpers, a small Set class, xpath generation, removal-pattern
# collection and the pagination-button detection/clustering routines.
# Raw string: the regular expressions inside contain backslash sequences that
# are not valid Python escapes.
scripts_common = r'''
document.getElementsByClassName = function (Name,e,tag) {
            var ele = [],
                allEle,
                length,
                i = 0;

            if (typeof tag === "undefined" ){
                tag = "*"
            }

            if (typeof e === "undefined"){
                e = document;
            }

            allEle = e.getElementsByTagName(tag);

            for (length = allEle.length;i < length;i = i + 1){
                if (allEle[i].className === Name) {
                    ele.push(allEle[i]);
                }
            }

            return ele;
        }

document.countElementById = function (id,e,tag) {
            var ele = [],
                allEle,
                length,
                i = 0;

            if (typeof tag === "undefined" ){
                tag = "*"
            }

            if (typeof e === "undefined"){
                e = document;
            }

            allEle = e.getElementsByTagName(tag);

            for (length = allEle.length;i < length;i = i + 1){
                if (allEle[i].id === id) {
                    ele.push(allEle[i]);
                }
            }

            return ele;
        }

/*js集合set类的实现*/
function Set() {
    this.dataStore = [];
    this.add = add;//新增元素
    this.remove = remove;//删除元素
    this.size = size;//集合的元素个数
    this.union = union;//求并集
    this.contains = contains;//判断一个集合中是否包含某个元素
    this.intersect = intersect;//交集
    this.subset = subset;//判断一个集合是否是另一个的子集
    this.difference = difference;//求补集
    this.show = show;//将集合元素显示出来
}

function add(data) {
    if (this.dataStore.indexOf(data) < 0) {
        this.dataStore.push(data);
        return true;
    }
    else {
        return false;
    }
}

function remove(data) {
    var pos = this.dataStore.indexOf(data);
    if (pos > -1) {
        this.dataStore.splice(pos,1);
        return true;
    }
    else {
        return false;
    }
}

function size() {
    return this.dataStore.length;
}

function show() {
    return "[" + this.dataStore + "]";
}

function contains(data) {
    if (this.dataStore.indexOf(data) > -1) {
        return true;
    }
    else {
        return false;
    }
}

function union(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        tempSet.add(this.dataStore[i]);
    }
    for (var i = 0; i < set.dataStore.length; ++i) {
        if (!tempSet.contains(set.dataStore[i])) {
            tempSet.dataStore.push(set.dataStore[i]);
        }
    }
    return tempSet;
}

function intersect(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (set.contains(this.dataStore[i])) {
            tempSet.add(this.dataStore[i]);
        }
    }
    return tempSet;
}

function subset(set) {
    if (this.size() > set.size()) {
        return false;
    }
    else {
        // iterate over the stored values (for..in would yield array indices)
        for (var i = 0; i < this.dataStore.length; ++i) {
            if (!set.contains(this.dataStore[i])) {
                return false;
            }
        }
    }
    return true;
}

function difference(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (!set.contains(this.dataStore[i])) {
            tempSet.add(this.dataStore[i]);
        }
    }
    return tempSet;
}

function check(node,set_url){
    if(node.nodeType!=1){
        return false;
    }
    var label_flag = false;
    var list_a = node.getElementsByTagName("a");

    if(list_a.length==set_url.size()){
        return true;
    }else{
        return false;
    }

}

function getRemoveList(node,recurse,list_remove){
    var pattern = /(上一?篇|下一?篇|作者|点击数|发布时间|发布日期|更新日期|更新时间|字体|字号|来源|阅读次?数|浏览次?数|点击次?数|本站编辑|编辑人|关键字|上一条|下一条)|(打印|关闭窗口|回到顶部|现在的位置|首页|分享)/

    if(node.childNodes==null || node.childNodes.length<=0){
        return;
    }
    for(var i=0;i<node.childNodes.length;i++){
        _child = node.childNodes[i];
        if(_child.nodeType==3){
            _match = _child.textContent.toString().match(pattern);
            if(_match!=null){
                if(_match[1]!=null){
                    if(node.textContent.toString().trim().length-_match[1].length<3){
                      _soup = node.parentNode.tagName.toLowerCase()+":contains("+_match[0]+")";
                    }else{
                        _soup = node.tagName.toLowerCase()+":contains("+_match[0]+")";
                    }
                }else{
                    _soup = node.tagName.toLowerCase()+":contains("+_match[0]+")";
                }
                list_remove.push(_soup);
            }
        }
        if(_child.nodeType==1 && recurse){
            getRemoveList(_child,recurse,list_remove)
        }
    }
}

function getListXpath(el,list_xpath,getRemove){
    if (el==document || el==document.body){
        return list_xpath;
    }
    if(getRemove){
        _array = new Array();
        getRemoveList(el,true,_array);
        list_xpath.push([getXpath(el),_array])
    }else{
        list_xpath.push(getXpath(el))
    }

    return getListXpath(el.parentNode,list_xpath,getRemove);
}
function getXpath(el,b,notfirst){
    if (el.id !="" && document.countElementById(el.id).length==1){
        var _jump_flag = false;
        if(b!=null){
            for(var i=0;i<b.length;i++){
                if(el.tagName.toLowerCase()==b[i]){
                    _jump_flag = true;
                }
            }
        }
        if(notfirst){
            _jump_flag = true;
        }
        if(!_jump_flag){
            //return '//*[@id="'+el.id+'"]';
            return '//'+el.tagName.toLowerCase()+'[@id="'+el.id+'"]';
        }

    }

    if (el.getAttribute("class")!=null && document.getElementsByClassName(el.getAttribute("class")).length==1){
        if(!notfirst){
            //return '//*[@class="'+el.getAttribute("class")+'"]';
            return '//'+el.tagName.toLowerCase()+'[@class="'+el.getAttribute("class")+'"]';
        }

    }

    if (el==document.body){
        return '/html/'+el.tagName.toLowerCase();
    }


    var ix = 1;
    siblings = el.parentNode.childNodes;

    for (var i=0,l=siblings.length;i<l;i++){
        var sibling = siblings[i];
        if (sibling==el){
            if(ix>1 || (ix==1 && i+1<siblings.length && siblings[i+1].tagName==el.tagName)){
                return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
            }else{
                return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase();
            }
            //return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
        }else if (sibling.tagName==el.tagName){
            ix++;
        }
    }

}


function getJsoup(node){
    var _nodeName = node.tagName.toLowerCase();
    var _nodeText = node.innerText;
    if(_nodeText==null || _nodeText==""){
        return null;
    }
    var counts = 0;
    var list_node = document.getElementsByTagName(_nodeName);
    for(var i=0;i<list_node.length;i++){
        var _node = list_node[i];
        if(_node.innerText!=null && _node.innerText.indexOf(_nodeText)>=0){
            counts += 1;
        }
    }
    if(counts!=1){
        return null;
    }
    var jsoup = _nodeName+':contains('+_nodeText.trim()+')';
    return jsoup;
}
function getOffsetLeft(el){
     return el.offsetParent
      ? el.offsetLeft + getOffsetLeft(el.offsetParent)
      : el.offsetLeft;
}
function getOffsetTop(el){
     return el.offsetParent
      ? el.offsetTop + getOffsetTop(el.offsetParent)
      : el.offsetTop;
}

function search_pageBt(node,type,list_hitTag,pattern_page){
    var find_flag = false;
    if(node!=null && node.nodeName.toLowerCase()=="a"){
        list_hitTag.push([node,type,getOffsetLeft(node),getOffsetTop(node)])
    }else{
        if(node.childNodes==null){

        }else{
            for(var i=0;i<node.childNodes.length;i++){
                child = node.childNodes[i];
                if(child!=null && child.tagName !=null && (child.tagName.toLowerCase()=="script" || child.tagName.toLowerCase()=="select")){
                    continue;
                }
                child_innerText = child.innerText;
                if(child_innerText!=null){
                    _match = child_innerText.match(pattern_page)
                    if(_match!=null){
                       var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
                        search_pageBt(child,_type,list_hitTag,pattern_page);
                        find_flag = true;

                    }
                }
            }
        }
        if(!find_flag){
            list_hitTag.push([node,type,getOffsetLeft(node),getOffsetTop(node)]);
        }
    }

}

//对命中的标签进行聚类
function clustering(list_hitTag){
    var list_cluster = new Array();
    for(var i=0;i<list_hitTag.length;i++){
        var _find_flag = false;
        for(var j=0;j<list_cluster.length;j++){
            if(Math.abs(list_cluster[j][1]-list_hitTag[i][3])<20){
                list_cluster[j][2].push([list_hitTag[i][0],list_hitTag[i][1]]);
                if(list_hitTag[i][0].tagName.toLowerCase()=="a" || list_hitTag[i][0].onclick!=null){
                    list_cluster[j][3] += 1;
                }
                _find_flag = true;
            }
        }
        if(!_find_flag){
            var _click_num = 0;
            if(list_hitTag[i][0].tagName.toLowerCase()=="a" || list_hitTag[i][0].onclick!=null){
                _click_num = 1;
            }
            list_cluster.push([list_hitTag[i][2],list_hitTag[i][3],[[list_hitTag[i][0],list_hitTag[i][1]]],_click_num]);
        }
    }
    var _list_max_cluster = new Array();
    var _max = 0;
    for(var k=0;k<list_cluster.length;k++){
        _prob = list_cluster[k][2].length*0.5+list_cluster[k][3]*0;
        if(_prob>_max){
            _max = _prob;
            _list_max_cluster = list_cluster[k][2];
        }
    }
    return _list_max_cluster;
}

function clustering_turnPage(){
    //var pattern_page = /((?<nextPage>下一?页|>>|>)|(?<lastPage>上一?页|<<|<)|(?<firstPage>首页|第一页)|(?<tailPage>尾页)|(?<other>\.{1,2}|共\d[条页]|\d+\/\d+))/ //phantomjs不支持命名分组
    var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?\s*$|(^\s*\.{1,2}\s*$|^.{0,10}共\s*\d+\s*[条页].{0,10}$|^.{0,10}\d+\/\d+.{0,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
    var pattern_nextPage = /[Nn]ext/
    var list_hitTag = new Array();

    //search_pageBt(document,"other",list_hitTag,pattern_page)
    for(var i=0;i<document.all.length;i++){
        var node = document.all[i];
        if(!((getOffsetLeft(node)>0 && getOffsetTop(node)>0))){
            continue;
        }
        if(node.tagName.toLowerCase()=="script"){
            continue;
        }
        var _value = node.getAttribute("value");
        if(_value==null){
            _value = "";
        }
        var _title = node.getAttribute("title");
        if(_title==null){
            _title = "";
        }
        var _text = "";
        if(node!=null && node.innerText!=null){
            _text = node.innerText;
        }


        if (_text!=null && _text!="" && node.tagName.toLowerCase()!="option"){
            _match = _text.match(pattern_page)
            if(_match!=null){
                var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
                list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
            }
        }else if (_value!=null && _value!="" && node.tagName.toLowerCase()!="option"){
            _match = _value.match(pattern_page)
            if(_match!=null){
                var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
                list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
            }
        }else if (_title!=null && _title!=""){
            _match = _title.match(pattern_page)
            if(_match!=null){
                var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
                list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
            }
        }else if(node!=null && node.getAttribute("class")!=null && node.getAttribute("class").match(pattern_nextPage)!=null){
            list_hitTag.push([node,"nextPage",getOffsetLeft(node),getOffsetTop(node)]);
        }

    }

    var _find = false;
    for(var i=0;i<list_hitTag.length;i++){
        if(list_hitTag[i][0].innerText==">"){
            _find = true;
        }
    }
    if(_find){
        for(var i=0;i<list_hitTag.length;i++){
            if(list_hitTag[i][0].innerText==">>"){
                list_hitTag[i][1] = "tailPage"
            }
        }
    }

    list_cluster = clustering(list_hitTag);
    return list_cluster;
}


function findElements_byXpath(STR_XPATH) {
    var xresult = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var xnodes = [];
    var xres;
    while (xres = xresult.iterateNext()) {
        xnodes.push(xres);
    }

    return xnodes;
}
'''
# JavaScript snippet kept as text. It rewrites the xpath passed as
# arguments[0]: walking from the last path segment backwards, an indexed
# segment whose node is unique and carries at least two layout attributes
# (height, width, align, ...) is replaced by an attribute-based anchor when
# that anchor selects exactly one node. The script ends with a top-level
# ``return``, so it is meant to run as a function body.
scripts_replaceXpath = '''
function findElements_byXpath(STR_XPATH) {
    var xresult = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var xnodes = [];
    var xres;
    while (xres = xresult.iterateNext()) {
        xnodes.push(xres);
    }

    return xnodes;
}

function replaceXpath(_xpath){
    var list_path = _xpath.split("/");
    var _replaced_xpath = "";
    var aim_att = ["height","width","align","valign","border","bgcolor","style"]
    for(var i=list_path.length-1;i>=0;i--){
        var _path = list_path[i];
        if(_path.indexOf("]")>=0){
            if(_path.indexOf("@")>=0){
                _replaced_xpath = "//"+_path;
                return _xpath;
            }else if(_path=="html"){
                return _xpath;
            }else{
                // declared with var so they do not leak into the page's globals
                var _temp_xpath = list_path.slice(0,i+1).join("/")
                var _temp_nodes = findElements_byXpath(_temp_xpath)
                if(_temp_nodes.length==1){
                    var _count = 0;
                    var gen_xpath = "";
                    for(var j=0;j<_temp_nodes[0].attributes.length;j++){
                        var _att = _temp_nodes[0].attributes[j];
                        var _head = _att.name
                        if(aim_att.indexOf(_head)>=0){
                            _count += 1;
                            if(gen_xpath==""){
                                gen_xpath = "//"+_temp_nodes[0].tagName.toLowerCase()+"[@"+_att.name+'=\"'+_att.value+'\"]';
                            }else{
                                gen_xpath = gen_xpath+"[@"+_att.name+'=\"'+_att.value+'\"]';
                            }
                        }
                    }
                    if(_count>=2){
                        var _find_nodes = findElements_byXpath(gen_xpath);
                        if(_find_nodes.length==1){
                            return gen_xpath+_replaced_xpath
                        }else{
                            _replaced_xpath = "/"+_path + _replaced_xpath;
                        }
                    }else{
                        _replaced_xpath = "/"+_path + _replaced_xpath;
                    }
                }else{
                    _replaced_xpath = "/"+_path + _replaced_xpath;
                }
            }
        }else{
            if(_path!=""){
                _replaced_xpath = "/"+_path + _replaced_xpath;
            }
        }
    }
    return _replaced_xpath;
}
return replaceXpath(arguments[0]);
'''