12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092 |
- # -*- coding: utf-8 -*-
- '''
- Created on 2018年12月20日
- @author: User
- '''
- import numpy as np
- import re
- import gensim
- from keras import backend as K
- import ctypes
- import inspect
- w2vfile = "../wiki_128_word_embedding_new.vector"
- model_w2v = None
- from decimal import Decimal
- import logging
- #logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- import pickle
- import tensorflow as tf
- from keras import losses
- import threading
- __author__ = 'baniu.yao'
class MyThread(threading.Thread):
    """Thread that runs ``func(*args)`` and keeps the return value.

    Attributes:
        func: callable executed in the thread.
        args: positional arguments passed to ``func``.
        result: return value of ``func``; ``None`` until ``run`` completes
            (and also if ``func`` raised or was interrupted).
        exception: the exception raised by ``func``, or ``None``.
    """

    def __init__(self, func, args=()):
        super(MyThread, self).__init__()
        self.func = func
        self.args = args
        # Initialised up front so get_result() is well defined even when the
        # thread timed out or failed before assigning a value.
        self.result = None
        self.exception = None

    def run(self):
        """Execute the wrapped callable and store its return value."""
        try:
            self.result = self.func(*self.args)
        except BaseException as exc:
            # Remember the failure for callers, then let threading report it
            # exactly as it did before.
            self.exception = exc
            raise

    def get_result(self):
        """Return the stored result, or ``None`` when none is available."""
        return self.result
def get_js_rs(browser, script, *arg, timeout=20):
    """Run a script in the browser and return its result, with a timeout.

    :param browser: browser object; its ``execute_script`` method is called
    :param script: script source passed to ``execute_script``
    :param arg: extra positional arguments forwarded to ``execute_script``
    :param timeout: seconds to wait before giving up
    :return: the script result, or ``None`` on timeout
    """
    def execute_js():
        return browser.execute_script(script, *arg)

    t = MyThread(func=execute_js, args=())
    # ``setDaemon``/``isAlive`` are deprecated/removed (isAlive is gone since
    # Python 3.9); use the attribute and ``is_alive`` instead.
    t.daemon = True
    t.start()
    t.join(timeout)
    if t.is_alive():
        print('执行js超时')
        stop_thread(t)
        return None
    return t.get_result()
- import time
def thread_run(func, *arg, timeout=30):
    """Run ``func(*arg)`` in a daemon thread and wait up to ``timeout`` seconds.

    :param func: callable to execute
    :param arg: positional arguments forwarded to ``func``
    :param timeout: seconds to wait for completion
    :return: the return value of ``func``, or ``None`` if it did not finish
        in time (the thread is left running as a daemon)
    """
    t = MyThread(func=func, args=(*arg,))
    # ``setDaemon``/``isAlive`` are deprecated/removed (isAlive is gone since
    # Python 3.9); use the attribute and ``is_alive`` instead.
    t.daemon = True
    t.start()
    t.join(timeout)
    if t.is_alive():
        print('thread_run time out')
    return t.get_result()
def xpath2css(xpath):
    """Convert a simple XPath into a CSS selector path.

    ``//`` and ``@`` are dropped, ``/`` becomes ``>`` and every positional
    predicate ``[n]`` becomes ``:nth-child(n)``.

    :param xpath: XPath string
    :return: CSS selector string (empty input yields an empty string)
    """
    # NOTE(review): XPath ``tag[n]`` counts same-tag siblings, which is closer
    # to ``:nth-of-type`` than ``:nth-child``; kept as-is for compatibility.
    css = xpath.replace('//', '').replace('@', '').replace('/', '>')
    # ``\d+`` so that indexes of 10 and above are converted too.
    css = re.sub(r'\[(\d+)\]', r':nth-child(\1)', css)
    if css.startswith('>'):
        css = css[1:]
    return css
def get_class_from_frame(fr):
    """Return the class of ``self`` for a frame that belongs to a method.

    :param fr: frame object (e.g. from ``inspect.currentframe()``)
    :return: the class of the frame's ``self`` argument, or ``None`` when the
        first argument of the frame is not named ``self``
    """
    args, _, _, value_dict = inspect.getargvalues(fr)
    if args and args[0] == 'self':
        instance = value_dict.get('self')
        # Identity check: a truthiness test would wrongly reject instances
        # that evaluate as false (empty containers, zero-like objects).
        if instance is not None:
            return getattr(instance, '__class__', None)
    return None
class CLog(object):
    """File logger that appends the current call chain to every record.

    All instances share the ``"single_server"`` logger; each record carries a
    ``chain`` field built from the Python call stack.
    """

    def __init__(self, log_file_path='./test.log'):
        """Configure logging and attach a UTF-8 file handler.

        :param log_file_path: path of the log file to append to
        """
        import os  # local import: only needed for handler de-duplication

        logging.basicConfig(level=logging.INFO, filemode='a', format='%(asctime)s %(message)s')
        self.logger = logging.getLogger("single_server")
        self.logger.setLevel(logging.DEBUG)

        # The logger is shared process-wide, so creating a second CLog for the
        # same file must not add a second handler (that duplicated every line).
        target = os.path.abspath(log_file_path)
        already_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target
            for handler in self.logger.handlers
        )
        if not already_attached:
            console = logging.FileHandler(log_file_path, encoding="UTF8")
            formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s [%(chain)s] %(thread)s %(threadName)s')
            console.setFormatter(formatter)
            self.logger.addHandler(console)

    def get_file_name_in_full_path(self, file_path):
        """Return the last ``/``-separated component of ``file_path``."""
        return file_path.split('/')[-1]

    def get_meta_data(self):
        """Describe the current call stack as ``outer(args) --> inner(args)``.

        For each frame the function name is paired with the text found between
        parentheses on the caller's source line. The two innermost entries
        (this method and the logging method that called it) are dropped.

        :return: the call chain as a single string
        """
        frames = inspect.stack()
        chain_list = []
        for i in range(0, len(frames) - 1):
            func_name = frames[i][3]
            try:
                # frames[i + 1][-2] is the caller's code context (may be None).
                args = re.findall(r'\((.*)\)', frames[i + 1][-2][0])[0]
            except (IndexError, TypeError):
                args = ""
            chain_list.append('%s(%s)' % (func_name, args))
        chain_list.reverse()
        return ' --> '.join(chain_list[:-2])

    def info(self, message):
        """Log ``message`` at INFO level together with the call chain."""
        chain = self.get_meta_data()
        self.logger.info(message, extra={"chain": chain})

    def error(self, message):
        """Log ``message`` at ERROR level together with the call chain."""
        chain = self.get_meta_data()
        self.logger.error(message, extra={"chain": chain})

    def debug(self, message):
        """Log ``message`` at DEBUG level together with the call chain."""
        chain = self.get_meta_data()
        self.logger.debug(message, extra={"chain": chain})
-
def add_err_msg(_dict, msg):
    """Append ``msg`` to ``_dict["err_msg"]`` unless it is already contained.

    :param _dict: dictionary updated in place
    :param msg: error text to record
    """
    _key = "err_msg"
    if _key not in _dict:
        _dict[_key] = msg
    # Plain substring test: the message is text, not a regular expression, so
    # characters such as "(" or "." must not be interpreted as a pattern.
    elif msg not in _dict[_key]:
        _dict[_key] = _dict[_key] + msg
def _async_raise(tid, exctype):
    """Ask the interpreter to raise ``exctype`` inside the thread ``tid``.

    :param tid: identifier of the target thread
    :param exctype: exception class, or an instance whose type is used
    :raises ValueError: when no thread state was affected
    :raises SystemError: when more than one thread state was affected
        (the request is cancelled first)
    """
    thread_id = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    set_async_exc = ctypes.pythonapi.PyThreadState_SetAsyncExc
    affected = set_async_exc(thread_id, ctypes.py_object(exctype))
    if affected == 1:
        return
    if affected == 0:
        raise ValueError("invalid thread id")
    # More than one thread state was modified: undo the request.
    set_async_exc(thread_id, None)
    raise SystemError("PyThreadState_SetAsyncExc failed")
def stop_thread(thread):
    """Request termination of ``thread`` by raising ``SystemExit`` inside it.

    Does nothing when the thread is not running (never started or already
    finished), because there is no live thread state to interrupt.

    :param thread: ``threading.Thread`` instance to stop
    """
    if not thread.is_alive():
        return
    _async_raise(thread.ident, SystemExit)
# Module-wide logger instance used by the helper functions below.
_log = CLog()


def log(msg):
    """Write ``msg`` to the module logger at INFO level."""
    _log.info(msg)


def error(msg):
    """Write ``msg`` to the module logger at ERROR level."""
    _log.error(msg)


def debug(msg):
    """Write ``msg`` to the module logger at DEBUG level."""
    _log.debug(msg)
def save(object_to_save, path):
    """Pickle an object to a file.

    @Args:
        object_to_save: object to serialise
        path: destination file path
    @Return:
        the path the object was written to
    """
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)
    # The documented contract is to return the saved path; previously nothing
    # was returned.
    return path
def load(path):
    """Read a pickled object back from a file.

    @Args:
        path: file path to read
    @Return:
        the unpickled object
    """
    # NOTE(review): pickle executes arbitrary code while loading; only use
    # this on files from a trusted source.
    with open(path, 'rb') as source:
        return pickle.load(source)
def find_index(list_tofind, text):
    """Locate the first occurrence of each word in a string.

    @param:
        list_tofind: words to look for
        text: string to search in
    @return: list with, for every word, the index of its first occurrence
        in ``text`` or -1 when it does not occur
    """
    # str.find already yields -1 for a miss.
    return [text.find(word) for word in list_tofind]
def combine(list1, list2):
    """Concatenate every item of ``list1`` with every item of ``list2``.

    @param:
        list1: list of items (converted with ``str``)
        list2: list of items (converted with ``str``)
    @return: list of concatenated strings, ordered by ``list1`` then ``list2``
    """
    return [str(left) + str(right) for left in list1 for right in list2]
def getDigitsDic(unit):
    """Return the integer value of a Chinese digit character.

    Both the formal (financial) and the everyday digit characters are known.

    :param unit: a single digit character
    :return: integer 0-9, or ``None`` for any other key
    """
    formal = "零壹贰叁肆伍陆柒捌玖"
    plain = "〇一二三四五六七八九"
    table = {}
    for value, (formal_char, plain_char) in enumerate(zip(formal, plain)):
        table[formal_char] = value
        table[plain_char] = value
    return table.get(unit)
def getMultipleFactor(unit):
    """Return the multiplier that a Chinese magnitude/currency unit stands for.

    :param unit: a single unit character
    :return: ``Decimal`` multiplier, or ``None`` for an unknown unit
    """
    by_value = (
        ("兆", "1000000000000"),
        ("亿", "100000000"),
        ("万", "10000"),
        ("仟千", "1000"),
        ("佰百", "100"),
        ("拾十", "10"),
        ("元", "1"),
        ("角", "0.1"),
        ("分", "0.01"),
    )
    table = {}
    for characters, literal in by_value:
        for character in characters:
            table[character] = Decimal(literal)
    return table.get(unit)
def getUnifyMoney(money):
    """Convert a (Chinese) money string into a numeric amount.

    The string is split at the last occurrence of its largest unit; the part
    before the unit is multiplied by the unit's factor and the part after it
    is added, both parts being converted recursively.

    Fixes over the previous version:
      * a remainder such as ``"5000元"`` no longer crashes in ``Decimal()``;
      * the result is always a ``Decimal`` (a lone digit used to return int);
      * the numeric check no longer uses a nested quantifier that backtracks
        exponentially on long digit strings;
      * everyday digits/units (一二三, 千百十) and leading 零 are understood,
        matching what getDigitsDic/getMultipleFactor already support;
      * a bare 十/拾 prefix counts as ten ("十五" -> 15).

    @param:
        money: money string, e.g. "壹万零伍佰元", "3万5000元" or "1,234.5"
    @return: Decimal amount; text that cannot be interpreted contributes 0
    """
    # Remove thousands separators (ASCII and full-width commas) and leading
    # zero characters, which carry no value.
    money = re.sub("[,,]", "", money).lstrip("零〇")

    if re.search(r"^\d+(\.\d+)?$", money) is not None:
        return Decimal(money)
    if len(money) == 1:
        digit = getDigitsDic(money)
        if digit is not None:
            return Decimal(digit)

    # Ordered from the largest unit to the smallest.
    factor_units = ["兆", "亿", "万", "仟", "千", "佰", "百", "拾", "十", "元", "角", "分"]
    for factor_unit in factor_units:
        if factor_unit not in money:
            continue
        head, _, tail = money.rpartition(factor_unit)
        if not head and factor_unit in ("拾", "十"):
            count = Decimal(1)
        else:
            count = getUnifyMoney(head)
        return count * getMultipleFactor(factor_unit) + getUnifyMoney(tail)
    return Decimal(0)
def mergeDict(list_dict):
    """Merge a list of result dictionaries into one summary dictionary.

    Later dictionaries overwrite earlier values for ordinary keys. Special
    keys:
      * ``flag``: becomes the last falsy value seen; a ``None`` entry in the
        list also sets it to False; otherwise True.
      * ``hasDrew``: logical OR of all values.
      * ``err_msg``: concatenation of all values.
      * ``count_rules``: number of keys (other than flag/success/count_rules)
        whose merged value is neither "" nor None.

    :param list_dict: list of dictionaries (entries may be ``None``)
    :return: the merged dictionary
    """
    merged = dict()
    all_ok = True
    drew = False
    messages = ""
    for entry in list_dict:
        if entry is None:
            all_ok = False
            continue
        for name, value in entry.items():
            if name == "flag":
                if not value:
                    all_ok = value
                continue
            if name == "err_msg":
                messages += value
            merged[name] = value
            if name == "hasDrew":
                drew = drew or value
    merged["flag"] = all_ok
    merged["hasDrew"] = drew
    merged["err_msg"] = messages

    not_counted = ["flag", "success", "count_rules"]
    merged["count_rules"] = sum(
        1
        for name, value in merged.items()
        if name not in not_counted and value != "" and value is not None
    )
    return merged
def getCommonXpath(list_xpaths, on_value=0.6):
    """Pick the deepest path element shared by enough of the given paths.

    For every position, the element of the first path that is long enough is
    taken as the reference; when the share of paths (out of all paths) that
    have the same element at that position reaches ``on_value`` the reference
    becomes the current answer.

    :param list_xpaths: list of indexable paths
    :param on_value: required share of agreeing paths
    :return: the last reference that reached the threshold, or ``None``
    """
    if len(list_xpaths) == 0:
        return None
    total = len(list_xpaths)
    deepest = max([len(path) for path in list_xpaths])
    common = None
    for level in range(deepest):
        reference = None
        votes = 0
        for path in list_xpaths:
            if level >= len(path):
                continue
            if reference is None:
                reference = path[level]
            if path[level] == reference:
                votes += 1
        if votes / total >= on_value:
            common = reference
    return common
def getModel_w2v():
    """Return the word-vector model, loading it from ``w2vfile`` on first use.

    The loaded model is kept in the module-level ``model_w2v`` variable so the
    file is only read once.
    """
    global model_w2v
    if model_w2v is not None:
        return model_w2v
    model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2vfile, binary=True)
    return model_w2v
def findAllIndex(substr, wholestr):
    """Find the start index of every non-overlapping occurrence of a substring.

    @param:
        substr: substring to look for
        wholestr: string to search in
    @return: list of start indexes, in ascending order; empty when ``substr``
        is empty or does not occur
    """
    # An empty needle matches everywhere and used to loop forever.
    if not substr:
        return []
    result = []
    step = len(substr)
    # Search with a moving start offset instead of re-slicing the string.
    index = wholestr.find(substr)
    while index >= 0:
        result.append(index)
        index = wholestr.find(substr, index + step)
    return result
-
-
def spanWindow(tokens, begin_index, end_index, size):
    """Return the tokens surrounding an entity.

    @param:
        tokens: list of tokens of the sentence
        begin_index: index of the entity's first token
        end_index: index of the entity's last token
        size: number of tokens to take on each side
    @return: two-element list ``[left_context, right_context]``; the entity's
        own tokens are not included
    """
    left_start = max(begin_index - size, 0)
    right_stop = min(end_index + size + 1, len(tokens))
    left_context = tokens[left_start:begin_index]
    right_context = tokens[end_index + 1:right_stop]
    return [left_context, right_context]
def embedding(datas, shape):
    """Look up word vectors for sequences of words.

    @param:
        datas: list of word lists
        shape: shape of the result; ``shape[0]`` rows are filled at most and
            ``shape[1]`` words are used per row
    @return: array of the given shape; positions of unknown words and unused
        positions stay zero
    """
    model_w2v = getModel_w2v()
    embed = np.zeros(shape)
    max_rows = shape[0]
    length = shape[1]
    for out_index, data in enumerate(datas):
        # Do not write past the requested number of rows.
        if out_index >= max_rows:
            break
        for index, item in enumerate(data):
            if index >= length:
                break
            item_not_space = re.sub(r"\s+", "", item)
            # ``in model`` works on both old and new gensim versions; the
            # ``.vocab`` attribute no longer exists in gensim 4.
            if item_not_space in model_w2v:
                embed[out_index][index] = model_w2v[item_not_space]
    return embed
def partMoney(entity_text, input2_shape = [7]):
    """Bucket an amount by order of magnitude and one-hot encode the bucket.

    Buckets: <100, <1000, <10000, <100000, <1000000, <10000000, everything
    else.

    @param:
        entity_text: numeric amount (anything ``float()`` accepts)
        input2_shape: shape of the returned array
    @return: array with a single 1 at the bucket index
    """
    money = float(entity_text)
    parts = np.zeros(input2_shape)
    upper_bounds = (100, 1000, 10000, 100000, 1000000, 10000000)
    for slot, upper in enumerate(upper_bounds):
        if money < upper:
            break
    else:
        # No bound was larger than the amount: last bucket.
        slot = len(upper_bounds)
    parts[slot] = 1
    return parts
def recall(y_true, y_pred):
    """Recall metric.

    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        recall as a backend tensor
    """
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    # A Python ``if c3 == 0`` cannot inspect a symbolic tensor's value; adding
    # epsilon avoids the division by zero inside the graph instead.
    return c1 / (c3 + K.epsilon())
def f1_score(y_true, y_pred):
    """F1 metric (harmonic mean of precision and recall).

    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        F1 value as a backend tensor
    """
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    # Epsilon keeps every division defined when a count is zero; a Python
    # ``if c3 == 0`` cannot inspect a symbolic tensor's value.
    eps = K.epsilon()
    precision_value = c1 / (c2 + eps)
    recall_value = c1 / (c3 + eps)
    return 2 * (precision_value * recall_value) / (precision_value + recall_value + eps)
def precision(y_true, y_pred):
    """Precision metric.

    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        precision as a backend tensor
    """
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # Epsilon avoids NaN when nothing is predicted positive.
    return c1 / (c2 + K.epsilon())
def acc(y_true, y_pred):
    """Accuracy metric: share of positions where both projections agree.

    ``argmax`` is taken along axis 1 for both inputs and the result is
    multiplied with the column vector ``[[0], [1]]`` before comparison.
    NOTE(review): the matmul needs the argmax output to have a last
    dimension of 2 - confirm the expected input shape with the caller.
    """
    selector = tf.constant([[0], [1]], dtype=tf.float64)
    true_projection = tf.matmul(tf.cast(tf.argmax(y_true, 1), tf.float64), selector)
    pred_projection = tf.matmul(tf.cast(tf.argmax(y_pred, 1), tf.float64), selector)
    matches = tf.cast(tf.equal(true_projection, pred_projection), tf.float32)
    return tf.reduce_mean(matches)
def my_loss(y_true, y_pred):
    """Cross-entropy style loss: ``-mean(y_true * log(y_pred))``.

    :param y_true: ground-truth labels
    :param y_pred: predicted probabilities
    :return: scalar loss tensor
    """
    # ``tf.log`` no longer exists in TensorFlow 2; the Keras backend function
    # works with either version. Predictions are floored at epsilon so that
    # log(0) cannot turn the loss into NaN/inf.
    safe_pred = K.maximum(y_pred, K.epsilon())
    return -K.mean(y_true * K.log(safe_pred))
def print_metrics(history):
    """Plot the per-epoch training/validation curves of the tracked metrics.

    Draws a 2x2 grid with loss, F1, precision and recall and shows it.

    @Args:
        history: training history object; ``history.history`` must contain
            ``loss``, ``f1_score``, ``precision``, ``recall`` and their
            ``val_`` counterparts
    """
    import matplotlib.pyplot as plt

    records = history.history
    epochs = range(1, len(records['loss']) + 1)
    # (history key, name used in legend/title, y-axis label)
    panels = (
        ('loss', 'loss', 'Loss'),
        ('f1_score', 'f1', 'F1'),
        ('precision', 'precision', 'Precision'),
        ('recall', 'recall', 'Recall'),
    )
    for position, (key, name, axis_label) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(epochs, records[key], 'bo', label='Training %s' % name)
        plt.plot(epochs, records['val_' + key], 'b', label='Validation %s' % name)
        plt.title('Training and validation %s' % name)
        plt.xlabel('Epochs')
        plt.ylabel(axis_label)
        plt.legend()
    plt.show()
- scripts_common = '''
// Replacement for document.getElementsByClassName: returns an array of the
// elements below `e` (default: document) with tag `tag` (default: any tag)
// whose className is exactly `Name`.
document.getElementsByClassName = function (Name, e, tag) {
    var tagName = (typeof tag === "undefined") ? "*" : tag;
    var scope = (typeof e === "undefined") ? document : e;
    var candidates = scope.getElementsByTagName(tagName);
    var matched = [];
    for (var idx = 0; idx < candidates.length; idx += 1) {
        if (candidates[idx].className === Name) {
            matched.push(candidates[idx]);
        }
    }
    return matched;
};
// Returns an array of ALL elements below `e` (default: document) with tag
// `tag` (default: any tag) whose id equals `id`; callers use its length to
// detect duplicated ids.
document.countElementById = function (id, e, tag) {
    var tagName = (typeof tag === "undefined") ? "*" : tag;
    var scope = (typeof e === "undefined") ? document : e;
    var candidates = scope.getElementsByTagName(tagName);
    var matched = [];
    for (var idx = 0; idx < candidates.length; idx += 1) {
        if (candidates[idx].id === id) {
            matched.push(candidates[idx]);
        }
    }
    return matched;
};
/* Array-backed set implementation (replaces any built-in Set in this scope). */
function Set() {
    this.dataStore = [];
    this.add = add;               // insert an element
    this.remove = remove;         // delete an element
    this.size = size;             // number of elements
    this.union = union;           // union with another set
    this.contains = contains;     // membership test
    this.intersect = intersect;   // intersection with another set
    this.subset = subset;         // is this set a subset of another one
    this.difference = difference; // elements not present in another set
    this.show = show;             // printable representation
}
// Adds data unless already present; returns whether it was added.
function add(data) {
    if (this.dataStore.indexOf(data) < 0) {
        this.dataStore.push(data);
        return true;
    }
    return false;
}
// Removes data if present; returns whether something was removed.
function remove(data) {
    var pos = this.dataStore.indexOf(data);
    if (pos > -1) {
        this.dataStore.splice(pos, 1);
        return true;
    }
    return false;
}
function size() {
    return this.dataStore.length;
}
function show() {
    return "[" + this.dataStore + "]";
}
function contains(data) {
    return this.dataStore.indexOf(data) > -1;
}
// Returns a new set holding the elements of both sets.
function union(set) {
    var tempSet = new Set();
    var i;
    for (i = 0; i < this.dataStore.length; ++i) {
        tempSet.add(this.dataStore[i]);
    }
    for (i = 0; i < set.dataStore.length; ++i) {
        tempSet.add(set.dataStore[i]);
    }
    return tempSet;
}
// Returns a new set holding the elements present in both sets.
function intersect(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (set.contains(this.dataStore[i])) {
            tempSet.add(this.dataStore[i]);
        }
    }
    return tempSet;
}
// Returns true when every element of this set is contained in `set`.
function subset(set) {
    if (this.size() > set.size()) {
        return false;
    }
    // Iterate over the values: a for..in loop would yield the array indexes
    // ("0", "1", ...) and compare those instead of the elements.
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (!set.contains(this.dataStore[i])) {
            return false;
        }
    }
    return true;
}
// Returns a new set holding the elements of this set that are not in `set`.
function difference(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (!set.contains(this.dataStore[i])) {
            tempSet.add(this.dataStore[i]);
        }
    }
    return tempSet;
}
// Returns true when `node` is an element whose number of descendant <a>
// tags equals the size of `set_url`.
function check(node, set_url) {
    if (node.nodeType != 1) {
        return false;
    }
    var anchors = node.getElementsByTagName("a");
    return anchors.length == set_url.size();
}
// Collects ":contains(...)" selectors for boilerplate text (previous/next
// article, author, print, share, ...) found in the text nodes below `node`.
//   node        - element whose child nodes are inspected
//   recurse     - when true, element children are inspected recursively
//   list_remove - output array; selectors are appended to it
function getRemoveList(node, recurse, list_remove) {
    var pattern = /(上一?篇|下一?篇|作者|点击数|发布时间|发布日期|更新日期|更新时间|字体|字号|来源|阅读次?数|浏览次?数|点击次?数|本站编辑|编辑人|关键字|上一条|下一条)|(打印|关闭窗口|回到顶部|现在的位置|首页|分享)/

    if (node.childNodes == null || node.childNodes.length <= 0) {
        return;
    }
    // All working variables are local: as implicit globals they were shared
    // between the recursive calls.
    for (var i = 0; i < node.childNodes.length; i++) {
        var _child = node.childNodes[i];
        if (_child.nodeType == 3) {
            var _match = _child.textContent.toString().match(pattern);
            if (_match != null) {
                // When a first-group keyword makes up (almost) the whole text
                // of the node, the selector is built on the parent's tag.
                var useParent = _match[1] != null &&
                    (node.textContent.toString().trim().length - _match[1].length < 3);
                var owner = useParent ? node.parentNode : node;
                var _soup = owner.tagName.toLowerCase() + ":contains(" + _match[0] + ")";
                list_remove.push(_soup);
            }
        }
        if (_child.nodeType == 1 && recurse) {
            getRemoveList(_child, recurse, list_remove)
        }
    }
}
// Walks from `el` up to (but excluding) document.body and appends one entry
// per ancestor to `list_xpath`: the element's xpath, or [xpath, removeList]
// when `getRemove` is set. Returns `list_xpath`.
function getListXpath(el, list_xpath, getRemove) {
    var current = el;
    // Also stop on a missing parent (detached node) instead of failing.
    while (current != null && current != document && current != document.body) {
        if (getRemove) {
            var removeList = new Array();
            getRemoveList(current, true, removeList);
            list_xpath.push([getXpath(current), removeList]);
        } else {
            list_xpath.push(getXpath(current));
        }
        current = current.parentNode;
    }
    return list_xpath;
}
// Builds an xpath for `el`.
//   b        - optional list of lower-case tag names that must not be
//              anchored by their id
//   notfirst - when truthy, `el` itself is never anchored by id or class
// A unique id or a unique class value anchors the path ("//tag[@id=...]");
// otherwise the path continues through the parent, adding a positional index
// where needed.
function getXpath(el, b, notfirst) {
    if (el.id != "" && document.countElementById(el.id).length == 1) {
        var skipIdAnchor = notfirst ? true : false;
        if (b != null) {
            for (var k = 0; k < b.length; k++) {
                if (el.tagName.toLowerCase() == b[k]) {
                    skipIdAnchor = true;
                }
            }
        }
        if (!skipIdAnchor) {
            return '//' + el.tagName.toLowerCase() + '[@id=\"' + el.id + '\"]';
        }
    }

    if (el.getAttribute("class") != null && document.getElementsByClassName(el.getAttribute("class")).length == 1) {
        if (!notfirst) {
            return '//' + el.tagName.toLowerCase() + '[@class=\"' + el.getAttribute("class") + '\"]';
        }
    }

    if (el == document.body) {
        return '/html/' + el.tagName.toLowerCase();
    }

    // Local variable: as an implicit global it was shared by all calls.
    var siblings = el.parentNode.childNodes;
    var ix = 1;
    for (var i = 0, l = siblings.length; i < l; i++) {
        var sibling = siblings[i];
        if (sibling == el) {
            // NOTE(review): only the directly following node is checked, so a
            // first element followed by a text node gets no index even when
            // more same-tag siblings exist.
            var needsIndex = ix > 1 || (i + 1 < siblings.length && siblings[i + 1].tagName == el.tagName);
            var parentPath = getXpath(el.parentNode, b);
            if (needsIndex) {
                return parentPath + '/' + el.tagName.toLowerCase() + '[' + (ix) + ']';
            }
            return parentPath + '/' + el.tagName.toLowerCase();
        } else if (sibling.tagName == el.tagName) {
            ix++;
        }
    }
}
// Returns a "tag:contains(text)" selector for `node` when exactly one element
// with the same tag contains the node's text; otherwise null.
function getJsoup(node) {
    var tag = node.tagName.toLowerCase();
    var text = node.innerText;
    if (text == null || text == "") {
        return null;
    }
    var sameTag = document.getElementsByTagName(tag);
    var hits = 0;
    for (var k = 0; k < sameTag.length; k++) {
        var otherText = sameTag[k].innerText;
        if (otherText != null && otherText.indexOf(text) >= 0) {
            hits += 1;
        }
    }
    if (hits != 1) {
        return null;
    }
    return tag + ':contains(' + text.trim() + ')';
}
// Sums offsetLeft of `el` and of every element on its offsetParent chain.
function getOffsetLeft(el) {
    var total = 0;
    var current = el;
    do {
        total += current.offsetLeft;
        current = current.offsetParent;
    } while (current);
    return total;
}
// Sums offsetTop of `el` and of every element on its offsetParent chain.
function getOffsetTop(el) {
    var total = 0;
    var current = el;
    do {
        total += current.offsetTop;
        current = current.offsetParent;
    } while (current);
    return total;
}
// Recursively searches below `node` for the innermost nodes whose innerText
// matches `pattern_page` and appends [node, type, left, top] to list_hitTag.
//   type - page-button type inherited from the matching ancestor
function search_pageBt(node, type, list_hitTag, pattern_page) {
    // A missing node used to be dereferenced; there is nothing to search.
    if (node == null) {
        return;
    }
    if (node.nodeName.toLowerCase() == "a") {
        list_hitTag.push([node, type, getOffsetLeft(node), getOffsetTop(node)]);
        return;
    }
    var find_flag = false;
    if (node.childNodes != null) {
        // Working variables are local: as implicit globals they were shared
        // with the recursive calls made inside this loop.
        for (var i = 0; i < node.childNodes.length; i++) {
            var child = node.childNodes[i];
            if (child != null && child.tagName != null && (child.tagName.toLowerCase() == "script" || child.tagName.toLowerCase() == "select")) {
                continue;
            }
            var child_innerText = child.innerText;
            if (child_innerText != null) {
                var _match = child_innerText.match(pattern_page);
                if (_match != null) {
                    var _type = _match[1] ? "nextPage" : (_match[2] ? "lastPage" : (_match[3] ? "firstPage" : (_match[4] ? "tailPage" : "other")));
                    search_pageBt(child, _type, list_hitTag, pattern_page);
                    find_flag = true;
                }
            }
        }
    }
    // No child matched: this node is the innermost hit.
    if (!find_flag) {
        list_hitTag.push([node, type, getOffsetLeft(node), getOffsetTop(node)]);
    }
}
// Clusters the hit tags by vertical position and returns the members of the
// largest cluster.
//   list_hitTag - array of [node, type, left, top]
// Each cluster is [left, top, members, clickableCount]; a tag joins every
// cluster whose top is less than 20 away from its own top.
// Returns an array of [node, type] pairs (empty when there are no hits).
function clustering(list_hitTag) {
    function isClickable(node) {
        return node.tagName.toLowerCase() == "a" || node.onclick != null;
    }

    var list_cluster = new Array();
    for (var i = 0; i < list_hitTag.length; i++) {
        var hit = list_hitTag[i];
        var _find_flag = false;
        for (var j = 0; j < list_cluster.length; j++) {
            if (Math.abs(list_cluster[j][1] - hit[3]) < 20) {
                list_cluster[j][2].push([hit[0], hit[1]]);
                if (isClickable(hit[0])) {
                    list_cluster[j][3] += 1;
                }
                _find_flag = true;
            }
        }
        if (!_find_flag) {
            var _click_num = isClickable(hit[0]) ? 1 : 0;
            list_cluster.push([hit[2], hit[3], [[hit[0], hit[1]]], _click_num]);
        }
    }

    var _list_max_cluster = new Array();
    var _max = 0;
    for (var k = 0; k < list_cluster.length; k++) {
        // The clickable count currently has weight 0, so only the member
        // count decides. Declared locally; it used to be an implicit global.
        var _prob = list_cluster[k][2].length * 0.5 + list_cluster[k][3] * 0;
        if (_prob > _max) {
            _max = _prob;
            _list_max_cluster = list_cluster[k][2];
        }
    }
    return _list_max_cluster;
}
// Finds the paging controls of the current document: every visible element
// whose text, value or title looks like a paging label (or whose class
// mentions "next") is collected, and the largest cluster of hits on one
// horizontal line is returned as an array of [node, type] pairs.
function clustering_turnPage() {
    // Capture groups: 1 next page, 2 previous page, 3 first page, 4 last page,
    // 5 other paging text. Named groups are not used because phantomjs does
    // not support them.
    // Fixes: "{,10}" / "{,3}" are not quantifiers in JavaScript (the lower
    // bound is mandatory), and the last-page alternative required a stray
    // literal "s" before the end of the text.
    var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?\s*$|(^\s*\.{1,2}\s*$|^.{0,10}共\s*\d+\s*[条页].{0,10}$|^.{0,10}\d+\/\d+.{0,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
    var pattern_nextPage = /[Nn]ext/
    var list_hitTag = new Array();

    function pageType(match) {
        return match[1] ? "nextPage" : (match[2] ? "lastPage" : (match[3] ? "firstPage" : (match[4] ? "tailPage" : "other")));
    }

    for (var i = 0; i < document.all.length; i++) {
        var node = document.all[i];
        var left = getOffsetLeft(node);
        var top = getOffsetTop(node);
        // Skip elements without a positive position on the page.
        if (!(left > 0 && top > 0)) {
            continue;
        }
        var tag = node.tagName.toLowerCase();
        if (tag == "script") {
            continue;
        }
        var _value = node.getAttribute("value");
        if (_value == null) {
            _value = "";
        }
        var _title = node.getAttribute("title");
        if (_title == null) {
            _title = "";
        }
        var _text = "";
        if (node.innerText != null) {
            _text = node.innerText;
        }

        // Only the first available source is tested: text, then value, then
        // title; the class name is a fallback when none of them is present.
        var source = null;
        if (_text != "" && tag != "option") {
            source = _text;
        } else if (_value != "" && tag != "option") {
            source = _value;
        } else if (_title != "") {
            source = _title;
        }

        if (source != null) {
            var _match = source.match(pattern_page);
            if (_match != null) {
                list_hitTag.push([node, pageType(_match), left, top]);
            }
        } else if (node.getAttribute("class") != null && node.getAttribute("class").match(pattern_nextPage) != null) {
            list_hitTag.push([node, "nextPage", left, top]);
        }
    }

    // When a single ">" button exists, ">>" means "last page", not "next".
    var _find = false;
    var n;
    for (n = 0; n < list_hitTag.length; n++) {
        if (list_hitTag[n][0].innerText == ">") {
            _find = true;
        }
    }
    if (_find) {
        for (n = 0; n < list_hitTag.length; n++) {
            if (list_hitTag[n][0].innerText == ">>") {
                list_hitTag[n][1] = "tailPage"
            }
        }
    }

    return clustering(list_hitTag);
}
// Evaluates the xpath against the document and returns all result nodes as
// an array.
function findElements_byXpath(STR_XPATH) {
    var iterator = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var found = [];
    for (var item = iterator.iterateNext(); item; item = iterator.iterateNext()) {
        found.push(item);
    }
    return found;
}
- '''
- scripts_replaceXpath = '''
// Evaluates the xpath against the document and returns all result nodes as
// an array.
function findElements_byXpath(STR_XPATH) {
    var iterator = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var found = [];
    for (var item = iterator.iterateNext(); item; item = iterator.iterateNext()) {
        found.push(item);
    }
    return found;
}
// Tries to shorten a positional xpath by anchoring it on layout attributes.
// The steps are visited from the last to the first. For a step with a
// positional predicate, if the path up to that step selects exactly one node
// and that node has at least two of the attributes in `aim_att` which
// together identify it uniquely, the result is
// "//tag[@a=..][@b=..]" + the steps already visited.
// The original xpath is returned unchanged as soon as a step with an
// attribute predicate is reached; if no anchor is found, the steps are
// re-joined with "/" (empty steps are dropped).
function replaceXpath(_xpath) {
    var list_path = _xpath.split("/");
    var _replaced_xpath = "";
    var aim_att = ["height", "width", "align", "valign", "border", "bgcolor", "style"]
    for (var i = list_path.length - 1; i >= 0; i--) {
        var _path = list_path[i];
        if (_path == "") {
            continue;
        }
        if (_path.indexOf("]") >= 0) {
            if (_path.indexOf("@") >= 0) {
                return _xpath;
            }
            // All working variables are local; they used to leak as
            // implicit globals.
            var _temp_xpath = list_path.slice(0, i + 1).join("/");
            var _temp_nodes = findElements_byXpath(_temp_xpath);
            if (_temp_nodes.length == 1) {
                var target = _temp_nodes[0];
                var _count = 0;
                var gen_xpath = "";
                for (var j = 0; j < target.attributes.length; j++) {
                    var _att = target.attributes[j];
                    if (aim_att.indexOf(_att.name) >= 0) {
                        _count += 1;
                        if (gen_xpath == "") {
                            gen_xpath = "//" + target.tagName.toLowerCase();
                        }
                        gen_xpath = gen_xpath + "[@" + _att.name + '=\"' + _att.value + '\"]';
                    }
                }
                if (_count >= 2 && findElements_byXpath(gen_xpath).length == 1) {
                    return gen_xpath + _replaced_xpath
                }
            }
        }
        _replaced_xpath = "/" + _path + _replaced_xpath;
    }
    return _replaced_xpath;
}
- return replaceXpath(arguments[0]);
- '''
|