# Utils.py (33 KB)
  1. # -*- coding: utf-8 -*-
  2. '''
  3. Created on 2018年12月20日
  4. @author: User
  5. '''
  6. import numpy as np
  7. import re
  8. import gensim
  9. from keras import backend as K
  10. import ctypes
  11. import inspect
  12. w2vfile = "../wiki_128_word_embedding_new.vector"
  13. model_w2v = None
  14. from decimal import Decimal
  15. import logging
  16. #logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  17. import pickle
  18. import tensorflow as tf
  19. from keras import losses
  20. import threading
  21. __author__ = 'baniu.yao'
  22. class MyThread(threading.Thread):
  23. def __init__(self, func, args=()):
  24. super(MyThread, self).__init__()
  25. self.func = func
  26. self.args = args
  27. def run(self):
  28. self.result = self.func(*self.args)
  29. def get_result(self):
  30. try:
  31. return self.result
  32. except Exception as e:
  33. print('执行js抛出异常:', e)
  34. return None
  35. def get_js_rs(browser, script, *arg, timeout=20):
  36. '''
  37. 浏览器执行脚本,返回结果,超时中断
  38. :param browser:浏览器对象
  39. :param script: 脚本
  40. :param arg:参数
  41. :param timeout:超时时间
  42. :return:
  43. '''
  44. def execute_js():
  45. data = browser.execute_script(script, *arg)
  46. return data
  47. t = MyThread(func=execute_js, args=())
  48. t.setDaemon(True)
  49. t.start()
  50. t.join(timeout)
  51. if t.isAlive():
  52. print('执行js超时')
  53. stop_thread(t)
  54. return None
  55. data = t.get_result()
  56. return data
  57. import time
  58. def thread_run(func, *arg, timeout=30):
  59. t = MyThread(func=func, args=(*arg,))
  60. t.setDaemon(True)
  61. t.start()
  62. t.join(timeout)
  63. if t.isAlive():
  64. print('thread_run time out')
  65. result = t.get_result()
  66. return result
  67. def xpath2css(xpath):
  68. '''
  69. 把xpath路径转为css路径
  70. :param xpath:
  71. :return:
  72. '''
  73. xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
  74. for it in re.finditer('\[(\d)\]', xpath):
  75. xpath = xpath.replace(it.group(0), ':nth-child(%s)'%it.group(1))
  76. if xpath[0] == '>':
  77. xpath = xpath[1:]
  78. return xpath
  79. def get_class_from_frame(fr):
  80. args, _, _, value_dict = inspect.getargvalues(fr)
  81. if len(args) and args[0] == 'self':
  82. instance = value_dict.get('self', None)
  83. if instance:
  84. return getattr(instance, '__class__', None)
  85. return None
  86. class CLog(object):
  87. def __init__(self, log_file_path='./test.log'):
  88. logging.basicConfig(level=logging.INFO, filemode='a',format='%(asctime)s %(message)s')
  89. self.logger = logging.getLogger("single_server")
  90. ''''''
  91. console = logging.FileHandler(log_file_path,encoding="UTF8")
  92. formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s [%(chain)s] %(thread)s %(threadName)s')
  93. console.setFormatter(formatter)
  94. self.logger.setLevel(logging.DEBUG)
  95. self.logger.addHandler(console)
  96. def get_file_name_in_full_path(self, file_path):
  97. return file_path.split('/')[-1]
  98. def get_meta_data(self):
  99. frames = inspect.stack()
  100. chain_list = []
  101. for i in range(0, len(frames)-1):
  102. _, file_path, _, func_name, _, _ = frames[i]
  103. file_name = self.get_file_name_in_full_path(file_path)
  104. try:
  105. args = re.findall('\((.*)\)', frames[i+1][-2][0])[0]
  106. except Exception as e:
  107. args = ""
  108. current_chain = '%s(%s)' % (func_name, args)
  109. chain_list.append(current_chain)
  110. chain_list.reverse()
  111. return ' --> '.join(chain_list[:-2])
  112. def info(self, message):
  113. chain = self.get_meta_data()
  114. self.logger.info(message,extra={"chain":chain})
  115. def error(self, message):
  116. chain = self.get_meta_data()
  117. self.logger.error(message,extra={"chain":chain})
  118. def debug(self, message):
  119. chain = self.get_meta_data()
  120. self.logger.debug(message,extra={"chain":chain})
  121. def add_err_msg(_dict,msg):
  122. _key = "err_msg"
  123. if _key in _dict:
  124. if re.search(msg,_dict[_key]) is None:
  125. _dict[_key] = _dict[_key]+msg
  126. else:
  127. _dict[_key] = msg
  128. def _async_raise(tid, exctype):
  129. """raises the exception, performs cleanup if needed"""
  130. tid = ctypes.c_long(tid)
  131. if not inspect.isclass(exctype):
  132. exctype = type(exctype)
  133. res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
  134. if res == 0:
  135. raise ValueError("invalid thread id")
  136. elif res != 1:
  137. ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
  138. raise SystemError("PyThreadState_SetAsyncExc failed")
  139. def stop_thread(thread):
  140. _async_raise(thread.ident, SystemExit)
# Module-wide CLog instance (default log file path) shared by log(), error()
# and debug().
_log = CLog()
def log(msg):
    '''
    @summary: write msg to the module logger at INFO level
    '''
    _log.info(msg)
def error(msg):
    '''
    @summary: write msg to the module logger at ERROR level
    '''
    _log.error(msg)
def debug(msg):
    '''
    @summary: write msg to the module logger at DEBUG level
    '''
    _log.debug(msg)
  151. def save(object_to_save, path):
  152. '''
  153. 保存对象
  154. @Arugs:
  155. object_to_save: 需要保存的对象
  156. @Return:
  157. 保存的路径
  158. '''
  159. with open(path, 'wb') as f:
  160. pickle.dump(object_to_save, f)
  161. def load(path):
  162. '''
  163. 读取对象
  164. @Arugs:
  165. path: 读取的路径
  166. @Return:
  167. 读取的对象
  168. '''
  169. with open(path, 'rb') as f:
  170. object1 = pickle.load(f)
  171. return object1
  172. def find_index(list_tofind,text):
  173. '''
  174. @summary: 查找所有词汇在字符串中第一次出现的位置
  175. @param:
  176. list_tofind:待查找词汇
  177. text:字符串
  178. @return: list,每个词汇第一次出现的位置
  179. '''
  180. result = []
  181. for item in list_tofind:
  182. index = text.find(item)
  183. if index>=0:
  184. result.append(index)
  185. else:
  186. result.append(-1)
  187. return result
  188. def combine(list1,list2):
  189. '''
  190. @summary:将两个list中的字符串两两拼接
  191. @param:
  192. list1:字符串list
  193. list2:字符串list
  194. @return:拼接结果list
  195. '''
  196. result = []
  197. for item1 in list1:
  198. for item2 in list2:
  199. result.append(str(item1)+str(item2))
  200. return result
  201. def getDigitsDic(unit):
  202. '''
  203. @summary:拿到中文对应的数字
  204. '''
  205. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  206. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  207. return DigitsDic.get(unit)
  208. def getMultipleFactor(unit):
  209. '''
  210. @summary:拿到单位对应的值
  211. '''
  212. MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
  213. return MultipleFactor.get(unit)
def getUnifyMoney(money):
    '''
    @summary: convert a Chinese money string into a numeric amount
    @param:
        money: money string (plain digits, or Chinese digits with units)
    @return: the amount; Decimal on most paths
             NOTE(review): a lone Chinese digit returns the int from
             getDigitsDic rather than a Decimal - confirm callers accept both
    '''
    # strip commas
    # NOTE(review): the character class lists the same comma twice as seen
    # here; possibly one was meant to be the full-width comma - confirm.
    money = re.sub("[,,]","",money)
    result = Decimal(0)
    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    # Units are tried in this order, largest first.
    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","元","角","分"]
    # Plain number, optionally with a fractional part.
    LowMoneypattern = re.compile("^(\d+,?)+(\.\d+)?$")
    # Exactly one formal Chinese digit.
    BigMoneypattern = re.compile("^[%s]$"%("".join(chnDigits)))
    if re.search(LowMoneypattern,money) is not None:
        return Decimal(money)
    elif re.search(BigMoneypattern,money) is not None:
        return getDigitsDic(money)
    for factorUnit in chnFactorUnits:
        if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
            # Split at the LAST occurrence of the unit: the text before it is
            # the multiplier of the unit, the text after it is the remainder.
            subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
            if re.search(re.compile("^(\d+(,)?)+(\.\d+)?$"),subMoneys[0]) is not None:
                result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
            elif len(subMoneys[0])==1:
                # A single character only counts when it is a Chinese digit.
                if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
                    result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
            else:
                # Anything longer is itself a money expression: recurse.
                result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
            if len(subMoneys)>1:
                # NOTE(review): this pattern also accepts a trailing
                # 百/千/万/亿 or 元, text that Decimal() would presumably
                # reject - confirm whether that case can occur.
                if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
                    result += Decimal(subMoneys[1])
                elif len(subMoneys[1])==1:
                    if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
                        result += Decimal(getDigitsDic(subMoneys[1]))
                else:
                    result += Decimal(getUnifyMoney(subMoneys[1]))
            # Only the first (largest) unit found is handled at this level;
            # smaller units are reached through the recursive calls above.
            break
    return result
  252. def mergeDict(list_dict):
  253. new_dict = dict()
  254. _flag = True
  255. hasDrew = False
  256. err_msg = ""
  257. for _dict in list_dict:
  258. if _dict is None:
  259. _flag = False
  260. continue
  261. for key in _dict.keys():
  262. if key=="flag":
  263. if not _dict[key]:
  264. _flag = _dict[key]
  265. else:
  266. if key=="err_msg":
  267. err_msg += _dict[key]
  268. new_dict[key] = _dict[key]
  269. if key=="hasDrew":
  270. hasDrew = hasDrew or _dict[key]
  271. new_dict["flag"] = _flag
  272. new_dict["hasDrew"] = hasDrew
  273. new_dict["err_msg"] = err_msg
  274. count_rules = 0
  275. for _key in new_dict.keys():
  276. if _key not in ["flag","success","count_rules"] and new_dict[_key]!="" and new_dict[_key] is not None:
  277. count_rules += 1
  278. new_dict["count_rules"] = count_rules
  279. return new_dict
  280. def getCommonXpath(list_xpaths,on_value=0.6):
  281. CommonXpath = None
  282. if len(list_xpaths)>0:
  283. MAX_LEN = max([len(x) for x in list_xpaths])
  284. for i in range(MAX_LEN):
  285. _xpath = None
  286. _same_count = 0
  287. for j in range(len(list_xpaths)):
  288. if i<len(list_xpaths[j]):
  289. if _xpath is None:
  290. _xpath = list_xpaths[j][i]
  291. if list_xpaths[j][i]==_xpath:
  292. _same_count += 1
  293. if _same_count/len(list_xpaths)>=on_value:
  294. CommonXpath = _xpath
  295. return CommonXpath
  296. def getModel_w2v():
  297. '''
  298. @summary:加载词向量
  299. '''
  300. global model_w2v
  301. if model_w2v is None:
  302. model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2vfile,binary=True)
  303. return model_w2v
  304. def findAllIndex(substr,wholestr):
  305. '''
  306. @summary: 找到字符串的子串的所有begin_index
  307. @param:
  308. substr:子字符串
  309. wholestr:子串所在完整字符串
  310. @return: list,字符串的子串的所有begin_index
  311. '''
  312. copystr = wholestr
  313. result = []
  314. indexappend = 0
  315. while(True):
  316. index = copystr.find(substr)
  317. if index<0:
  318. break
  319. else:
  320. result.append(indexappend+index)
  321. indexappend += index+len(substr)
  322. copystr = copystr[index+len(substr):]
  323. return result
  324. def spanWindow(tokens,begin_index,end_index,size):
  325. '''
  326. @summary:取得某个实体的上下文词汇
  327. @param:
  328. tokens:句子分词list
  329. begin_index:实体的开始index
  330. end_index:实体的结束index
  331. size:左右两边各取多少个词
  332. @return: list,实体的上下文词汇
  333. '''
  334. length_tokens = len(tokens)
  335. if begin_index>size:
  336. begin = begin_index-size
  337. else:
  338. begin = 0
  339. if end_index+size<length_tokens:
  340. end = end_index+size+1
  341. else:
  342. end = length_tokens
  343. result = []
  344. result.append(tokens[begin:begin_index])
  345. #result.append(tokens[begin_index:end_index+1])
  346. result.append(tokens[end_index+1:end])
  347. return result
  348. def embedding(datas,shape):
  349. '''
  350. @summary:查找词汇对应的词向量
  351. @param:
  352. datas:词汇的list
  353. shape:结果的shape
  354. @return: array,返回对应shape的词嵌入
  355. '''
  356. model_w2v = getModel_w2v()
  357. embed = np.zeros(shape)
  358. length = shape[1]
  359. out_index = 0
  360. #print(datas)
  361. for data in datas:
  362. index = 0
  363. for item in data:
  364. item_not_space = re.sub("\s*","",item)
  365. if index>=length:
  366. break
  367. if item_not_space in model_w2v.vocab:
  368. embed[out_index][index] = model_w2v[item_not_space]
  369. index += 1
  370. else:
  371. #embed[out_index][index] = model_w2v['unk']
  372. index += 1
  373. out_index += 1
  374. return embed
  375. def partMoney(entity_text,input2_shape = [7]):
  376. '''
  377. @summary:对金额分段
  378. @param:
  379. entity_text:数值金额
  380. input2_shape:分类数
  381. @return: array,分段之后的独热编码
  382. '''
  383. money = float(entity_text)
  384. parts = np.zeros(input2_shape)
  385. if money<100:
  386. parts[0] = 1
  387. elif money<1000:
  388. parts[1] = 1
  389. elif money<10000:
  390. parts[2] = 1
  391. elif money<100000:
  392. parts[3] = 1
  393. elif money<1000000:
  394. parts[4] = 1
  395. elif money<10000000:
  396. parts[5] = 1
  397. else:
  398. parts[6] = 1
  399. return parts
  400. def recall(y_true, y_pred):
  401. '''
  402. 计算召回率
  403. @Argus:
  404. y_true: 正确的标签
  405. y_pred: 模型预测的标签
  406. @Return
  407. 召回率
  408. '''
  409. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  410. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  411. if c3 == 0:
  412. return 0
  413. recall = c1 / c3
  414. return recall
  415. def f1_score(y_true, y_pred):
  416. '''
  417. 计算F1
  418. @Argus:
  419. y_true: 正确的标签
  420. y_pred: 模型预测的标签
  421. @Return
  422. F1值
  423. '''
  424. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  425. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  426. c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
  427. precision = c1 / c2
  428. if c3 == 0:
  429. recall = 0
  430. else:
  431. recall = c1 / c3
  432. f1_score = 2 * (precision * recall) / (precision + recall)
  433. return f1_score
  434. def precision(y_true, y_pred):
  435. '''
  436. 计算精确率
  437. @Argus:
  438. y_true: 正确的标签
  439. y_pred: 模型预测的标签
  440. @Return
  441. 精确率
  442. '''
  443. c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  444. c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
  445. precision = c1 / c2
  446. return precision
  447. def acc(y_true, y_pred):
  448. '''
  449. #计算正确率
  450. '''
  451. c1 = tf.reduce_mean(tf.cast(tf.equal(tf.matmul(tf.cast(tf.argmax(y_true,1),tf.float64),tf.constant([[0],[1]],dtype=tf.float64)),tf.matmul(tf.cast(tf.argmax(y_pred,1),tf.float64),tf.constant([[0],[1]],dtype=tf.float64))),tf.float32))
  452. return c1
  453. def my_loss(y_true, y_pred):
  454. return -tf.reduce_mean(y_true*tf.log(y_pred))
  455. #return losses.categorical_crossentropy(y_true, y_pred)+(1-tf.reduce_mean(tf.cast(tf.equal(tf.matmul(tf.cast(tf.argmax(y_true,1),tf.float64),tf.constant([[0],[1]],dtype=tf.float64)),tf.matmul(tf.cast(tf.argmax(y_pred,1),tf.float64),tf.constant([[0],[1]],dtype=tf.float64))),tf.float32)))
  456. def print_metrics(history):
  457. '''
  458. 制作每次迭代的各metrics变化图片
  459. @Arugs:
  460. history: 模型训练迭代的历史记录
  461. '''
  462. import matplotlib.pyplot as plt
  463. # loss图
  464. loss = history.history['loss']
  465. val_loss = history.history['val_loss']
  466. epochs = range(1, len(loss) + 1)
  467. plt.subplot(2, 2, 1)
  468. plt.plot(epochs, loss, 'bo', label='Training loss')
  469. plt.plot(epochs, val_loss, 'b', label='Validation loss')
  470. plt.title('Training and validation loss')
  471. plt.xlabel('Epochs')
  472. plt.ylabel('Loss')
  473. plt.legend()
  474. # f1图
  475. f1 = history.history['f1_score']
  476. val_f1 = history.history['val_f1_score']
  477. plt.subplot(2, 2, 2)
  478. plt.plot(epochs, f1, 'bo', label='Training f1')
  479. plt.plot(epochs, val_f1, 'b', label='Validation f1')
  480. plt.title('Training and validation f1')
  481. plt.xlabel('Epochs')
  482. plt.ylabel('F1')
  483. plt.legend()
  484. # precision图
  485. prec = history.history['precision']
  486. val_prec = history.history['val_precision']
  487. plt.subplot(2, 2, 3)
  488. plt.plot(epochs, prec, 'bo', label='Training precision')
  489. plt.plot(epochs, val_prec, 'b', label='Validation pecision')
  490. plt.title('Training and validation precision')
  491. plt.xlabel('Epochs')
  492. plt.ylabel('Precision')
  493. plt.legend()
  494. # recall图
  495. recall = history.history['recall']
  496. val_recall = history.history['val_recall']
  497. plt.subplot(2, 2, 4)
  498. plt.plot(epochs, recall, 'bo', label='Training recall')
  499. plt.plot(epochs, val_recall, 'b', label='Validation recall')
  500. plt.title('Training and validation recall')
  501. plt.xlabel('Epochs')
  502. plt.ylabel('Recall')
  503. plt.legend()
  504. plt.show()
# JavaScript helper library kept as text. It defines, among others,
# document.getElementsByClassName / document.countElementById, a small Set
# implementation, xpath builders (getXpath, getListXpath), page-turn button
# detection (clustering_turnPage) and findElements_byXpath.
# The text is a runtime string: do not reformat or translate it.
scripts_common = '''
  506. document.getElementsByClassName = function (Name,e,tag) {
  507. var ele = [],
  508. allEle,
  509. length,
  510. i = 0;
  511. if (typeof tag === "undefined" ){
  512. tag = "*"
  513. }
  514. if (typeof e === "undefined"){
  515. e = document;
  516. }
  517. allEle = e.getElementsByTagName(tag);
  518. for (length = allEle.length;i < length;i = i + 1){
  519. if (allEle[i].className === Name) {
  520. ele.push(allEle[i]);
  521. }
  522. }
  523. return ele;
  524. }
  525. document.countElementById = function (id,e,tag) {
  526. var ele = [],
  527. allEle,
  528. length,
  529. i = 0;
  530. if (typeof tag === "undefined" ){
  531. tag = "*"
  532. }
  533. if (typeof e === "undefined"){
  534. e = document;
  535. }
  536. allEle = e.getElementsByTagName(tag);
  537. for (length = allEle.length;i < length;i = i + 1){
  538. if (allEle[i].id === id) {
  539. ele.push(allEle[i]);
  540. }
  541. }
  542. return ele;
  543. }
  544. /*js集合set类的实现*/
  545. function Set() {
  546. this.dataStore = [];
  547. this.add = add;//新增元素
  548. this.remove = remove;//删除元素
  549. this.size = size;//集合的元素个数
  550. this.union = union;//求并集
  551. this.contains = contains;//判断一个集合中是否包含某个元素
  552. this.intersect = intersect;//交集
  553. this.subset = subset;//判断一个集合是否是另一个的子集
  554. this.difference = difference;//求补集
  555. this.show = show;//将集合元素显示出来
  556. }
  557. function add(data) {
  558. if (this.dataStore.indexOf(data) < 0) {
  559. this.dataStore.push(data);
  560. return true;
  561. }
  562. else {
  563. return false;
  564. }
  565. }
  566. function remove(data) {
  567. var pos = this.dataStore.indexOf(data);
  568. if (pos > -1) {
  569. this.dataStore.splice(pos,1);
  570. return true;
  571. }
  572. else {
  573. return false;
  574. }
  575. }
  576. function size() {
  577. return this.dataStore.length;
  578. }
  579. function show() {
  580. return "[" + this.dataStore + "]";
  581. }
  582. function contains(data) {
  583. if (this.dataStore.indexOf(data) > -1) {
  584. return true;
  585. }
  586. else {
  587. return false;
  588. }
  589. }
  590. function union(set) {
  591. var tempSet = new Set();
  592. for (var i = 0; i < this.dataStore.length; ++i) {
  593. tempSet.add(this.dataStore[i]);
  594. }
  595. for (var i = 0; i < set.dataStore.length; ++i) {
  596. if (!tempSet.contains(set.dataStore[i])) {
  597. tempSet.dataStore.push(set.dataStore[i]);
  598. }
  599. }
  600. return tempSet;
  601. }
  602. function intersect(set) {
  603. var tempSet = new Set();
  604. for (var i = 0; i < this.dataStore.length; ++i) {
  605. if (set.contains(this.dataStore[i])) {
  606. tempSet.add(this.dataStore[i]);
  607. }
  608. }
  609. return tempSet;
  610. }
  611. function subset(set) {
  612. if (this.size() > set.size()) {
  613. return false;
  614. }
  615. else {
  616. for(var member in this.dataStore) {
  617. if (!set.contains(member)) {
  618. return false;
  619. }
  620. }
  621. }
  622. return true;
  623. }
  624. function difference(set) {
  625. var tempSet = new Set();
  626. for (var i = 0; i < this.dataStore.length; ++i) {
  627. if (!set.contains(this.dataStore[i])) {
  628. tempSet.add(this.dataStore[i]);
  629. }
  630. }
  631. return tempSet;
  632. }
  633. function check(node,set_url){
  634. if(node.nodeType!=1){
  635. return false;
  636. }
  637. var label_flag = false;
  638. var list_a = node.getElementsByTagName("a");
  639. if(list_a.length==set_url.size()){
  640. return true;
  641. }else{
  642. return false;
  643. }
  644. }
  645. function getRemoveList(node,recurse,list_remove){
  646. var pattern = /(上一?篇|下一?篇|作者|点击数|发布时间|发布日期|更新日期|更新时间|字体|字号|来源|阅读次?数|浏览次?数|点击次?数|本站编辑|编辑人|关键字|上一条|下一条)|(打印|关闭窗口|回到顶部|现在的位置|首页|分享)/
  647. if(node.childNodes==null || node.childNodes.length<=0){
  648. return;
  649. }
  650. for(var i=0;i<node.childNodes.length;i++){
  651. _child = node.childNodes[i];
  652. if(_child.nodeType==3){
  653. _match = _child.textContent.toString().match(pattern);
  654. if(_match!=null){
  655. if(_match[1]!=null){
  656. if(node.textContent.toString().trim().length-_match[1].length<3){
  657. _soup = node.parentNode.tagName.toLowerCase()+":contains("+_match[0]+")";
  658. }else{
  659. _soup = node.tagName.toLowerCase()+":contains("+_match[0]+")";
  660. }
  661. }else{
  662. _soup = node.tagName.toLowerCase()+":contains("+_match[0]+")";
  663. }
  664. list_remove.push(_soup);
  665. }
  666. }
  667. if(_child.nodeType==1 && recurse){
  668. getRemoveList(_child,recurse,list_remove)
  669. }
  670. }
  671. }
  672. function getListXpath(el,list_xpath,getRemove){
  673. if (el==document || el==document.body){
  674. return list_xpath;
  675. }
  676. if(getRemove){
  677. _array = new Array();
  678. getRemoveList(el,true,_array);
  679. list_xpath.push([getXpath(el),_array])
  680. }else{
  681. list_xpath.push(getXpath(el))
  682. }
  683. return getListXpath(el.parentNode,list_xpath,getRemove);
  684. }
  685. function getXpath(el,b,notfirst){
  686. if (el.id !="" && document.countElementById(el.id).length==1){
  687. var _jump_flag = false;
  688. if(b!=null){
  689. for(var i=0;i<b.length;i++){
  690. if(el.tagName.toLowerCase()==b[i]){
  691. _jump_flag = true;
  692. }
  693. }
  694. }
  695. if(notfirst){
  696. _jump_flag = true;
  697. }
  698. if(!_jump_flag){
  699. //return '//*[@id=\"'+el.id+'\"]';
  700. return '//'+el.tagName.toLowerCase()+'[@id=\"'+el.id+'\"]';
  701. }
  702. }
  703. if (el.getAttribute("class")!=null && document.getElementsByClassName(el.getAttribute("class")).length==1){
  704. if(!notfirst){
  705. //return '//*[@class=\"'+el.getAttribute("class")+'\"]';
  706. return '//'+el.tagName.toLowerCase()+'[@class=\"'+el.getAttribute("class")+'\"]';
  707. }
  708. }
  709. if (el==document.body){
  710. return '/html/'+el.tagName.toLowerCase();
  711. }
  712. var ix = 1;
  713. siblings = el.parentNode.childNodes;
  714. for (var i=0,l=siblings.length;i<l;i++){
  715. var sibling = siblings[i];
  716. if (sibling==el){
  717. if(ix>1 || (ix==1 && i+1<siblings.length && siblings[i+1].tagName==el.tagName)){
  718. return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
  719. }else{
  720. return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase();
  721. }
  722. //return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
  723. }else if (sibling.tagName==el.tagName){
  724. ix++;
  725. }
  726. }
  727. }
  728. function getJsoup(node){
  729. var _nodeName = node.tagName.toLowerCase();
  730. var _nodeText = node.innerText;
  731. if(_nodeText==null || _nodeText==""){
  732. return null;
  733. }
  734. var counts = 0;
  735. var list_node = document.getElementsByTagName(_nodeName);
  736. for(var i=0;i<list_node.length;i++){
  737. var _node = list_node[i];
  738. if(_node.innerText!=null && _node.innerText.indexOf(_nodeText)>=0){
  739. counts += 1;
  740. }
  741. }
  742. if(counts!=1){
  743. return null;
  744. }
  745. var jsoup = _nodeName+':contains('+_nodeText.trim()+')';
  746. return jsoup;
  747. }
  748. function getOffsetLeft(el){
  749. return el.offsetParent
  750. ? el.offsetLeft + getOffsetLeft(el.offsetParent)
  751. : el.offsetLeft;
  752. }
  753. function getOffsetTop(el){
  754. return el.offsetParent
  755. ? el.offsetTop + getOffsetTop(el.offsetParent)
  756. : el.offsetTop;
  757. }
  758. function search_pageBt(node,type,list_hitTag,pattern_page){
  759. var find_flag = false;
  760. if(node!=null && node.nodeName.toLowerCase()=="a"){
  761. list_hitTag.push([node,type,getOffsetLeft(node),getOffsetTop(node)])
  762. }else{
  763. if(node.childNodes==null){
  764. }else{
  765. for(var i=0;i<node.childNodes.length;i++){
  766. child = node.childNodes[i];
  767. if(child!=null && child.tagName !=null && (child.tagName.toLowerCase()=="script" || child.tagName.toLowerCase()=="select")){
  768. continue;
  769. }
  770. child_innerText = child.innerText;
  771. if(child_innerText!=null){
  772. _match = child_innerText.match(pattern_page)
  773. if(_match!=null){
  774. var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
  775. search_pageBt(child,_type,list_hitTag,pattern_page);
  776. find_flag = true;
  777. }
  778. }
  779. }
  780. }
  781. if(!find_flag){
  782. list_hitTag.push([node,type,getOffsetLeft(node),getOffsetTop(node)]);
  783. }
  784. }
  785. }
  786. //对命中的标签进行聚类
  787. function clustering(list_hitTag){
  788. var list_cluster = new Array();
  789. for(var i=0;i<list_hitTag.length;i++){
  790. var _find_flag = false;
  791. for(var j=0;j<list_cluster.length;j++){
  792. if(Math.abs(list_cluster[j][1]-list_hitTag[i][3])<20){
  793. list_cluster[j][2].push([list_hitTag[i][0],list_hitTag[i][1]]);
  794. if(list_hitTag[i][0].tagName.toLowerCase()=="a" || list_hitTag[i][0].onclick!=null){
  795. list_cluster[j][3] += 1;
  796. }
  797. _find_flag = true;
  798. }
  799. }
  800. if(!_find_flag){
  801. var _click_num = 0;
  802. if(list_hitTag[i][0].tagName.toLowerCase()=="a" || list_hitTag[i][0].onclick!=null){
  803. _click_num = 1;
  804. }
  805. list_cluster.push([list_hitTag[i][2],list_hitTag[i][3],[[list_hitTag[i][0],list_hitTag[i][1]]],_click_num]);
  806. }
  807. }
  808. var _list_max_cluster = new Array();
  809. var _max = 0;
  810. for(var k=0;k<list_cluster.length;k++){
  811. _prob = list_cluster[k][2].length*0.5+list_cluster[k][3]*0;
  812. if(_prob>_max){
  813. _max = _prob;
  814. _list_max_cluster = list_cluster[k][2];
  815. }
  816. }
  817. return _list_max_cluster;
  818. }
  819. function clustering_turnPage(){
  820. //var pattern_page = /((?<nextPage>下一?页|>>|>)|(?<lastPage>上一?页|<<|<)|(?<firstPage>首页|第一页)|(?<tailPage>尾页)|(?<other>\.{1,2}|共\d[条页]|\d+\/\d+))/ //phantomjs不支持命名分组
  821. var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
  822. var pattern_nextPage = /[Nn]ext/
  823. var list_hitTag = new Array();
  824. //search_pageBt(document,"other",list_hitTag,pattern_page)
  825. for(var i=0;i<document.all.length;i++){
  826. var node = document.all[i];
  827. if(!((getOffsetLeft(node)>0 && getOffsetTop(node)>0))){
  828. continue;
  829. }
  830. if(node.tagName.toLowerCase()=="script"){
  831. continue;
  832. }
  833. var _value = node.getAttribute("value");
  834. if(_value==null){
  835. _value = "";
  836. }
  837. var _title = node.getAttribute("title");
  838. if(_title==null){
  839. _title = "";
  840. }
  841. var _text = "";
  842. if(node!=null && node.innerText!=null){
  843. _text = node.innerText;
  844. }
  845. if (_text!=null && _text!="" && node.tagName.toLowerCase()!="option"){
  846. _match = _text.match(pattern_page)
  847. if(_match!=null){
  848. var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
  849. list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
  850. }
  851. }else if (_value!=null && _value!="" && node.tagName.toLowerCase()!="option"){
  852. _match = _value.match(pattern_page)
  853. if(_match!=null){
  854. var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
  855. list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
  856. }
  857. }else if (_title!=null && _title!=""){
  858. _match = _title.match(pattern_page)
  859. if(_match!=null){
  860. var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
  861. list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
  862. }
  863. }else if(node!=null && node.getAttribute("class")!=null && node.getAttribute("class").match(pattern_nextPage)!=null){
  864. list_hitTag.push([node,"nextPage",getOffsetLeft(node),getOffsetTop(node)]);
  865. }
  866. }
  867. var _find = false;
  868. for(var i=0;i<list_hitTag.length;i++){
  869. if(list_hitTag[i][0].innerText==">"){
  870. _find = true;
  871. }
  872. }
  873. if(_find){
  874. for(var i=0;i<list_hitTag.length;i++){
  875. if(list_hitTag[i][0].innerText==">>"){
  876. list_hitTag[i][1] = "tailPage"
  877. }
  878. }
  879. }
  880. list_cluster = clustering(list_hitTag);
  881. return list_cluster;
  882. }
  883. function findElements_byXpath(STR_XPATH) {
  884. var xresult = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
  885. var xnodes = [];
  886. var xres;
  887. while (xres = xresult.iterateNext()) {
  888. xnodes.push(xres);
  889. }
  890. return xnodes;
  891. }
  892. '''
# JavaScript snippet kept as text. It defines findElements_byXpath and
# replaceXpath, and ends with "return replaceXpath(arguments[0]);", so it
# expects the xpath to rewrite as its first argument.
# The text is a runtime string: do not reformat or translate it.
scripts_replaceXpath = '''
  894. function findElements_byXpath(STR_XPATH) {
  895. var xresult = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
  896. var xnodes = [];
  897. var xres;
  898. while (xres = xresult.iterateNext()) {
  899. xnodes.push(xres);
  900. }
  901. return xnodes;
  902. }
  903. function replaceXpath(_xpath){
  904. var list_path = _xpath.split("/");
  905. var _replaced_xpath = "";
  906. var aim_att = ["height","width","align","valign","border","bgcolor","style"]
  907. for(var i=list_path.length-1;i>=0;i--){
  908. var _path = list_path[i];
  909. if(_path.indexOf("]")>=0){
  910. if(_path.indexOf("@")>=0){
  911. _replaced_xpath = "//"+_path;
  912. return _xpath;
  913. }else if(_path=="html"){
  914. return _xpath;
  915. }else{
  916. _temp_xpath = list_path.slice(0,i+1).join("/")
  917. _temp_nodes = findElements_byXpath(_temp_xpath)
  918. if(_temp_nodes.length==1){
  919. var _count = 0;
  920. var gen_xpath = "";
  921. for(var j=0;j<_temp_nodes[0].attributes.length;j++){
  922. var _att = _temp_nodes[0].attributes[j];
  923. _head = _att.name
  924. if(aim_att.indexOf(_head)>=0){
  925. _count += 1;
  926. if(gen_xpath==""){
  927. gen_xpath = "//"+_temp_nodes[0].tagName.toLowerCase()+"[@"+_att.name+'=\"'+_att.value+'\"]';
  928. }else{
  929. gen_xpath = gen_xpath+"[@"+_att.name+'=\"'+_att.value+'\"]';
  930. }
  931. }
  932. }
  933. if(_count>=2){
  934. var _find_nodes = findElements_byXpath(gen_xpath);
  935. if(_find_nodes.length==1){
  936. return gen_xpath+_replaced_xpath
  937. }else{
  938. _replaced_xpath = "/"+_path + _replaced_xpath;
  939. }
  940. }else{
  941. _replaced_xpath = "/"+_path + _replaced_xpath;
  942. }
  943. }else{
  944. _replaced_xpath = "/"+_path + _replaced_xpath;
  945. }
  946. }
  947. }else{
  948. if(_path!=""){
  949. _replaced_xpath = "/"+_path + _replaced_xpath;
  950. }
  951. }
  952. }
  953. return _replaced_xpath;
  954. }
  955. return replaceXpath(arguments[0]);
  956. '''