# -*- coding: utf-8 -*-
'''
Created on 2018-12-20
@author: User
'''
import numpy as np
import re
import gensim
from keras import backend as K
import ctypes
import inspect

w2vfile = "../wiki_128_word_embedding_new.vector"
model_w2v = None

from decimal import Decimal
import logging
#logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
import pickle
import tensorflow as tf
from keras import losses
import threading

__author__ = 'baniu.yao'
class MyThread(threading.Thread):
    '''Thread wrapper that captures the target function's return value.'''
    def __init__(self, func, args=()):
        super(MyThread, self).__init__()
        self.func = func
        self.args = args

    def run(self):
        self.result = self.func(*self.args)

    def get_result(self):
        try:
            return self.result
        except Exception as e:
            print('JS execution raised an exception:', e)
            return None
def get_js_rs(browser, script, *arg, timeout=20):
    '''
    Execute a script in the browser and return the result, aborting on timeout.
    :param browser: browser (webdriver) object
    :param script: the script to execute
    :param arg: script arguments
    :param timeout: timeout in seconds
    :return:
    '''
    def execute_js():
        data = browser.execute_script(script, *arg)
        return data
    t = MyThread(func=execute_js, args=())
    t.daemon = True
    t.start()
    t.join(timeout)
    if t.is_alive():
        print('JS execution timed out')
        stop_thread(t)
        return None
    data = t.get_result()
    return data
import time

def thread_run(func, *arg, timeout=30):
    t = MyThread(func=func, args=(*arg,))
    t.daemon = True
    t.start()
    t.join(timeout)
    if t.is_alive():
        print('thread_run timed out')
    result = t.get_result()
    return result
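
# Illustrative usage of thread_run (a sketch, not part of the original module):
# run a callable with a timeout and collect its return value. slow_double is
# a made-up helper for this example.
def _example_thread_run():
    def slow_double(x):
        time.sleep(0.1)
        return x * 2
    return thread_run(slow_double, 21, timeout=5)  # -> 42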
def xpath2css(xpath):
    '''
    Convert an xpath expression into a css selector.
    :param xpath:
    :return:
    '''
    xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
    for it in re.finditer(r'\[(\d)\]', xpath):
        xpath = xpath.replace(it.group(0), ':nth-child(%s)' % it.group(1))
    return xpath
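
# Illustrative usage of xpath2css (a sketch): a simple positional xpath
# is rewritten into a css-like selector.
def _example_xpath2css():
    # '//div/ul/li[2]' -> 'div>ul>li:nth-child(2)'
    return xpath2css('//div/ul/li[2]')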
def get_class_from_frame(fr):
    args, _, _, value_dict = inspect.getargvalues(fr)
    if len(args) and args[0] == 'self':
        instance = value_dict.get('self', None)
        if instance:
            return getattr(instance, '__class__', None)
    return None
class CLog(object):
    def __init__(self, log_file_path='./test.log'):
        logging.basicConfig(level=logging.INFO, filemode='a', format='%(asctime)s %(message)s')
        self.logger = logging.getLogger("single_server")
        console = logging.FileHandler(log_file_path, encoding="UTF8")
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s [%(chain)s] %(thread)s %(threadName)s')
        console.setFormatter(formatter)
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(console)

    def get_file_name_in_full_path(self, file_path):
        return file_path.split('/')[-1]

    def get_meta_data(self):
        # Walk the call stack and build a "caller --> callee" chain string.
        frames = inspect.stack()
        chain_list = []
        for i in range(0, len(frames) - 1):
            _, file_path, _, func_name, _, _ = frames[i]
            file_name = self.get_file_name_in_full_path(file_path)
            try:
                args = re.findall(r'\((.*)\)', frames[i + 1][-2][0])[0]
            except Exception as e:
                args = ""
            current_chain = '%s(%s)' % (func_name, args)
            chain_list.append(current_chain)
        chain_list.reverse()
        return ' --> '.join(chain_list[:-2])

    def info(self, message):
        chain = self.get_meta_data()
        self.logger.info(message, extra={"chain": chain})

    def error(self, message):
        chain = self.get_meta_data()
        self.logger.error(message, extra={"chain": chain})

    def debug(self, message):
        chain = self.get_meta_data()
        self.logger.debug(message, extra={"chain": chain})
def add_err_msg(_dict, msg):
    _key = "err_msg"
    if _key in _dict:
        if re.search(msg, _dict[_key]) is None:
            _dict[_key] = _dict[_key] + msg
    else:
        _dict[_key] = msg
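
# Illustrative usage of add_err_msg (a sketch): the message is appended
# only once. Note msg is interpreted by re.search as a regex pattern, so
# regex metacharacters should be escaped by the caller.
def _example_add_err_msg():
    d = {}
    add_err_msg(d, "timeout;")
    add_err_msg(d, "timeout;")  # duplicate, ignored
    return d                    # -> {'err_msg': 'timeout;'}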
def _async_raise(tid, exctype):
    """raises the exception, performs cleanup if needed"""
    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")

def stop_thread(thread):
    _async_raise(thread.ident, SystemExit)
_log = CLog()

def log(msg):
    '''
    @summary: log an info message
    '''
    _log.info(msg)

def error(msg):
    _log.error(msg)

def debug(msg):
    _log.debug(msg)
def save(object_to_save, path):
    '''
    Pickle an object to disk.
    @Args:
        object_to_save: the object to save
        path: the path to save it to
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)

def load(path):
    '''
    Load a pickled object.
    @Args:
        path: the path to read from
    @Return:
        the loaded object
    '''
    with open(path, 'rb') as f:
        object1 = pickle.load(f)
        return object1
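
# Illustrative round trip through save()/load() (a sketch); the path
# './example.pkl' is arbitrary.
def _example_save_load():
    data = {'a': 1, 'b': [2, 3]}
    save(data, './example.pkl')
    return load('./example.pkl')  # -> {'a': 1, 'b': [2, 3]}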
def find_index(list_tofind, text):
    '''
    @summary: find the first occurrence position of each word in a string
    @param:
        list_tofind: the words to look for
        text: the string to search
    @return: list, the first occurrence index of each word (-1 if absent)
    '''
    result = []
    for item in list_tofind:
        index = text.find(item)
        if index >= 0:
            result.append(index)
        else:
            result.append(-1)
    return result
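
# Illustrative usage of find_index (a sketch): absent words map to -1.
def _example_find_index():
    return find_index(['b', 'z'], 'abc')  # -> [1, -1]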
def combine(list1, list2):
    '''
    @summary: concatenate every string in list1 with every string in list2
    @param:
        list1: list of strings
        list2: list of strings
    @return: list of concatenated results
    '''
    result = []
    for item1 in list1:
        for item2 in list2:
            result.append(str(item1) + str(item2))
    return result
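
# Illustrative usage of combine (a sketch): the Cartesian concatenation
# of two string lists.
def _example_combine():
    return combine(['a', 'b'], ['1', '2'])  # -> ['a1', 'a2', 'b1', 'b2']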
def getDigitsDic(unit):
    '''
    @summary: map a Chinese numeral character to its digit value
    '''
    DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
                 "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
    return DigitsDic.get(unit)

def getMultipleFactor(unit):
    '''
    @summary: map a Chinese unit character to its multiplier
    '''
    MultipleFactor = {"兆": Decimal(1000000000000), "亿": Decimal(100000000), "万": Decimal(10000),
                      "仟": Decimal(1000), "千": Decimal(1000), "佰": Decimal(100), "百": Decimal(100),
                      "拾": Decimal(10), "十": Decimal(10), "元": Decimal(1),
                      "角": round(Decimal(0.1), 1), "分": round(Decimal(0.01), 2)}
    return MultipleFactor.get(unit)
def getUnifyMoney(money):
    '''
    @summary: convert a Chinese money string into a numeric amount
    @param:
        money: money string in Chinese numerals and/or digits
    @return: Decimal amount
    '''
    # strip thousands separators
    money = re.sub("[,,]", "", money)
    result = Decimal(0)
    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾", "元", "角", "分"]
    LowMoneypattern = re.compile(r"^(\d+,?)+(\.\d+)?$")
    BigMoneypattern = re.compile("^[%s]$" % ("".join(chnDigits)))
    if re.search(LowMoneypattern, money) is not None:
        return Decimal(money)
    elif re.search(BigMoneypattern, money) is not None:
        return getDigitsDic(money)
    # Split on the highest unit present, convert the part to its left,
    # then add the (recursively converted) remainder.
    for factorUnit in chnFactorUnits:
        if re.search(re.compile(".*%s.*" % (factorUnit)), money) is not None:
            subMoneys = re.split(re.compile("%s(?!.*%s.*)" % (factorUnit, factorUnit)), money)
            if re.search(re.compile(r"^(\d+(,)?)+(\.\d+)?$"), subMoneys[0]) is not None:
                result += Decimal(subMoneys[0]) * (getMultipleFactor(factorUnit))
            elif len(subMoneys[0]) == 1:
                if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[0]) is not None:
                    result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
            else:
                result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
            if len(subMoneys) > 1:
                if re.search(re.compile(r"^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None:
                    result += Decimal(subMoneys[1])
                elif len(subMoneys[1]) == 1:
                    if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[1]) is not None:
                        result += Decimal(getDigitsDic(subMoneys[1]))
                else:
                    result += Decimal(getUnifyMoney(subMoneys[1]))
            break
    return result
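
# Illustrative usage of getUnifyMoney (a sketch): both Chinese-numeral
# strings and plain digit strings normalize to a Decimal amount.
def _example_getUnifyMoney():
    assert getUnifyMoney("壹万贰仟元") == Decimal(12000)   # 1*10000 + 2*1000
    assert getUnifyMoney("1,000.50") == Decimal("1000.50")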
def mergeDict(list_dict):
    new_dict = dict()
    _flag = True
    hasDrew = False
    err_msg = ""
    for _dict in list_dict:
        if _dict is None:
            _flag = False
            continue
        for key in _dict.keys():
            if key == "flag":
                if not _dict[key]:
                    _flag = _dict[key]
            else:
                if key == "err_msg":
                    err_msg += _dict[key]
                new_dict[key] = _dict[key]
            if key == "hasDrew":
                hasDrew = hasDrew or _dict[key]
    new_dict["flag"] = _flag
    new_dict["hasDrew"] = hasDrew
    new_dict["err_msg"] = err_msg
    count_rules = 0
    for _key in new_dict.keys():
        if _key not in ["flag", "success", "count_rules"] and new_dict[_key] != "" and new_dict[_key] is not None:
            count_rules += 1
    new_dict["count_rules"] = count_rules
    return new_dict
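
# Illustrative usage of mergeDict (a sketch; the 'listpage' key is made up
# for this example): later dicts overwrite earlier keys, 'flag' is AND-ed,
# 'hasDrew' is OR-ed, and 'err_msg' strings are concatenated.
def _example_mergeDict():
    merged = mergeDict([{'flag': True, 'listpage': '//div'},
                        {'flag': False, 'err_msg': 'no detail;'}])
    return merged['flag'], merged['err_msg']  # -> (False, 'no detail;')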
def getCommonXpath(list_xpaths, on_value=0.6):
    # Return the deepest xpath step that at least on_value of the lists agree on.
    CommonXpath = None
    if len(list_xpaths) > 0:
        MAX_LEN = max([len(x) for x in list_xpaths])
        for i in range(MAX_LEN):
            _xpath = None
            _same_count = 0
            for j in range(len(list_xpaths)):
                if i < len(list_xpaths[j]):
                    if _xpath is None:
                        _xpath = list_xpaths[j][i]
                    if list_xpaths[j][i] == _xpath:
                        _same_count += 1
            if _same_count / len(list_xpaths) >= on_value:
                CommonXpath = _xpath
    return CommonXpath
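
# Illustrative usage of getCommonXpath (a sketch): given per-page lists of
# xpath steps, the deepest step shared by at least 60% of them is returned.
def _example_getCommonXpath():
    paths = [['/html', '/body', '/div[1]'],
             ['/html', '/body', '/div[2]'],
             ['/html', '/body']]
    return getCommonXpath(paths)  # -> '/body'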
def getModel_w2v():
    '''
    @summary: lazily load the word2vec embeddings
    '''
    global model_w2v
    if model_w2v is None:
        model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2vfile, binary=True)
    return model_w2v
def findAllIndex(substr, wholestr):
    '''
    @summary: find every begin_index of a substring within a string
    @param:
        substr: the substring
        wholestr: the full string that contains it
    @return: list of all begin_index values of the substring
    '''
    copystr = wholestr
    result = []
    indexappend = 0
    while(True):
        index = copystr.find(substr)
        if index < 0:
            break
        else:
            result.append(indexappend + index)
            indexappend += index + len(substr)
            copystr = copystr[index + len(substr):]
    return result
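
# Illustrative usage of findAllIndex (a sketch): every non-overlapping
# start offset of the substring.
def _example_findAllIndex():
    return findAllIndex('ab', 'abcab')  # -> [0, 3]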
def spanWindow(tokens, begin_index, end_index, size):
    '''
    @summary: get the context words around an entity
    @param:
        tokens: tokenized sentence (list)
        begin_index: start index of the entity
        end_index: end index of the entity
        size: how many words to take on each side
    @return: list, the entity's left and right context words
    '''
    length_tokens = len(tokens)
    if begin_index > size:
        begin = begin_index - size
    else:
        begin = 0
    if end_index + size < length_tokens:
        end = end_index + size + 1
    else:
        end = length_tokens
    result = []
    result.append(tokens[begin:begin_index])
    #result.append(tokens[begin_index:end_index+1])
    result.append(tokens[end_index + 1:end])
    return result
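
# Illustrative usage of spanWindow (a sketch): the left and right context
# windows of size 1 around the token span [2, 3].
def _example_spanWindow():
    tokens = ['a', 'b', 'c', 'd', 'e', 'f']
    return spanWindow(tokens, begin_index=2, end_index=3, size=1)
    # -> [['b'], ['e']]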
def embedding(datas, shape):
    '''
    @summary: look up the word vectors for the given tokens
    @param:
        datas: list of token lists
        shape: shape of the result
    @return: array, word embeddings of the requested shape
    '''
    model_w2v = getModel_w2v()
    embed = np.zeros(shape)
    length = shape[1]
    out_index = 0
    #print(datas)
    for data in datas:
        index = 0
        for item in data:
            item_not_space = re.sub(r"\s*", "", item)
            if index >= length:
                break
            if item_not_space in model_w2v.vocab:
                embed[out_index][index] = model_w2v[item_not_space]
                index += 1
            else:
                #embed[out_index][index] = model_w2v['unk']
                index += 1
        out_index += 1
    return embed
def partMoney(entity_text, input2_shape=[7]):
    '''
    @summary: bucket a money amount by order of magnitude
    @param:
        entity_text: numeric money value
        input2_shape: number of buckets
    @return: array, one-hot encoding of the bucket
    '''
    money = float(entity_text)
    parts = np.zeros(input2_shape)
    if money < 100:
        parts[0] = 1
    elif money < 1000:
        parts[1] = 1
    elif money < 10000:
        parts[2] = 1
    elif money < 100000:
        parts[3] = 1
    elif money < 1000000:
        parts[4] = 1
    elif money < 10000000:
        parts[5] = 1
    else:
        parts[6] = 1
    return parts
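
# Illustrative usage of partMoney (a sketch): a money value is one-hot
# encoded into its order-of-magnitude bucket.
def _example_partMoney():
    return partMoney('5000')  # -> array([0., 0., 1., 0., 0., 0., 0.])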
def recall(y_true, y_pred):
    '''
    Compute recall.
    @Args:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        recall
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    # c1 and c3 are tensors, so guard the division with K.epsilon()
    # instead of a Python zero check (which never fires on tensors).
    recall = c1 / (c3 + K.epsilon())
    return recall

def f1_score(y_true, y_pred):
    '''
    Compute F1.
    @Args:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        F1 value
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    precision = c1 / (c2 + K.epsilon())
    recall = c1 / (c3 + K.epsilon())
    f1_score = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_score

def precision(y_true, y_pred):
    '''
    Compute precision.
    @Args:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        precision
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = c1 / (c2 + K.epsilon())
    return precision
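
# A minimal sketch (assumes Keras 2.x on the TensorFlow backend; the model
# architecture here is invented for the example): wiring the custom
# precision/recall/f1_score metrics above into model.compile().
def _example_compile_with_metrics():
    from keras.models import Sequential
    from keras.layers import Dense
    model = Sequential([Dense(2, activation='softmax', input_shape=(10,))])
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=[precision, recall, f1_score])
    return model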
def acc(y_true, y_pred):
    '''
    Compute accuracy (agreement of the argmax class between y_true and y_pred).
    '''
    c1 = tf.reduce_mean(
        tf.cast(
            tf.equal(
                tf.matmul(tf.cast(tf.argmax(y_true, 1), tf.float64), tf.constant([[0], [1]], dtype=tf.float64)),
                tf.matmul(tf.cast(tf.argmax(y_pred, 1), tf.float64), tf.constant([[0], [1]], dtype=tf.float64))),
            tf.float32))
    return c1

def my_loss(y_true, y_pred):
    return -tf.reduce_mean(y_true * tf.log(y_pred))
    #return losses.categorical_crossentropy(y_true, y_pred)+(1-tf.reduce_mean(tf.cast(tf.equal(tf.matmul(tf.cast(tf.argmax(y_true,1),tf.float64),tf.constant([[0],[1]],dtype=tf.float64)),tf.matmul(tf.cast(tf.argmax(y_pred,1),tf.float64),tf.constant([[0],[1]],dtype=tf.float64))),tf.float32)))
def print_metrics(history):
    '''
    Plot how each metric evolves across training epochs.
    @Args:
        history: the training history returned by model.fit
    '''
    import matplotlib.pyplot as plt
    # loss
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.subplot(2, 2, 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    # f1
    f1 = history.history['f1_score']
    val_f1 = history.history['val_f1_score']
    plt.subplot(2, 2, 2)
    plt.plot(epochs, f1, 'bo', label='Training f1')
    plt.plot(epochs, val_f1, 'b', label='Validation f1')
    plt.title('Training and validation f1')
    plt.xlabel('Epochs')
    plt.ylabel('F1')
    plt.legend()
    # precision
    prec = history.history['precision']
    val_prec = history.history['val_precision']
    plt.subplot(2, 2, 3)
    plt.plot(epochs, prec, 'bo', label='Training precision')
    plt.plot(epochs, val_prec, 'b', label='Validation precision')
    plt.title('Training and validation precision')
    plt.xlabel('Epochs')
    plt.ylabel('Precision')
    plt.legend()
    # recall
    recall = history.history['recall']
    val_recall = history.history['val_recall']
    plt.subplot(2, 2, 4)
    plt.plot(epochs, recall, 'bo', label='Training recall')
    plt.plot(epochs, val_recall, 'b', label='Validation recall')
    plt.title('Training and validation recall')
    plt.xlabel('Epochs')
    plt.ylabel('Recall')
    plt.legend()
    plt.show()
scripts_common = '''
document.getElementsByClassName = function (Name, e, tag) {
    var ele = [],
        allEle,
        length,
        i = 0;
    if (typeof tag === "undefined"){
        tag = "*"
    }
    if (typeof e === "undefined"){
        e = document;
    }
    allEle = e.getElementsByTagName(tag);
    for (length = allEle.length; i < length; i = i + 1){
        if (allEle[i].className === Name) {
            ele.push(allEle[i]);
        }
    }
    return ele;
}
document.countElementById = function (id, e, tag) {
    var ele = [],
        allEle,
        length,
        i = 0;
    if (typeof tag === "undefined"){
        tag = "*"
    }
    if (typeof e === "undefined"){
        e = document;
    }
    allEle = e.getElementsByTagName(tag);
    for (length = allEle.length; i < length; i = i + 1){
        if (allEle[i].id === id) {
            ele.push(allEle[i]);
        }
    }
    return ele;
}
/* A simple JS Set class implementation */
function Set() {
    this.dataStore = [];
    this.add = add;               // add an element
    this.remove = remove;         // remove an element
    this.size = size;             // number of elements in the set
    this.union = union;           // union of two sets
    this.contains = contains;     // whether the set contains an element
    this.intersect = intersect;   // intersection of two sets
    this.subset = subset;         // whether this set is a subset of another
    this.difference = difference; // set difference
    this.show = show;             // render the set's elements
}
function add(data) {
    if (this.dataStore.indexOf(data) < 0) {
        this.dataStore.push(data);
        return true;
    }
    else {
        return false;
    }
}
function remove(data) {
    var pos = this.dataStore.indexOf(data);
    if (pos > -1) {
        this.dataStore.splice(pos, 1);
        return true;
    }
    else {
        return false;
    }
}
function size() {
    return this.dataStore.length;
}
function show() {
    return "[" + this.dataStore + "]";
}
function contains(data) {
    if (this.dataStore.indexOf(data) > -1) {
        return true;
    }
    else {
        return false;
    }
}
function union(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        tempSet.add(this.dataStore[i]);
    }
    for (var i = 0; i < set.dataStore.length; ++i) {
        if (!tempSet.contains(set.dataStore[i])) {
            tempSet.dataStore.push(set.dataStore[i]);
        }
    }
    return tempSet;
}
function intersect(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (set.contains(this.dataStore[i])) {
            tempSet.add(this.dataStore[i]);
        }
    }
    return tempSet;
}
function subset(set) {
    if (this.size() > set.size()) {
        return false;
    }
    else {
        for (var member in this.dataStore) {
            if (!set.contains(member)) {
                return false;
            }
        }
    }
    return true;
}
function difference(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (!set.contains(this.dataStore[i])) {
            tempSet.add(this.dataStore[i]);
        }
    }
    return tempSet;
}
function check(node, set_url){
    if(node.nodeType != 1){
        return false;
    }
    var label_flag = false;
    var list_a = node.getElementsByTagName("a");
    if(list_a.length == set_url.size()){
        return true;
    }else{
        return false;
    }
}
function getRemoveList(node, recurse, list_remove){
    var pattern = /(上一?篇|下一?篇|作者|点击数|发布时间|发布日期|更新日期|更新时间|字体|字号|来源|阅读次?数|浏览次?数|点击次?数|本站编辑|编辑人|关键字|上一条|下一条)|(打印|关闭窗口|回到顶部|现在的位置|首页|分享)/
    if(node.childNodes == null || node.childNodes.length <= 0){
        return;
    }
    for(var i = 0; i < node.childNodes.length; i++){
        _child = node.childNodes[i];
        if(_child.nodeType == 3){
            _match = _child.textContent.toString().match(pattern);
            if(_match != null){
                if(_match[1] != null){
                    if(node.textContent.toString().trim().length - _match[1].length < 3){
                        _soup = node.parentNode.tagName.toLowerCase() + ":contains(" + _match[0] + ")";
                    }else{
                        _soup = node.tagName.toLowerCase() + ":contains(" + _match[0] + ")";
                    }
                }else{
                    _soup = node.tagName.toLowerCase() + ":contains(" + _match[0] + ")";
                }
                list_remove.push(_soup);
            }
        }
        if(_child.nodeType == 1 && recurse){
            getRemoveList(_child, recurse, list_remove)
        }
    }
}
function getListXpath(el, list_xpath, getRemove){
    if (el == document || el == document.body){
        return list_xpath;
    }
    if(getRemove){
        _array = new Array();
        getRemoveList(el, true, _array);
        list_xpath.push([getXpath(el), _array])
    }else{
        list_xpath.push(getXpath(el))
    }
    return getListXpath(el.parentNode, list_xpath, getRemove);
}
function getXpath(el, b, notfirst){
    if (el.id != "" && document.countElementById(el.id).length == 1){
        var _jump_flag = false;
        if(b != null){
            for(var i = 0; i < b.length; i++){
                if(el.tagName.toLowerCase() == b[i]){
                    _jump_flag = true;
                }
            }
        }
        if(notfirst){
            _jump_flag = true;
        }
        if(!_jump_flag){
            //return '//*[@id=\"'+el.id+'\"]';
            return '//' + el.tagName.toLowerCase() + '[@id=\"' + el.id + '\"]';
        }
    }
    if (el.getAttribute("class") != null && document.getElementsByClassName(el.getAttribute("class")).length == 1){
        if(!notfirst){
            //return '//*[@class=\"'+el.getAttribute("class")+'\"]';
            return '//' + el.tagName.toLowerCase() + '[@class=\"' + el.getAttribute("class") + '\"]';
        }
    }
    if (el == document.body){
        return '/html/' + el.tagName.toLowerCase();
    }
    var ix = 1;
    siblings = el.parentNode.childNodes;
    for (var i = 0, l = siblings.length; i < l; i++){
        var sibling = siblings[i];
        if (sibling == el){
            return getXpath(el.parentNode, b) + '/' + el.tagName.toLowerCase() + '[' + (ix) + ']';
        }else if (sibling.tagName == el.tagName){
            ix++;
        }
    }
}
function getJsoup(node){
    var _nodeName = node.tagName.toLowerCase();
    var _nodeText = node.innerText;
    if(_nodeText == null || _nodeText == ""){
        return null;
    }
    var counts = 0;
    var list_node = document.getElementsByTagName(_nodeName);
    for(var i = 0; i < list_node.length; i++){
        var _node = list_node[i];
        if(_node.innerText != null && _node.innerText.indexOf(_nodeText) >= 0){
            counts += 1;
        }
    }
    if(counts != 1){
        return null;
    }
    var jsoup = _nodeName + ':contains(' + _nodeText.trim() + ')';
    return jsoup;
}
function getOffsetLeft(el){
    return el.offsetParent
        ? el.offsetLeft + getOffsetLeft(el.offsetParent)
        : el.offsetLeft;
}
function getOffsetTop(el){
    return el.offsetParent
        ? el.offsetTop + getOffsetTop(el.offsetParent)
        : el.offsetTop;
}
function search_pageBt(node, type, list_hitTag, pattern_page){
    var find_flag = false;
    if(node != null && node.nodeName.toLowerCase() == "a"){
        list_hitTag.push([node, type, getOffsetLeft(node), getOffsetTop(node)])
    }else{
        if(node.childNodes == null){
        }else{
            for(var i = 0; i < node.childNodes.length; i++){
                child = node.childNodes[i];
                if(child != null && child.tagName != null && (child.tagName.toLowerCase() == "script" || child.tagName.toLowerCase() == "select")){
                    continue;
                }
                child_innerText = child.innerText;
                if(child_innerText != null){
                    _match = child_innerText.match(pattern_page)
                    if(_match != null){
                        var _type = _match[1] ? "nextPage" : (_match[2] ? "lastPage" : (_match[3] ? "firstPage" : (_match[4] ? "tailPage" : "other")))
                        search_pageBt(child, _type, list_hitTag, pattern_page);
                        find_flag = true;
                    }
                }
            }
        }
        if(!find_flag){
            list_hitTag.push([node, type, getOffsetLeft(node), getOffsetTop(node)]);
        }
    }
}
// cluster the matched paging tags by their vertical position
function clustering(list_hitTag){
    var list_cluster = new Array();
    for(var i = 0; i < list_hitTag.length; i++){
        var _find_flag = false;
        for(var j = 0; j < list_cluster.length; j++){
            if(Math.abs(list_cluster[j][1] - list_hitTag[i][3]) < 20){
                list_cluster[j][2].push([list_hitTag[i][0], list_hitTag[i][1]]);
                if(list_hitTag[i][0].tagName.toLowerCase() == "a" || list_hitTag[i][0].onclick != null){
                    list_cluster[j][3] += 1;
                }
                _find_flag = true;
            }
        }
        if(!_find_flag){
            var _click_num = 0;
            if(list_hitTag[i][0].tagName.toLowerCase() == "a" || list_hitTag[i][0].onclick != null){
                _click_num = 1;
            }
            list_cluster.push([list_hitTag[i][2], list_hitTag[i][3], [[list_hitTag[i][0], list_hitTag[i][1]]], _click_num]);
        }
    }
    var _list_max_cluster = new Array();
    var _max = 0;
    for(var k = 0; k < list_cluster.length; k++){
        _prob = list_cluster[k][2].length * 0.5 + list_cluster[k][3] * 0;
        if(_prob > _max){
            _max = _prob;
            _list_max_cluster = list_cluster[k][2];
        }
    }
    return _list_max_cluster;
}
function clustering_turnPage(){
    //var pattern_page = /((?<nextPage>下一?页|>>|>)|(?<lastPage>上一?页|<<|<)|(?<firstPage>首页|第一页)|(?<tailPage>尾页)|(?<other>\.{1,2}|共\d[条页]|\d+\/\d+))/ //phantomjs does not support named groups
    var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
    var pattern_nextPage = /[Nn]ext/
    var list_hitTag = new Array();
    //search_pageBt(document,"other",list_hitTag,pattern_page)
    for(var i = 0; i < document.all.length; i++){
        var node = document.all[i];
        if(!((getOffsetLeft(node) > 0 && getOffsetTop(node) > 0))){
            continue;
        }
        if(node.tagName.toLowerCase() == "script"){
            continue;
        }
        var _value = node.getAttribute("value");
        if(_value == null){
            _value = "";
        }
        var _title = node.getAttribute("title");
        if(_title == null){
            _title = "";
        }
        var _text = "";
        if(node != null && node.innerText != null){
            _text = node.innerText;
        }
        if (_text != null && _text != "" && node.tagName.toLowerCase() != "option"){
            _match = _text.match(pattern_page)
            if(_match != null){
                var _type = _match[1] ? "nextPage" : (_match[2] ? "lastPage" : (_match[3] ? "firstPage" : (_match[4] ? "tailPage" : "other")))
                list_hitTag.push([node, _type, getOffsetLeft(node), getOffsetTop(node)]);
            }
        }else if (_value != null && _value != "" && node.tagName.toLowerCase() != "option"){
            _match = _value.match(pattern_page)
            if(_match != null){
                var _type = _match[1] ? "nextPage" : (_match[2] ? "lastPage" : (_match[3] ? "firstPage" : (_match[4] ? "tailPage" : "other")))
                list_hitTag.push([node, _type, getOffsetLeft(node), getOffsetTop(node)]);
            }
        }else if (_title != null && _title != ""){
            _match = _title.match(pattern_page)
            if(_match != null){
                var _type = _match[1] ? "nextPage" : (_match[2] ? "lastPage" : (_match[3] ? "firstPage" : (_match[4] ? "tailPage" : "other")))
                list_hitTag.push([node, _type, getOffsetLeft(node), getOffsetTop(node)]);
            }
        }else if(node != null && node.getAttribute("class") != null && node.getAttribute("class").match(pattern_nextPage) != null){
            list_hitTag.push([node, "nextPage", getOffsetLeft(node), getOffsetTop(node)]);
        }
    }
    var _find = false;
    for(var i = 0; i < list_hitTag.length; i++){
        if(list_hitTag[i][0].innerText == ">"){
            _find = true;
        }
    }
    if(_find){
        for(var i = 0; i < list_hitTag.length; i++){
            if(list_hitTag[i][0].innerText == ">>"){
                list_hitTag[i][1] = "tailPage"
            }
        }
    }
    list_cluster = clustering(list_hitTag);
    return list_cluster;
}
function findElements_byXpath(STR_XPATH) {
    var xresult = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var xnodes = [];
    var xres;
    while (xres = xresult.iterateNext()) {
        xnodes.push(xres);
    }
    return xnodes;
}
'''
scripts_replaceXpath = '''
function findElements_byXpath(STR_XPATH) {
    var xresult = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var xnodes = [];
    var xres;
    while (xres = xresult.iterateNext()) {
        xnodes.push(xres);
    }
    return xnodes;
}
function replaceXpath(_xpath){
    // Walk the xpath from the leaf upwards; when a positional step (tag[n])
    // can be replaced by an attribute-based step that uniquely identifies
    // the node, rewrite the path from there.
    var list_path = _xpath.split("/");
    var _replaced_xpath = "";
    var aim_att = ["height", "width", "align", "valign", "border", "bgcolor", "style"]
    for(var i = list_path.length - 1; i >= 0; i--){
        var _path = list_path[i];
        if(_path.indexOf("]") >= 0){
            if(_path.indexOf("@") >= 0){
                _replaced_xpath = "//" + _path;
                return _xpath;
            }else if(_path == "html"){
                return _xpath;
            }else{
                _temp_xpath = list_path.slice(0, i + 1).join("/")
                _temp_nodes = findElements_byXpath(_temp_xpath)
                if(_temp_nodes.length == 1){
                    var _count = 0;
                    var gen_xpath = "";
                    for(var j = 0; j < _temp_nodes[0].attributes.length; j++){
                        var _att = _temp_nodes[0].attributes[j];
                        _head = _att.name
                        if(aim_att.indexOf(_head) >= 0){
                            _count += 1;
                            if(gen_xpath == ""){
                                gen_xpath = "//" + _temp_nodes[0].tagName.toLowerCase() + "[@" + _att.name + '=\"' + _att.value + '\"]';
                            }else{
                                gen_xpath = gen_xpath + "[@" + _att.name + '=\"' + _att.value + '\"]';
                            }
                        }
                    }
                    if(_count >= 2){
                        var _find_nodes = findElements_byXpath(gen_xpath);
                        if(_find_nodes.length == 1){
                            return gen_xpath + _replaced_xpath
                        }else{
                            _replaced_xpath = "/" + _path + _replaced_xpath;
                        }
                    }else{
                        _replaced_xpath = "/" + _path + _replaced_xpath;
                    }
                }else{
                    _replaced_xpath = "/" + _path + _replaced_xpath;
                }
            }
        }else{
            if(_path != ""){
                _replaced_xpath = "/" + _path + _replaced_xpath;
            }
        }
    }
    return _replaced_xpath;
}
return replaceXpath(arguments[0]);
'''