# -*- coding: utf-8 -*-
'''
Created on 2018-12-20

@author: User
'''
import numpy as np
import re
import gensim
from keras import backend as K
import ctypes
import inspect
import time

w2vfile = "../wiki_128_word_embedding_new.vector"
model_w2v = None
from decimal import Decimal
import logging
#logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
import pickle
import tensorflow as tf
from keras import losses
import threading

__author__ = 'baniu.yao'


class MyThread(threading.Thread):
    def __init__(self, func, args=()):
        super(MyThread, self).__init__()
        self.func = func
        self.args = args

    def run(self):
        self.result = self.func(*self.args)

    def get_result(self):
        try:
            return self.result
        except Exception as e:
            print('Exception raised while executing js:', e)
            return None

def get_js_rs(browser, script, *arg, timeout=20):
    '''
    Execute a script in the browser and return its result, aborting on timeout.
    :param browser: browser object
    :param script: the script to execute
    :param arg: script arguments
    :param timeout: timeout in seconds
    :return:
    '''
    def execute_js():
        data = browser.execute_script(script, *arg)
        return data
    t = MyThread(func=execute_js, args=())
    t.daemon = True
    t.start()
    t.join(timeout)
    if t.is_alive():
        print('js execution timed out')
        stop_thread(t)
        return None
    data = t.get_result()
    return data

def thread_run(func, *arg, timeout=30):
    t = MyThread(func=func, args=(*arg,))
    t.daemon = True
    t.start()
    t.join(timeout)
    if t.is_alive():
        print('thread_run time out')
    result = t.get_result()
    return result

def xpath2css(xpath):
    '''
    Convert an xpath expression to a css selector.
    :param xpath:
    :return:
    '''
    xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
    for it in re.finditer('\[(\d)\]', xpath):
        xpath = xpath.replace(it.group(0), ':nth-child(%s)'%it.group(1))
    if xpath[0] == '>':
        xpath = xpath[1:]
    return xpath

def get_class_from_frame(fr):
    args, _, _, value_dict = inspect.getargvalues(fr)
    if len(args) and args[0] == 'self':
        instance = value_dict.get('self', None)
        if instance:
            return getattr(instance, '__class__', None)
    return None

class CLog(object):
    def __init__(self, log_file_path='./test.log'):
        logging.basicConfig(level=logging.INFO, filemode='a', format='%(asctime)s %(message)s')
        self.logger = logging.getLogger("single_server")
        console = logging.FileHandler(log_file_path, encoding="UTF8")
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s [%(chain)s] %(thread)s %(threadName)s')
        console.setFormatter(formatter)
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(console)

    def get_file_name_in_full_path(self, file_path):
        return file_path.split('/')[-1]

    def get_meta_data(self):
        frames = inspect.stack()
        chain_list = []
        for i in range(0, len(frames)-1):
            _, file_path, _, func_name, _, _ = frames[i]
            file_name = self.get_file_name_in_full_path(file_path)
            try:
                args = re.findall('\((.*)\)', frames[i+1][-2][0])[0]
            except Exception as e:
                args = ""
            current_chain = '%s(%s)' % (func_name, args)
            chain_list.append(current_chain)
        chain_list.reverse()
        return ' --> '.join(chain_list[:-2])

    def info(self, message):
        chain = self.get_meta_data()
        self.logger.info(message, extra={"chain": chain})

    def error(self, message):
        chain = self.get_meta_data()
        self.logger.error(message, extra={"chain": chain})

    def debug(self, message):
        chain = self.get_meta_data()
        self.logger.debug(message, extra={"chain": chain})

def add_err_msg(_dict, msg):
    _key = "err_msg"
    if _key in _dict:
        if re.search(msg, _dict[_key]) is None:
            _dict[_key] = _dict[_key]+msg
    else:
        _dict[_key] = msg
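# Usage sketch (illustrative only; this demo function is not part of the original
# module and its name is hypothetical): thread_run guards a slow call with a
# timeout, and xpath2css turns a simple positional xpath into a css selector.
def _demo_thread_and_xpath():
    def slow_add(a, b):
        time.sleep(1)
        return a + b
    # completes well inside the 5s budget, so the result comes back
    print(thread_run(slow_add, 2, 3, timeout=5))   # 5
    # positional indices become :nth-child() steps
    print(xpath2css('//div[2]/span'))              # div:nth-child(2)>span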
def _async_raise(tid, exctype):
    """raises the exception, performs cleanup if needed"""
    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")

def stop_thread(thread):
    _async_raise(thread.ident, SystemExit)

_log = CLog()

def log(msg):
    '''
    @summary: print an info message
    '''
    _log.info(msg)

def error(msg):
    _log.error(msg)

def debug(msg):
    _log.debug(msg)

def save(object_to_save, path):
    '''
    Save an object to disk.
    @Args:
        object_to_save: the object to save
    @Return:
        the path it was saved to
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)

def load(path):
    '''
    Load an object from disk.
    @Args:
        path: the path to read from
    @Return:
        the loaded object
    '''
    with open(path, 'rb') as f:
        object1 = pickle.load(f)
        return object1

def find_index(list_tofind, text):
    '''
    @summary: find the first occurrence of each word in a string
    @param:
        list_tofind: the words to look for
        text: the string to search
    @return: list of each word's first index (-1 if absent)
    '''
    result = []
    for item in list_tofind:
        index = text.find(item)
        if index>=0:
            result.append(index)
        else:
            result.append(-1)
    return result

def combine(list1, list2):
    '''
    @summary: concatenate every string of list1 with every string of list2
    @param:
        list1: list of strings
        list2: list of strings
    @return: list of concatenations
    '''
    result = []
    for item1 in list1:
        for item2 in list2:
            result.append(str(item1)+str(item2))
    return result

def getDigitsDic(unit):
    '''
    @summary: map a Chinese numeral to its digit value
    '''
    DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
                 "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
    return DigitsDic.get(unit)

def getMultipleFactor(unit):
    '''
    @summary: map a Chinese unit to its multiplier
    '''
    MultipleFactor = {"兆":Decimal(1000000000000), "亿":Decimal(100000000), "万":Decimal(10000),
                      "仟":Decimal(1000), "千":Decimal(1000), "佰":Decimal(100), "百":Decimal(100),
                      "拾":Decimal(10), "十":Decimal(10), "元":Decimal(1),
                      "角":round(Decimal(0.1), 1), "分":round(Decimal(0.01), 2)}
    return MultipleFactor.get(unit)

def getUnifyMoney(money):
    '''
    @summary: convert a Chinese money string into a numeric amount
    @param:
        money: money string in Chinese
    @return: Decimal, the numeric amount
    '''
    #strip commas first
    money = re.sub("[,,]", "", money)
    result = Decimal(0)
    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾", "元", "角", "分"]
    LowMoneypattern = re.compile("^(\d+,?)+(\.\d+)?$")
    BigMoneypattern = re.compile("^[%s]$"%("".join(chnDigits)))
    if re.search(LowMoneypattern, money) is not None:
        return Decimal(money)
    elif re.search(BigMoneypattern, money) is not None:
        return getDigitsDic(money)
    for factorUnit in chnFactorUnits:
        if re.search(re.compile(".*%s.*"%(factorUnit)), money) is not None:
            subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit, factorUnit)), money)
            if re.search(re.compile("^(\d+(,)?)+(\.\d+)?$"), subMoneys[0]) is not None:
                result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
            elif len(subMoneys[0])==1:
                if re.search(re.compile("^[%s]$"%("".join(chnDigits))), subMoneys[0]) is not None:
                    result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
            else:
                result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
            if len(subMoneys)>1:
                if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None:
                    result += Decimal(subMoneys[1])
                elif len(subMoneys[1])==1:
                    if re.search(re.compile("^[%s]$"%("".join(chnDigits))), subMoneys[1]) is not None:
                        result += Decimal(getDigitsDic(subMoneys[1]))
                else:
                    result += Decimal(getUnifyMoney(subMoneys[1]))
            break
    return result
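# Usage sketch (illustrative only, demo name hypothetical): getUnifyMoney parses
# both Chinese-numeral and plain digit money strings into Decimal values.
def _demo_getUnifyMoney():
    print(getUnifyMoney("壹佰贰拾叁元"))   # 123
    print(getUnifyMoney("伍万"))           # 50000
    print(getUnifyMoney("1,234.5"))        # commas are stripped first -> 1234.5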
def mergeDict(list_dict):
    new_dict = dict()
    _flag = True
    hasDrew = False
    err_msg = ""
    for _dict in list_dict:
        if _dict is None:
            _flag = False
            continue
        for key in _dict.keys():
            if key=="flag":
                if not _dict[key]:
                    _flag = _dict[key]
            else:
                if key=="err_msg":
                    err_msg += _dict[key]
                new_dict[key] = _dict[key]
                if key=="hasDrew":
                    hasDrew = hasDrew or _dict[key]
    new_dict["flag"] = _flag
    new_dict["hasDrew"] = hasDrew
    new_dict["err_msg"] = err_msg
    count_rules = 0
    for _key in new_dict.keys():
        if _key not in ["flag", "success", "count_rules"] and new_dict[_key]!="" and new_dict[_key] is not None:
            count_rules += 1
    new_dict["count_rules"] = count_rules
    return new_dict

def getCommonXpath(list_xpaths, on_value=0.6):
    CommonXpath = None
    if len(list_xpaths)>0:
        MAX_LEN = max([len(x) for x in list_xpaths])
        for i in range(MAX_LEN):
            _xpath = None
            _same_count = 0
            for j in range(len(list_xpaths)):
                # reconstructed (assumed): compare the i-th entry across all lists,
                # the original comparison block was lost in extraction
                if i<len(list_xpaths[j]):
                    if _xpath is None:
                        _xpath = list_xpaths[j][i]
                    if list_xpaths[j][i]==_xpath:
                        _same_count += 1
            if _same_count/len(list_xpaths)>=on_value:
                CommonXpath = _xpath
    return CommonXpath

def getModel_w2v():
    '''
    @summary: load the word2vec model
    '''
    global model_w2v
    if model_w2v is None:
        model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2vfile, binary=True)
    return model_w2v

def findAllIndex(substr, wholestr):
    '''
    @summary: find every begin_index of a substring within a string
    @param:
        substr: the substring
        wholestr: the full string containing it
    @return: list of all begin_index values of the substring
    '''
    copystr = wholestr
    result = []
    indexappend = 0
    while(True):
        index = copystr.find(substr)
        if index<0:
            break
        else:
            result.append(indexappend+index)
            indexappend += index+len(substr)
            copystr = copystr[index+len(substr):]
    return result

def spanWindow(tokens, begin_index, end_index, size):
    '''
    @summary: get the context words around an entity
    @param:
        tokens: tokenized sentence as a list
        begin_index: start index of the entity
        end_index: end index of the entity
        size: how many words to take on each side
    @return: list, the context words around the entity
    '''
    length_tokens = len(tokens)
    if begin_index>size:
        begin = begin_index-size
    else:
        begin = 0
    # right-edge clipping reconstructed (assumed); the original lines were lost
    if end_index+size<length_tokens:
        end = end_index+size
    else:
        end = length_tokens
    return tokens[begin:end]

def embedding(datas, shape):
    '''
    @summary: look up word vectors for tokenized inputs
    (function header and loop scaffolding reconstructed; the name `embedding`
    and the loop variables are assumed, only the inner lookup survived)
    '''
    model_w2v = getModel_w2v()
    embed = np.zeros(shape)
    length = shape[1]
    out_index = 0
    for data in datas:
        index = 0
        for item in data:
            item_not_space = re.sub("\s*", "", item)
            if index>=length:
                break
            if item_not_space in model_w2v.vocab:
                embed[out_index][index] = model_w2v[item_not_space]
                index += 1
            else:
                #embed[out_index][index] = model_w2v['unk']
                index += 1
        out_index += 1
    return embed

def partMoney(entity_text, input2_shape=[7]):
    '''
    @summary: bucket a money value by magnitude
    @param:
        entity_text: numeric money value
        input2_shape: number of buckets
    @return: array, one-hot encoding of the bucket
    '''
    money = float(entity_text)
    parts = np.zeros(input2_shape)
    if money<100:
        parts[0] = 1
    elif money<1000:
        parts[1] = 1
    elif money<10000:
        parts[2] = 1
    elif money<100000:
        parts[3] = 1
    elif money<1000000:
        parts[4] = 1
    elif money<10000000:
        parts[5] = 1
    else:
        parts[6] = 1
    return parts

def recall(y_true, y_pred):
    '''
    Compute recall.
    @Args:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        recall
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    if c3 == 0:
        return 0
    recall = c1 / c3
    return recall

def f1_score(y_true, y_pred):
    '''
    Compute F1.
    @Args:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        F1 value
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    precision = c1 / c2
    if c3 == 0:
        recall = 0
    else:
        recall = c1 / c3
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

def precision(y_true, y_pred):
    '''
    Compute precision.
    @Args:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        precision
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = c1 / c2
    return precision

def acc(y_true, y_pred):
    '''
    Compute accuracy.
    '''
    c1 = tf.reduce_mean(
        tf.cast(
            tf.equal(
                tf.matmul(tf.cast(tf.argmax(y_true, 1), tf.float64), tf.constant([[0], [1]], dtype=tf.float64)),
                tf.matmul(tf.cast(tf.argmax(y_pred, 1), tf.float64), tf.constant([[0], [1]], dtype=tf.float64))),
            tf.float32))
    return c1

def my_loss(y_true, y_pred):
    return -tf.reduce_mean(y_true*tf.log(y_pred))
    #return losses.categorical_crossentropy(y_true, y_pred)+(1-tf.reduce_mean(tf.cast(tf.equal(tf.matmul(tf.cast(tf.argmax(y_true,1),tf.float64),tf.constant([[0],[1]],dtype=tf.float64)),tf.matmul(tf.cast(tf.argmax(y_pred,1),tf.float64),tf.constant([[0],[1]],dtype=tf.float64))),tf.float32)))
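# Usage sketch (illustrative only, demo name hypothetical): findAllIndex returns
# every start offset of a substring, and partMoney one-hot encodes a money value
# into one of seven magnitude buckets.
def _demo_find_and_part():
    print(findAllIndex("ab", "abcabd"))   # [0, 3]
    print(partMoney("250000"))            # [0. 0. 0. 0. 1. 0. 0.] -- 1e5 <= 250000 < 1e6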
def print_metrics(history):
    '''
    Plot how each metric evolves across training epochs.
    @Args:
        history: the training history returned by model.fit
    '''
    import matplotlib.pyplot as plt

    # loss plot
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.subplot(2, 2, 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # f1 plot
    f1 = history.history['f1_score']
    val_f1 = history.history['val_f1_score']
    plt.subplot(2, 2, 2)
    plt.plot(epochs, f1, 'bo', label='Training f1')
    plt.plot(epochs, val_f1, 'b', label='Validation f1')
    plt.title('Training and validation f1')
    plt.xlabel('Epochs')
    plt.ylabel('F1')
    plt.legend()

    # precision plot
    prec = history.history['precision']
    val_prec = history.history['val_precision']
    plt.subplot(2, 2, 3)
    plt.plot(epochs, prec, 'bo', label='Training precision')
    plt.plot(epochs, val_prec, 'b', label='Validation precision')
    plt.title('Training and validation precision')
    plt.xlabel('Epochs')
    plt.ylabel('Precision')
    plt.legend()

    # recall plot
    recall = history.history['recall']
    val_recall = history.history['val_recall']
    plt.subplot(2, 2, 4)
    plt.plot(epochs, recall, 'bo', label='Training recall')
    plt.plot(epochs, val_recall, 'b', label='Validation recall')
    plt.title('Training and validation recall')
    plt.xlabel('Epochs')
    plt.ylabel('Recall')
    plt.legend()

    plt.show()

scripts_common = '''
document.getElementsByClassName = function (Name, e, tag) {
    var ele = [], allEle, length, i = 0;
    if (typeof tag === "undefined"){
        tag = "*";
    }
    if (typeof e === "undefined"){
        e = document;
    }
    allEle = e.getElementsByTagName(tag);
    for (length = allEle.length; i < length; i = i + 1){
        if (allEle[i].className === Name) {
            ele.push(allEle[i]);
        }
    }
    return ele;
}

document.countElementById = function (id, e, tag) {
    var ele = [], allEle, length, i = 0;
    if (typeof tag === "undefined"){
        tag = "*";
    }
    if (typeof e === "undefined"){
        e = document;
    }
    allEle = e.getElementsByTagName(tag);
    for (length = allEle.length; i < length; i = i + 1){
        if (allEle[i].id === id) {
            ele.push(allEle[i]);
        }
    }
    return ele;
}

/* implementation of a js Set class */
function Set() {
    this.dataStore = [];
    this.add = add;               // add an element
    this.remove = remove;         // remove an element
    this.size = size;             // number of elements in the set
    this.union = union;           // union of two sets
    this.contains = contains;     // whether the set contains an element
    this.intersect = intersect;   // intersection
    this.subset = subset;         // whether this set is a subset of another
    this.difference = difference; // difference
    this.show = show;             // display the elements of the set
}

function add(data) {
    if (this.dataStore.indexOf(data) < 0) {
        this.dataStore.push(data);
        return true;
    } else {
        return false;
    }
}

function remove(data) {
    var pos = this.dataStore.indexOf(data);
    if (pos > -1) {
        this.dataStore.splice(pos, 1);
        return true;
    } else {
        return false;
    }
}

function size() {
    return this.dataStore.length;
}

function show() {
    return "[" + this.dataStore + "]";
}

function contains(data) {
    if (this.dataStore.indexOf(data) > -1) {
        return true;
    } else {
        return false;
    }
}

function union(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        tempSet.add(this.dataStore[i]);
    }
    for (var i = 0; i < set.dataStore.length; ++i) {
        if (!tempSet.contains(set.dataStore[i])) {
            tempSet.dataStore.push(set.dataStore[i]);
        }
    }
    return tempSet;
}
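/* Usage sketch (illustrative only, never executed by the crawler): how the Set
   helper above is meant to be used; intersect/difference are defined just below.
   var a = new Set(); a.add(1); a.add(2);
   var b = new Set(); b.add(2); b.add(3);
   a.union(b).show();      // "[1,2,3]"
   a.intersect(b).show();  // "[2]"
*/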
function intersect(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (set.contains(this.dataStore[i])) {
            tempSet.add(this.dataStore[i]);
        }
    }
    return tempSet;
}

function subset(set) {
    if (this.size() > set.size()) {
        return false;
    } else {
        for (var member in this.dataStore) {
            if (!set.contains(this.dataStore[member])) {
                return false;
            }
        }
    }
    return true;
}

function difference(set) {
    var tempSet = new Set();
    for (var i = 0; i < this.dataStore.length; ++i) {
        if (!set.contains(this.dataStore[i])) {
            tempSet.add(this.dataStore[i]);
        }
    }
    return tempSet;
}

function check(node, set_url){
    if(node.nodeType!=1){
        return false;
    }
    var list_a = node.getElementsByTagName("a");
    if(list_a.length==set_url.size()){
        return true;
    }else{
        return false;
    }
}

function getRemoveList(node, recurse, list_remove){
    var pattern = /(上一?篇|下一?篇|作者|点击数|发布时间|发布日期|更新日期|更新时间|字体|字号|来源|阅读次?数|浏览次?数|点击次?数|本站编辑|编辑人|关键字|上一条|下一条)|(打印|关闭窗口|回到顶部|现在的位置|首页|分享)/;
    if(node.childNodes==null || node.childNodes.length<=0){
        return;
    }
    /* the original loop body was lost in extraction; reconstruction (assumed):
       recurse into element children and collect nodes whose text hits the pattern */
    for(var i=0;i<node.childNodes.length;i++){
        var child = node.childNodes[i];
        if(child.nodeType!=1){
            continue;
        }
        if(child.innerText!=null && child.innerText.match(pattern)!=null){
            list_remove.push(child);
        }
        if(recurse){
            getRemoveList(child, recurse, list_remove);
        }
    }
}

/* function name and lookup loop assumed (header lost in extraction):
   builds a jsoup-style :contains selector that is unique on the page */
function getJsoup(node){
    var _nodeName = node.nodeName.toLowerCase();
    var _nodeText = node.innerText;
    var counts = 0;
    var allEle = document.getElementsByTagName(_nodeName);
    for(var i=0;i<allEle.length;i++){
        if(allEle[i].innerText!=null && allEle[i].innerText.indexOf(_nodeText)>=0){
            counts += 1;
        }
    }
    if(counts!=1){
        return null;
    }
    var jsoup = _nodeName+':contains('+_nodeText.trim()+')';
    return jsoup;
}

function getOffsetLeft(el){
    return el.offsetParent ? el.offsetLeft + getOffsetLeft(el.offsetParent) : el.offsetLeft;
}

function getOffsetTop(el){
    return el.offsetParent ? el.offsetTop + getOffsetTop(el.offsetParent) : el.offsetTop;
}

function search_pageBt(node, type, list_hitTag, pattern_page){
    var find_flag = false;
    if(node!=null && node.nodeName.toLowerCase()=="a"){
        list_hitTag.push([node, type, getOffsetLeft(node), getOffsetTop(node)]);
    }else{
        if(node.childNodes==null){
        }else{
            /* recursion reconstructed (assumed); the original loop body was lost */
            for(var i=0;i<node.childNodes.length;i++){
                search_pageBt(node.childNodes[i], type, list_hitTag, pattern_page);
            }
        }
    }
}

/* the clustering body was lost in extraction; minimal reconstruction (assumed):
   group candidate buttons by vertical offset and return the members of the
   highest-scoring cluster (only the final max-selection fragment survived) */
function clustering(list_hitTag){
    var list_cluster = [];   // entries assumed to be [offsetTop, probability, members]
    for(var i=0;i<list_hitTag.length;i++){
        var _top = list_hitTag[i][3];
        var _found = false;
        for(var k=0;k<list_cluster.length;k++){
            if(list_cluster[k][0]==_top){
                list_cluster[k][2].push(list_hitTag[i]);
                _found = true;
            }
        }
        if(!_found){
            list_cluster.push([_top, 0, [list_hitTag[i]]]);
        }
    }
    var _max = 0;
    var _list_max_cluster = null;
    for(var k=0;k<list_cluster.length;k++){
        var _prob = list_cluster[k][2].length/list_hitTag.length;
        list_cluster[k][1] = _prob;
        if(_prob>_max){
            _max = _prob;
            _list_max_cluster = list_cluster[k][2];
        }
    }
    return _list_max_cluster;
}
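/* Data-flow sketch (illustrative only): search_pageBt collects candidate paging
   links as [node, type, offsetLeft, offsetTop] tuples, and clustering keeps the
   densest group, e.g.
   var hits = [];
   search_pageBt(document.body, "other", hits, /下一?页|>>/);
   var best = clustering(hits);   // members of the highest-scoring cluster
*/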
function clustering_turnPage(){
    //var pattern_page = /((?<nextPage>下一?页|>>|>)|(?<lastPage>上一?页|<<|<)|(?<firstPage>首页|第一页)|(?<tailPage>尾页)|(?<other>\.{1,2}|共\d[条页]|\d+\/\d+))/
    //phantomjs does not support named groups
    var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?\s*$|(^\s*\.{1,2}\s*$|^.{0,10}共\s*\d+\s*[条页].{0,10}$|^.{0,10}\d+\/\d+.{0,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
    var pattern_nextPage = /[Nn]ext/
    var list_hitTag = new Array();
    //search_pageBt(document,"other",list_hitTag,pattern_page)
    /* loop head reconstructed (assumed): scan every element on the page */
    for(var i=0;i<document.all.length;i++){
        var node = document.all[i];
        if(!((getOffsetLeft(node)>0 && getOffsetTop(node)>0))){
            continue;
        }
        if(node.tagName.toLowerCase()=="script"){
            continue;
        }
        var _value = node.getAttribute("value");
        if(_value==null){
            _value = "";
        }
        var _title = node.getAttribute("title");
        if(_title==null){
            _title = "";
        }
        var _text = "";
        if(node!=null && node.innerText!=null){
            _text = node.innerText;
        }
        if (_text!=null && _text!="" && node.tagName.toLowerCase()!="option"){
            _match = _text.match(pattern_page)
            if(_match!=null){
                var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
                list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
            }
        }else if (_value!=null && _value!="" && node.tagName.toLowerCase()!="option"){
            _match = _value.match(pattern_page)
            if(_match!=null){
                var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
                list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
            }
        }else if (_title!=null && _title!=""){
            _match = _title.match(pattern_page)
            if(_match!=null){
                var _type = _match[1]?"nextPage":(_match[2]?"lastPage":(_match[3]?"firstPage":(_match[4]?"tailPage":"other")))
                list_hitTag.push([node,_type,getOffsetLeft(node),getOffsetTop(node)]);
            }
        }else if(node!=null && node.getAttribute("class")!=null && node.getAttribute("class").match(pattern_nextPage)!=null){
            list_hitTag.push([node,"nextPage",getOffsetLeft(node),getOffsetTop(node)]);
        }
    }
    /* if a plain ">" button exists, ">>" must be the tail-page button
       (loop heads reconstructed, assumed) */
    var _find = false;
    for(var i=0;i<list_hitTag.length;i++){
        if(list_hitTag[i][0].innerText==">"){
            _find = true;
        }
    }
    if(_find){
        for(var i=0;i<list_hitTag.length;i++){
            if(list_hitTag[i][0].innerText==">>"){
                list_hitTag[i][1] = "tailPage";
            }
        }
    }
    var list_cluster = clustering(list_hitTag);
    return list_cluster;
}

function findElements_byXpath(STR_XPATH) {
    var xresult = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var xnodes = [];
    var xres;
    while (xres = xresult.iterateNext()) {
        xnodes.push(xres);
    }
    return xnodes;
}
'''

scripts_replaceXpath = '''
function findElements_byXpath(STR_XPATH) {
    var xresult = document.evaluate(STR_XPATH, document, null, XPathResult.ANY_TYPE, null);
    var xnodes = [];
    var xres;
    while (xres = xresult.iterateNext()) {
        xnodes.push(xres);
    }
    return xnodes;
}

function replaceXpath(_xpath){
    var list_path = _xpath.split("/");
    var _replaced_xpath = "";
    var aim_att = ["height","width","align","valign","border","bgcolor","style"];
    for(var i=list_path.length-1;i>=0;i--){
        var _path = list_path[i];
        if(_path.indexOf("]")>=0){
            if(_path.indexOf("@")>=0){
                _replaced_xpath = "//"+_path;
                return _xpath;
            }else if(_path=="html"){
                return _xpath;
            }else{
                _temp_xpath = list_path.slice(0,i+1).join("/")
                _temp_nodes = findElements_byXpath(_temp_xpath)
                if(_temp_nodes.length==1){
                    var _count = 0;
                    var gen_xpath = "";
                    for(var j=0;j<_temp_nodes[0].attributes.length;j++){
                        var _att = _temp_nodes[0].attributes[j];
                        _head = _att.name
                        if(aim_att.indexOf(_head)>=0){
                            _count += 1;
                            if(gen_xpath==""){
                                gen_xpath = "//"+_temp_nodes[0].tagName.toLowerCase()+"[@"+_att.name+'=\"'+_att.value+'\"]';
                            }else{
                                gen_xpath = gen_xpath+"[@"+_att.name+'=\"'+_att.value+'\"]';
                            }
                        }
                    }
                    if(_count>=2){
                        var _find_nodes = findElements_byXpath(gen_xpath);
                        if(_find_nodes.length==1){
                            return gen_xpath+_replaced_xpath;
                        }else{
                            _replaced_xpath = "/"+_path + _replaced_xpath;
                        }
                    }else{
                        _replaced_xpath = "/"+_path + _replaced_xpath;
                    }
                }else{
                    _replaced_xpath = "/"+_path + _replaced_xpath;
                }
            }
        }else{
            if(_path!=""){
                _replaced_xpath = "/"+_path + _replaced_xpath;
            }
        }
    }
    return _replaced_xpath;
}
return replaceXpath(arguments[0]);
'''
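# Usage sketch (illustrative only; `browser` is assumed to be a selenium webdriver
# and the sample xpath is hypothetical): the two script blobs above are meant to
# be injected via execute_script, guarded by the get_js_rs timeout wrapper.
def _demo_inject_scripts(browser):
    # run the shared helpers against the current page
    get_js_rs(browser, scripts_common, timeout=10)
    # ask the page to rewrite a positional xpath into an attribute-based one
    return get_js_rs(browser, scripts_replaceXpath, "/html/body/div[2]/table", timeout=10)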