''' Created on 2019年8月8日 @author: User ''' import re import time from keras.preprocessing.sequence import pad_sequences scripts_title = ''' function statisticIframe(nodes){ var counts_communicateTags = 0; for(var i=0;imaxWidth){ maxWidth = child.offsetWidth; } if(child.offsetHeight>maxHeight){ maxHeight = child.offsetHeight; } } for(var i=0;imaxWidth){ maxWidth = child.offsetWidth; } if(child.offsetHeight>maxHeight){ maxHeight = child.offsetHeight; } } var _flag = true; var list_node_true = new Array(); for(var i=0;i100 && getOffsetTop(child_true)>0){ var _fontWeight = window.getComputedStyle(child_true).fontWeight var _weight = 400; if(_fontWeight=="normal"){ _weight = 400; }else if(_fontWeight=="bold"){ _weight = 700; }else if(_fontWeight=="lighter"){ _weight = 200; }else if(_fontWeight=="bolder"){ _weight = 600; }else{ _weight = parseInt(_fontWeight) } var _fontSize = parseInt(window.getComputedStyle(child_true).fontSize.match(/\d+/)[0]) list_candidate_title.push([[maxWidth,maxHeight,getOffsetLeft(child_true),getOffsetTop(child_true),child_true.offsetWidth,child_true.offsetHeight,_fontSize,_weight,child_true.counts_text,child_true.counts_lines,child_true.counts_punctuations,child_true.counts_communicateTags],child_true.innerHTML,getListXpath(child_true,new Array())]); } } return [_node_fontSize,false]; } } } } var list_candidate_title = new Array(); statistic(document,1); recursive_candidate_title(document,list_candidate_title,0,0); return list_candidate_title; ''' import module.htmlDrawing as hd import numpy as np import math from module.Utils import * def dealWithScriptOut(data,sort_index=3): list_input = [] list_inner = [] list_xpath = [] list_top = [] for index in range(len(data)): #clean nan for i in range(len(data[index][0])): if data[index][0][i] is None or math.isnan(data[index][0][i]): data[index][0][i] = -1 data.sort(key=lambda x:x[0][sort_index]) for item in data: list_input.append(item[0]) list_inner.append(item[1]) list_xpath.append(item[2]) list_top.append(item[0][3]) #print(len(data)) if len(list_input)>0: the_max = np.max(list_input,axis=0) the_max = np.array([x if x>0 else 1 for x in the_max]) the_max = np.array(list(the_max)[0:2]*3+[16,400,20,20,20,20]) input_x = np.array(list_input/the_max) return input_x,list_inner,list_xpath,list_top else: return None def getInput_byJS(browser,url): try: # browser = hd.getdriver() # debug("get driver") # hd.loadPage(browser, url) # data = browser.execute_script(scripts_common+scripts_title) data = get_js_rs(browser, scripts_common+scripts_title) deal_data = dealWithScriptOut(data) if deal_data is None: return False,"" else: input_x,list_inner,list_xpath,list_height = deal_data return True,[[np.expand_dims(input_x,0)],list_inner,list_xpath,list_height] except Exception as e: error(str(e)) err_msg = "" if re.search("frame",str(e)) is not None: err_msg = "#iframe#" return None,err_msg # finally: # hd.adddriver(browser) # debug("release driver") def encodeInput_byJS(url,targethtml): def label(innerhtml,target_source): target_source =re.sub("[\r\n\s]","",str(target_source)) pattern = ">(.*)<" target_source = re.findall(re.compile(pattern), target_source)[0] innerhtml = re.sub("[\r\n\s]","",str(innerhtml)) #print(target_source[0:40]) #print(element_source[0:40]) #if target_source[0:10]==element_source[0:10] and target_source[-10:]==element_source[-10]: if target_source==innerhtml: return 1 return 0 try: browser = hd.getdriver() debug("get driver") start = time.time() hd.loadPage(browser, url) print("get",time.time()-start) browser.maximize_window() start = time.time() # data = browser.execute_script(scripts_common+scripts_title) data = get_js_rs(browser, scripts_common+scripts_title) input_x,list_inner,_,_ = dealWithScriptOut(data) list_label = [] for item in list_inner: list_label.append(label(item, targethtml)) if len(list_label)>0 and np.sum(list_label)==1: return input_x,np.array(list_label) else: return None print("cost",time.time()-start) except Exception as e: print(e) finally: hd.adddriver(browser) debug("release driver") return None def dumpLinkTitle(): def trytosave(d): try: save(d,"1.pk") return 1 except Exception as e: return 0 import cx_Oracle as cx_Oracle conn=cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl') #连接数据库 cursor=conn.cursor() sql = " select page_link,page_title from DETAIL_CONTENT_HTML where page_link is not null and page_type=1 and page_title like '<%' and page_title not like '