'''
Created on 2019-08-08
@author: User
'''
- import re
- import time
- from keras.preprocessing.sequence import pad_sequences
# JavaScript snippet executed inside the loaded page to collect title candidates.
# The Python callers in this file run it as ``scripts_common + scripts_title``;
# getOffsetTop / getOffsetLeft / getListXpath are not defined in this snippet and
# are presumably supplied by ``scripts_common`` -- TODO confirm.
#
# What the script does:
#   * statistic(document, 1) annotates every DOM node with counts_text,
#     counts_lines, counts_punctuations, counts_communicateTags and deepth.
#   * recursive_candidate_title(...) walks the tree and pushes one entry per
#     candidate element into list_candidate_title, which the script returns.
#
# Each returned entry is [features, innerHTML, xpath] where features is, in order:
#   maxWidth, maxHeight, offsetLeft, offsetTop, offsetWidth, offsetHeight,
#   fontSize, fontWeight, counts_text, counts_lines, counts_punctuations,
#   counts_communicateTags
#
# NOTE: this is a non-raw Python string, so every backslash meant for the
# JavaScript regexes is written as ``\\`` here; the text handed to the browser
# therefore contains ``\s``, ``\n`` and ``\d``.
scripts_title = '''

function statisticIframe(nodes){
var counts_communicateTags = 0;
for(var i=0;i<nodes.length;i++){
child = nodes[i]
if (child.tagName!=null){
if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
counts_communicateTags += 1;
}
if(child.tagName.toLowerCase()=="iframe"){
if(child.contentWindow.document!=null){
counts_communicateTags += statisticIframe(child.contentWindow.document.all);
}
}
}
}
return counts_communicateTags;
}
function statistic(node,deepth){
if(node.childNodes==null){
node.counts_communicateTags = 0;
return node.counts_communicateTags;
}
node.counts_communicateTags = 0;
for(var i=0;i<node.childNodes.length;i++){
child = node.childNodes[i];
//删除标签
/*
if (child.tagName!=null){
if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
node.removeChild(child);
continue;
}
}
*/
if (child.tagName!=null){
if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
node.counts_communicateTags += 1;
}
}
/*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
}else{
node.counts_communicateTags += statistic(child,deepth+1);
}*/
node.counts_communicateTags += statistic(child,deepth+1);
}
var innertext = node.innerText;
if(innertext){
var text = innertext.replace(/\\s/g,'');
//var text = innertext;
node.counts_text = text.length;
var punc = text.match(/;|,|。|:|、/g);
var lines = innertext.match(/.{10}\\n/g);
if(lines){
node.counts_lines = lines.length;
}else{
node.counts_lines = 0;
}
if(punc){
node['counts_punctuations']= punc.length;
}else{
node.counts_punctuations = 0;
}

}else{
node.counts_lines = 0;
node.counts_text = 0;
node.counts_punctuations=0;
}
node.deepth = deepth;
return node.counts_communicateTags;
}
function recursive_candidate_title(node,list_candidate_title,maxWidth,maxHeight){
if(node==document){
var _flag = true;
var list_node_true = new Array();
for(var i=0;i<node.childNodes.length;i++){
child = node.childNodes[i];
if(child.offsetWidth>maxWidth){
maxWidth = child.offsetWidth;
}
if(child.offsetHeight>maxHeight){
maxHeight = child.offsetHeight;
}
}
for(var i=0;i<node.childNodes.length;i++){
var child = node.childNodes[i];
var _result = recursive_candidate_title(child,list_candidate_title,maxWidth,maxHeight);
if(_result!=null){
if(!_result[1]){
_flag = false;
}else{
list_node_true.push(child);
}
}
}

if(_flag){

}else{
for(var i=0;i<list_node_true.length;i++){
list_candidate_title.push([node,node.innerHTML]);
}

}
}else{
if(node.nodeType!=1){
return null;
}
if(node.innerText==null || node.innerText==""){
return null;
}
var _node_fontSize = window.getComputedStyle(node).fontSize;
if(node.childNodes==null){
return [_node_fontSize,true];
}else{
for(var i=0;i<node.childNodes.length;i++){
child = node.childNodes[i];
if(child.offsetWidth>maxWidth){
maxWidth = child.offsetWidth;
}
if(child.offsetHeight>maxHeight){
maxHeight = child.offsetHeight;
}
}

var _flag = true;
var list_node_true = new Array();
for(var i=0;i<node.childNodes.length;i++){
var child = node.childNodes[i];
var _result = recursive_candidate_title(child,list_candidate_title,maxWidth,maxHeight);
if(_result!=null){
if(!_result[1]){
_flag = false;
}else{
list_node_true.push(child);
}
if(_node_fontSize!=_result[0]){
_flag = false;
}
}
}

if(_flag){
return [_node_fontSize,true];
}else{
for(var i=0;i<list_node_true.length;i++){
var child_true = list_node_true[i]
if(child_true.offsetWidth>100 && getOffsetTop(child_true)>0){
var _fontWeight = window.getComputedStyle(child_true).fontWeight
var _weight = 400;
if(_fontWeight=="normal"){
_weight = 400;
}else if(_fontWeight=="bold"){
_weight = 700;
}else if(_fontWeight=="lighter"){
_weight = 200;
}else if(_fontWeight=="bolder"){
_weight = 600;
}else{
_weight = parseInt(_fontWeight)
}
var _fontSize = parseInt(window.getComputedStyle(child_true).fontSize.match(/\\d+/)[0])

list_candidate_title.push([[maxWidth,maxHeight,getOffsetLeft(child_true),getOffsetTop(child_true),child_true.offsetWidth,child_true.offsetHeight,_fontSize,_weight,child_true.counts_text,child_true.counts_lines,child_true.counts_punctuations,child_true.counts_communicateTags],child_true.innerHTML,getListXpath(child_true,new Array())]);
}

}
return [_node_fontSize,false];
}
}
}
}
var list_candidate_title = new Array();
statistic(document,1);
recursive_candidate_title(document,list_candidate_title,0,0);
return list_candidate_title;
'''
- import module.htmlDrawing as hd
- import numpy as np
- import math
- from module.Utils import *
def dealWithScriptOut(data,sort_index=3):
    """Turn the raw candidate list returned by the page script into model input.

    Each element of *data* is ``[features, innerHTML, xpath]`` where *features*
    is a list of 12 numbers.  The function

    1. replaces ``None`` / NaN feature values with ``-1`` (in place),
    2. sorts *data* in place by ``features[sort_index]`` (index 3 by default),
    3. scales the features by a fixed divisor vector.

    :param data: list of ``[features, innerHTML, xpath]`` entries; mutated.
    :param sort_index: position inside *features* used as the sort key.
    :return: ``(input_x, list_inner, list_xpath, list_top)`` where ``input_x``
        is the scaled feature array and ``list_top`` holds ``features[3]`` of
        every entry, or ``None`` when *data* is empty.
    """
    # Clean missing values first; the sort below cannot compare None / NaN sanely.
    for record in data:
        features = record[0]
        for pos, value in enumerate(features):
            if value is None or math.isnan(value):
                features[pos] = -1

    data.sort(key=lambda record: record[0][sort_index])

    list_input = [record[0] for record in data]
    list_inner = [record[1] for record in data]
    list_xpath = [record[2] for record in data]
    list_top = [record[0][3] for record in data]

    if not list_input:
        return None

    # Column-wise maxima, floored at 1 so they are safe to divide by.
    column_max = np.max(list_input,axis=0)
    column_max = np.array([v if v>0 else 1 for v in column_max])
    # Only the first two maxima are kept; they are repeated three times to
    # scale the six geometry columns.  The remaining six columns use fixed
    # divisors.
    page_extent = list(column_max)[0:2]
    the_max = np.array(page_extent*3+[16,400,20,20,20,20])
    input_x = np.array(list_input/the_max)
    return input_x,list_inner,list_xpath,list_top
def getInput_byJS(browser,url):
    """Run the title-candidate script in *browser* and package the model input.

    The page is expected to be loaded already; *url* is not used here and the
    driver is neither acquired nor released by this function.

    :param browser: driver handed to ``get_js_rs`` to execute the script.
    :param url: unused.
    :return: a 2-tuple
        * ``(True, [[batch], list_inner, list_xpath, list_top])`` on success,
          where ``batch`` is the feature array with a leading axis of size 1;
        * ``(False, "")`` when the script produced no candidates;
        * ``(None, err_msg)`` when anything raised; ``err_msg`` is
          ``"#iframe#"`` if the exception text contains ``"frame"``, else ``""``.
    """
    try:
        script = scripts_common+scripts_title
        deal_data = dealWithScriptOut(get_js_rs(browser, script))
        if deal_data is not None:
            input_x,list_inner,list_xpath,list_height = deal_data
            batch = np.expand_dims(input_x,0)
            return True,[[batch],list_inner,list_xpath,list_height]
        return False,""
    except Exception as e:
        message = str(e)
        error(message)
        err_msg = "#iframe#" if re.search("frame",message) is not None else ""
        return None,err_msg
def encodeInput_byJS(url,targethtml):
    """Load *url* in a pooled browser and build one labelled training sample.

    The candidate script is executed on the page, the candidates are converted
    with ``dealWithScriptOut`` and each candidate is labelled 1 when its
    innerHTML equals the inner markup of *targethtml* (whitespace ignored),
    otherwise 0.

    :param url: page to load.
    :param targethtml: markup of the known title element, e.g. ``<h1>...</h1>``.
    :return: ``(input_x, labels)`` when exactly one candidate matches the
        target; ``None`` when there are no candidates, when zero or several
        candidates match, or when any step raises (the exception is printed).
    """
    def label(innerhtml,target_source):
        # Compare with all whitespace removed so formatting differences between
        # the stored title and the live DOM do not matter.
        target_source = re.sub(r"[\r\n\s]","",str(target_source))
        # Keep only what lies between the first '>' and the last '<'.
        # IndexError on markup without such a span is handled by the caller's
        # except below.
        target_source = re.findall(re.compile(">(.*)<"), target_source)[0]
        innerhtml = re.sub(r"[\r\n\s]","",str(innerhtml))
        return 1 if target_source==innerhtml else 0

    # Bound before the try so the finally clause never hits an unbound name
    # when acquiring the driver itself fails.
    browser = None
    try:
        browser = hd.getdriver()
        debug("get driver")
        start = time.time()
        hd.loadPage(browser, url)
        print("get",time.time()-start)
        browser.maximize_window()
        start = time.time()

        data = get_js_rs(browser, scripts_common+scripts_title)
        deal_data = dealWithScriptOut(data)
        if deal_data is None:
            # The script found no candidates on this page.
            return None
        input_x,list_inner,_,_ = deal_data
        list_label = [label(item, targethtml) for item in list_inner]
        # Reported before returning; it used to sit after the returns and
        # could never run.
        print("cost",time.time()-start)
        # A usable sample has exactly one positive candidate.
        if np.sum(list_label)==1:
            return input_x,np.array(list_label)
        return None
    except Exception as e:
        print(e)
    finally:
        if browser is not None:
            hd.adddriver(browser)
            debug("release driver")
    return None
def dumpLinkTitle():
    """Export (page_link, page_title) rows from Oracle to ``Link_Title.pk``.

    Rows are fetched in batches of 10.  A row is kept only if ``save`` accepts
    it when written to the scratch file ``1.pk``; the kept rows are finally
    written with ``save(data, "Link_Title.pk")``.

    The cursor and connection are always closed, including when the query or
    the fetch loop raises.
    """
    def trytosave(d):
        # Probe whether this row can be written by save(); 1 = yes, 0 = no.
        try:
            save(d,"1.pk")
            return 1
        except Exception:
            return 0

    import cx_Oracle as cx_Oracle
    # NOTE(review): credentials and host are hard-coded in source; consider
    # moving them to configuration or the environment.
    conn=cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')  # connect to the database
    try:
        cursor=conn.cursor()
        try:
            sql = " select page_link,page_title from DETAIL_CONTENT_HTML where page_link is not null and page_type=1 and page_title like '<%' and page_title not like '<a%' "
            cursor.execute(sql)
            data = []
            while True:
                try:
                    rows = cursor.fetchmany(10)
                except Exception as e:
                    # Best effort, as before: report the failed batch and try
                    # the next one.
                    # NOTE(review): a persistent fetch error (e.g. a dropped
                    # connection) would make this loop forever.
                    print(e)
                    continue
                if not rows:
                    break
                data.extend(row for row in rows if trytosave(row)==1)
        finally:
            cursor.close()
    finally:
        conn.close()
    save(data,"Link_Title.pk")
-
def getAllData():
    """Encode every (link, title) row of ``Link_Title.pk`` into a training sample.

    For each row ``encodeInput_byJS(link, title)`` is called; successful
    results are collected as ``[x, y, link]``.  Progress is printed as
    ``<encoded>/<processed>/<total> <link>``, the partial result is written to
    ``temp_data.pk`` on every 100th row and the final list to ``data_done.pk``.

    :return: the list of ``[x, y, link]`` samples.
    """
    all_data = load("Link_Title.pk")
    temp_file ="temp_data.pk"
    data_len = len(all_data)
    data = []
    label = 0
    for count, row in enumerate(all_data, 1):
        link = row[0]
        print("/".join(str(n) for n in (label, count, data_len)),link)

        # Periodic checkpoint, taken before the current row is encoded.
        if count%100==0:
            save(data,temp_file)

        encode = encodeInput_byJS(link, row[1])
        if not encode:
            print("None")
            continue
        label += 1
        x,y = encode
        data.append([x,y,link])
    save(data,"data_done.pk")
    return data
def filter():
    """Keep the samples of ``temp_data.pk`` that have fewer than 100 candidates.

    The loaded samples are sorted by their third field, the first sample (before
    sorting), the largest candidate count, the total number of samples and the
    number of kept samples are printed, and the kept samples are written to
    ``source_12input.pk``.
    """
    data = load("temp_data.pk")
    print(data[0])
    data.sort(key = lambda sample:sample[2])

    # Candidate count of every sample, in sorted order.
    list_length = [len(sample[0]) for sample in data]
    new_data = [sample for sample, size in zip(data, list_length) if size<100]

    print(max(list_length))
    print(len(data))
    print(len(new_data))
    save(new_data,"source_12input.pk")
-
def paddinig(all_data,pad=True):
    """Stack samples into arrays, optionally padding them to a common length.

    Each element of *all_data* is ``[input_x, label_y, url]``; ``input_x`` is
    indexed ``[candidate][feature]`` and ``label_y`` holds one 0/1 label per
    candidate.

    With ``pad=True`` every sample is padded/truncated at the end to the
    longest label sequence in *all_data*: features with ``0`` and labels with
    ``-1``.  With ``pad=False`` each ``input_x`` is wrapped in a one-element
    list and left at its own length.

    Labels are one-hot encoded over two classes; a padding label of ``-1``
    becomes ``[0, 0]``.

    :param all_data: list of ``[input_x, label_y, url]`` samples.
    :param pad: whether to pad to a common length.
    :return: ``[np.array(list_x), np.array(list_y), list_url]``; empty arrays
        and an empty list when *all_data* is empty.
    """
    # default=0 keeps an empty data set from raising.
    max_len = max((len(sample[1]) for sample in all_data), default=0)
    print("max_len",max_len)

    list_x = []
    list_y = []
    list_url = []
    for sample in all_data:
        input_x = sample[0]
        label_y = sample[1]
        url = sample[2]
        if pad:
            # pad_sequences pads along the last axis, so swap to
            # (feature, candidate), pad, and swap back.
            by_feature = np.transpose(input_x,(1,0))
            padded = pad_sequences(by_feature, max_len,padding="post", truncating="post", value=0,dtype="float32")
            list_x.append(np.transpose(padded,(1,0)))
            label_y = pad_sequences([label_y],max_len,padding="post", truncating="post", value=-1)[0]
        else:
            list_x.append([input_x])
        # np.int_ is the concrete dtype the abstract np.integer used to
        # resolve to; passing np.integer as a dtype is deprecated.
        list_y.append([(np.arange(2)==i).astype(np.int_) for i in label_y])
        list_url.append(url)
    return [np.array(list_x),np.array(list_y),list_url]
if __name__=="__main__":
    # Earlier pipeline stages, kept for reference and enabled by hand:
    #data = getInput_byJS("http://www.tonghua.gov.cn/cjj/zbtb/201908/t20190802_360119.html")
    #dumpLinkTitle()
    #getAllData()
    #filter()
    # Current stage: pad the filtered samples and store the result.
    source = load("source_12input.pk")
    data = paddinig(source)
    save(data,"source_12input_padding.pk")
|