import re
import module.htmlDrawing as hd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import time
from bs4 import BeautifulSoup
from module.Utils import *
import math
import json
from _collections import OrderedDict
import os
scripts = '''
function statisticIframe(nodes){
var counts_communicateTags = 0;
for(var i=0;i=0){
stastic_time(childNode,_array);
_find_flag = true;
}
}
}
if (!_find_flag && node!=document && node.tagName.toLowerCase()!='script'){
_array_fontSize = new Array();
getListFontSize(node,_array_fontSize);
_array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min(_array_fontSize)]);
}
return _array;
}
function search(){
statistic(document,1);
var objs = document.all;
var data = new Array();
for(var i=0;i100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
maxArea = 0;
child_maxArea = null;
secondmaxArea = 0;
child_secondmaxArea = null;
for(var j =0;jmaxArea){
maxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
child_maxArea = obj.childNodes[j];
}
if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>secondmaxArea && obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight=MIN_WIDTH and height>=MIN_HEIGHT:
list_input.append(element)
print("search",time.time()-start_time)
start_time = time.time()
for element in list_input:
communicateTags = statisticCommunicateTags(element)
punctuation,words = statisticPunctuationAndWords(element)
input_x.append([element.rect["x"],element.rect["y"],element.rect["width"],element.rect["height"],communicateTags,punctuation,words])
label_y.append(labelElement(element, target_source))
print("encode",time.time()-start_time)
the_max = np.max(input_x,axis=0)
the_max = np.array(list(the_max)[2:4]+list(the_max)[2:])
input_x = np.array(input_x/the_max)
if len(label_y)>0 and np.max(label_y)==1:
return input_x,np.array(label_y)
else:
return None
except Exception as e:
print(e)
return None
args = {"url":url,"target_source":target_source}
hd.executeMethod(_method, args)
def getInput(url):
    '''
    @summary: open *url* in a pooled browser and encode every sufficiently
        large element (position, size, tag/punctuation/word statistics) into
        a model input matrix.
    @param url: page address to load
    @return: [np.expand_dims(input_x, 0)] on success, otherwise None
    '''
    def _method(args):
        try:
            url = args["url"]
            browser = args["browser"]
            start_time = time.time()
            browser.get(url)
            print("get", time.time() - start_time)
            start_time = time.time()
            browser.maximize_window()
            # only containers that usually carry the main content are scanned
            findTags = ["div", "table", "tbody", "tr", "td", "form", "li", "span"]
            MIN_WIDTH = 400
            MIN_HEIGHT = 400
            list_input = []
            input_x = []
            for tag in findTags:
                for element in browser.find_elements_by_tag_name(tag):
                    rect = element.rect
                    # keep only elements big enough to be a content block
                    if rect["width"] >= MIN_WIDTH and rect["height"] >= MIN_HEIGHT:
                        list_input.append(element)
            print("search", time.time() - start_time)
            start_time = time.time()
            for element in list_input:
                communicateTags = statisticCommunicateTags(element)
                punctuation, words = statisticPunctuationAndWords(element)
                input_x.append([element.rect["x"], element.rect["y"],
                                element.rect["width"], element.rect["height"],
                                communicateTags, punctuation, words])
            print("encode", time.time() - start_time)
            if not input_x:
                # nothing qualified on this page; np.max would raise on []
                return None
            the_max = np.max(input_x, axis=0)
            # x/y are normalized by the width/height maxima, the rest by their own
            the_max = np.array(list(the_max)[2:4] + list(the_max)[2:])
            # avoid division by zero for all-zero columns (same guard as dealWithScriptOut)
            the_max = np.array([x if x > 0 else 1 for x in the_max])
            input_x = np.array(input_x / the_max)
            return [np.expand_dims(input_x, 0)]
        except Exception as e:
            print(e)
            return None
    args = {"url": url}
    # executeMethod runs _method with a pooled browser; forward its result
    # (the original dropped it, so getInput always returned None)
    return hd.executeMethod(_method, args)
def encodeInput_byJS(url, targethtml):
    '''
    @summary: load *url*, run the statistics script in the page and label each
        extracted element by comparing its innerHTML with *targethtml*.
    @param url: page address to load
    @param targethtml: html fragment of the correct (target) element
    @return: (input_x, label_array) when the target element was found on the
        page, otherwise None
    '''
    def label(innerhtml, target_source):
        # strip all whitespace, then compare the first 60 characters of the
        # content between the outermost tags
        target_source = re.sub("[\r\n\s]", "", str(target_source))
        pattern = ">(.*)<"
        target_source = re.findall(re.compile(pattern), target_source)[0]
        innerhtml = re.sub("[\r\n\s]", "", str(innerhtml))
        if target_source[0:60] == innerhtml[0:60]:
            return 1
        return 0
    def _method(args):
        try:
            url = args["url"]
            targethtml = args["targethtml"]
            browser = args["browser"]
            start = time.time()
            browser.get(url)
            _log = CLog()
            _log.write("get" + str(time.time() - start))
            browser.maximize_window()
            start = time.time()
            data = get_js_rs(browser, scripts_common + scripts)
            # dealWithScriptOut returns THREE lists (features, innerHTML, xpath);
            # the original unpacked only two, which raised on every call
            input_x, list_inner, _list_xpath = dealWithScriptOut(data)
            if input_x is None:
                return None
            list_label = []
            for item in list_inner:
                list_label.append(label(item, targethtml))
            print("cost", time.time() - start)
            # only usable as a training sample when the target was found
            if len(list_label) > 0 and np.max(list_label) == 1:
                return input_x, np.array(list_label)
            else:
                return None
        except Exception as e:
            print(e)
        return None
    args = {"url": url, "targethtml": targethtml}
    # forward _method's result: getAllData expects the (x, y) tuple here
    return hd.executeMethod(_method, args)
def getInput_byJS(browser, url):
    '''
    @summary: execute the content/time extraction scripts in *browser* (the
        page is assumed already loaded by the caller) and return the encoded
        inputs together with the raw extraction results.
    @param browser: selenium webdriver to reuse
    @param url: page address (kept for interface compatibility; the page
        itself is not re-fetched here)
    @return: (True, [[input], list_inner, list_xpath, data_time]) on success,
        (False, "") when nothing was extracted,
        (None, err_msg) on error — err_msg is "#iframe#" for frame failures
    '''
    try:
        data, data_time = get_js_rs(browser, scripts_common + scripts)
        log('获取正文、时间脚本执行完毕')
        input_x, list_inner, list_xpath = dealWithScriptOut(data)
        if input_x is not None:
            return True, [[np.expand_dims(input_x, 0)], list_inner, list_xpath, data_time]
        else:
            return False, ""
    except Exception as e:
        error(str(e))
        err_msg = ""
        # frame-related failures are marked so the caller can handle iframes
        if re.search("frame", str(e)) is not None:
            err_msg = "#iframe#"
        return None, err_msg
def dealWithScriptOut(data, key_index=4):
    '''
    @summary: turn the raw script output into a normalized feature matrix.
        Rows are cleaned in place (None/NaN -> -1), sorted in place by element
        area (largest first) and divided column-wise by per-column maxima.
    @param data: list of [feature_list, innerHTML, xpath] triples; mutated!
    @param key_index: unused, kept for interface compatibility
    @return: (input_x, list_inner, list_xpath), or (None, None, None) when
        *data* is empty
    '''
    # replace missing values in place so the numeric work below cannot fail
    for row in data:
        features = row[0]
        for pos, value in enumerate(features):
            if value is None or math.isnan(value):
                features[pos] = -1
    # biggest elements (width * height) first
    data.sort(key=lambda row: row[0][2] * row[0][3], reverse=True)
    list_input = [row[0] for row in data]
    list_inner = [row[1] for row in data]
    list_xpath = [row[2] for row in data]
    if not list_input:
        return None, None, None
    the_max = np.max(list_input, axis=0)
    # a non-positive maximum would zero out or flip a column — divide by 1 instead
    the_max = np.array([m if m > 0 else 1 for m in the_max])
    # width/height maxima also normalize x/y; the 9-divisor pattern repeats 4 times
    the_max = np.array((list(the_max)[2:4] + list(the_max)[2:9]) * 4)
    input_x = np.array(list_input / the_max)
    return input_x, list_inner, list_xpath
def getResponseHeaders(browser):
    '''
    @summary: read the first entry of the HAR log and return its "General"
        headers as an OrderedDict sorted by header name.
    @param browser: webdriver exposing get_log('har')
    @return: OrderedDict of header name -> value, sorted by name
    '''
    har = json.loads(browser.get_log('har')[0]['message'])
    print(har['log']['entries'])
    pairs = [(header["name"], header["value"])
             for header in har['log']['entries'][0]['General']]
    return OrderedDict(sorted(pairs, key=lambda pair: pair[0]))
def getHttpStatus(browser):
    '''
    @summary: scan the performance log for the network response matching the
        current url and return its HTTP status.
    @param browser: webdriver exposing get_log('performance')
    @return: (status, statusText) tuple, or None when no entry matches
    '''
    for responseReceived in browser.get_log('performance'):
        try:
            response = json.loads(responseReceived[u'message'])[u'message'][u'params'][u'response']
            if response[u'url'] == browser.current_url:
                return (response[u'status'], response[u'statusText'])
        except (KeyError, TypeError, ValueError):
            # non-response log entries simply lack these keys / fail to parse;
            # narrowed from a bare except that also swallowed KeyboardInterrupt
            pass
    return None
def getHttpResponseHeader(browser):
    '''
    @summary: scan the performance log for the network response matching the
        current url and return its HTTP headers.
    @param browser: webdriver exposing get_log('performance')
    @return: headers dict of the matching response, or None when no entry matches
    '''
    for responseReceived in browser.get_log('performance'):
        try:
            response = json.loads(responseReceived[u'message'])[u'message'][u'params'][u'response']
            if response[u'url'] == browser.current_url:
                print(response)
                # return the headers of the MATCHING response only; the original
                # returned whatever was parsed last, regardless of the url check,
                # and contained a stray no-op bare `print`
                return response[u'headers']
        except (KeyError, TypeError, ValueError):
            pass
    return None
def labelElement(element, target_source):
    '''
    @summary: decide whether *element* is the target element by comparing the
        whitespace-stripped first 60 characters of both html fragments.
    @param element: selenium element providing get_attribute("innerHTML")
    @param target_source: html of the labelled target element
    @return: 1 when the fragments match, otherwise 0
    '''
    target_source = re.sub("[\r\n\s]", "", str(target_source))
    pattern = ">(.*)<"
    # keep only the content between the outermost tags
    target_source = re.findall(re.compile(pattern), target_source)[0]
    element_source = str(element.get_attribute("innerHTML"))
    element_source = re.sub("[\r\n\s]", "", element_source)
    return 1 if target_source[0:60] == element_source[0:60] else 0
def padding(all_data, pad=True):
    '''
    @summary: stack the collected samples into model-ready arrays.
    @param all_data: list of [input_x, label_y, url] triples
    @param pad: when True, pad/truncate every page to the longest label list
        (features padded with 0, labels with -1, so the -1 rows one-hot to
        [0, 0]); when False keep original lengths and wrap each input_x
    @return: [np.array(list_x), np.array(list_y), list_url]
    '''
    max_len = np.max([len(data[1]) for data in all_data])
    print("max_len", max_len)
    list_x = []
    list_y = []
    list_url = []
    for data in all_data:
        input_x = data[0]
        label_y = data[1]
        url = data[2]
        if pad:
            input_x = np.transpose(pad_sequences(np.transpose(input_x, (1, 0)), max_len, padding="post", truncating="post", value=0, dtype="float32"), (1, 0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y], max_len, padding="post", truncating="post", value=-1)[0]
            # one-hot encode; `int` replaces the deprecated abstract np.integer dtype
            list_y.append([(np.arange(2) == i).astype(int) for i in label_y])
        else:
            list_x.append([input_x])
            list_y.append([(np.arange(2) == i).astype(int) for i in label_y])
        list_url.append(url)
    return [np.array(list_x), np.array(list_y), list_url]
def getAllData():
    '''
    @summary: re-encode every stored (link, content) pair through
        encodeInput_byJS and build the padded training set; partial progress
        is checkpointed to disk every 100 rows.
    @return: the padded dataset [list_x, list_y, list_url]
    '''
    all_data = load("Link_Content.pk")
    data = []
    temp_file = "temp_data.pk"
    label = 0
    data_len = len(all_data)
    for count, row in enumerate(all_data, start=1):
        print(str(label) + "/" + str(count) + "/" + str(data_len), row[0])
        if count % 100 == 0:
            # checkpoint so a crash does not lose the rows encoded so far
            save(data, temp_file)
        encode = encodeInput_byJS(row[0], row[1])
        if encode:
            label += 1
            x, y = encode
            data.append([x, y, row[0]])
        else:
            print("None")
    return padding(data)
def augmentation(data, times=100):
    '''
    @summary: enlarge the dataset by re-shuffling the (x, y) rows of every
        sample *times* times; the original item is kept as well.
    @param data: list of items whose first two entries are the x rows and
        the matching y rows
    @param times: number of shuffled copies to add per item
    @return: list containing each original item followed by its *times*
        shuffled [new_x, new_y] variants
    '''
    aug_data = []
    for item in data:
        # pair each x row with its label so shuffling keeps them aligned
        pairs = [[row_x, row_y] for row_x, row_y in zip(list(item[0]), list(item[1]))]
        aug_data.append(item)
        for _ in range(times):
            np.random.shuffle(pairs)
            shuffled_x = [pair[0] for pair in pairs]
            shuffled_y = [pair[1] for pair in pairs]
            aug_data.append([shuffled_x, shuffled_y])
    return aug_data
def dumpLinkContent():
    # Dump all (page_link, page_content) rows from the detail_content table
    # into Link_Content.pk, skipping any row that cannot be pickled.
    def trytosave(d):
        # probe-pickle a single row; returns 1 when it serializes cleanly
        try:
            save(d,"1.pk")
            return 1
        except Exception as e:
            return 0
    import cx_Oracle as cx_Oracle
    # NOTE(review): credentials are hard-coded in source — move to configuration
    conn=cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl') #connect to the database
    cursor=conn.cursor()
    sql = " select page_link,page_content from detail_content "
    cursor.execute(sql)
    rows = cursor.fetchall()
    data = []
    for row in rows:
        # keep only rows that survived the pickle probe
        if trytosave(row)==1:
            data.append(row)
    save(data,"Link_Content.pk")
def relabel(file_data="sourceData_36Input_28849_sort.pk"):
    '''
    @summary: adjust the labelled data to fix the roll-up problem: when a
        deeper element covers (almost) the same area as the labelled element,
        move the label down onto that deeper element.
    '''
    data = load(file_data)
    count = 0
    set_1 = set()
    set_2 = set()
    for page in data:
        # page = [feature_rows, label_vector, url]
        _feature = page[0]
        _label = page[1]
        _url = page[2]
        # geometry/statistics of the currently labelled element
        # (feature layout per row: 0=left, 1=top, 2=width, 3=height,
        #  4=depth, 7=text amount — presumably; TODO confirm against encoder)
        _label_index = np.argmax(_label)
        _label_left = _feature[_label_index][0]
        _label_top = _feature[_label_index][1]
        _label_width = _feature[_label_index][2]
        _label_height = _feature[_label_index][3]
        _label_deepth = _feature[_label_index][4]
        _label_text = _feature[_label_index][7]
        _index = 0
        _re_deepth = 0
        _re_index = -1
        for _box in _feature:
            _left = _box[0]
            _top = _box[1]
            _width = _box[2]
            _height = _box[3]
            _deepth = _box[4]
            _text = _box[7]
            if _deepth>_label_deepth:
                # deeper element fully inside the labelled one, covering >70%
                # of its area (or >50% while keeping >90% of its text)
                if _left>=_label_left and _top>=_label_top and (_left+_width)<=(_label_left+_label_width) and (_top+_height)<=(_label_top+_label_height) and (_width*_height/(_label_width*_label_height)>0.7 or (_width*_height/(_label_width*_label_height)>0.5 and _text/_label_text>0.9)):
                    set_1.add(_url)
                    # remember the deepest such candidate
                    if _deepth>_re_deepth:
                        _re_deepth = _deepth
                        _re_index = _index
            _index += 1
        if _re_index>-1:
            # move the label to the deepest qualifying element
            _label[_label_index] = 0
            _label[_re_index] = 1
            print(_url)
            print(_label_index,_re_index)
    data.sort(key=lambda x:x[2])
    print(len(list(set_1)))
    # NOTE(review): len(data[1]) is the length of the SECOND page's triple
    # (always 3), not the page count — looks like len(data) was intended; verify
    save(data,"sourceData_36Input_"+str(len(data[1]))+"_relabel.pk")
    data = padding(data)
    save(data,"data_"+str(len(data[1]))+"_relabel.pk")
    return data
if __name__=="__main__":
    #dumpLinkContent()
    #relabel()
    # demo run: extract one page and print the time candidates.
    # getInput_byJS takes (browser, url) and returns (flag, payload); the
    # original called it with one argument and indexed the 2-tuple with [3].
    browser = hd.getdriver()
    flag, data = getInput_byJS(browser, "http://hailing.taizhou.gov.cn/art/2019/5/23/art_50810_2498758.html")
    if flag:
        # payload = [[input], list_inner, list_xpath, data_time]
        for item in data[3]:
            print(item)