import re
import module.htmlDrawing as hd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import time
from bs4 import BeautifulSoup
from module.Utils import *
import math
import json
from collections import OrderedDict
import os
scripts = '''
function statisticIframe(nodes){
    var counts_communicateTags = 0;
    for(var i=0;i<nodes.length;i++){
        child = nodes[i];
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                counts_communicateTags += 1;
            }
            if(child.tagName.toLowerCase()=="iframe"){
                if(child.contentWindow.document!=null){
                    counts_communicateTags += statisticIframe(child.contentWindow.document.all);
                }
            }
        }
    }
    return counts_communicateTags;
}
function statistic(node,deepth){
    if(node.childNodes==null){
        node.counts_communicateTags = 0;
        return node.counts_communicateTags;
    }
    node.counts_communicateTags = 0;
    for(var i=0;i<node.childNodes.length;i++){
        child = node.childNodes[i];
        // remove tags (disabled)
        /*
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
                node.removeChild(child);
                continue;
            }
        }
        */
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                node.counts_communicateTags += 1;
            }
        }
        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
            node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
        }else{
            node.counts_communicateTags += statistic(child,deepth+1);
        }*/
        node.counts_communicateTags += statistic(child,deepth+1);
    }
    var innertext = node.innerText;
    if(innertext){
        var text = innertext.replace(/\s/g,'');
        node.counts_text = text.length;
        var punc = text.match(/;|,|。|:|、/g);
        var lines = innertext.match(/.{10}\\n/g);
        if(lines){
            node.counts_lines = lines.length;
        }else{
            node.counts_lines = 0;
        }
        if(punc){
            node.counts_punctuations = punc.length;
        }else{
            node.counts_punctuations = 0;
        }
    }else{
        node.counts_lines = 0;
        node.counts_text = 0;
        node.counts_punctuations = 0;
    }
    node.deepth = deepth;
    return node.counts_communicateTags;
}
function label(node,targethtml){
    var innerhtml = node.innerHTML;
    if(innerhtml){
        innerhtml = innerhtml.replace(/\s/g,'');
        sub_innerhtml = innerhtml.substring(0,40);
        if (sub_innerhtml==targethtml.substring(0,40)){
            return 1;
        }else{
            return 0;
        }
    }else{
        return 0;
    }
}
function getListFontSize(node,_array){
    if(node!=null && node.nodeType==1){
        _fontSize = parseInt(window.getComputedStyle(node).fontSize.match(/\d+/)[0]);
        // parseInt yields NaN (not null) on failure; filter it out so it
        // cannot poison the Math.min below
        if(!isNaN(_fontSize)){
            _array.push(_fontSize);
        }
        if(node.childNodes!=null){
            for(var i=0;i<node.childNodes.length;i++){
                var child = node.childNodes[i];
                getListFontSize(child,_array);
            }
        }
    }
}
function statistic_time(node,_array){
    var pattern_time = /\d{4}[\-\/::年.]\d{1,2}[\-\/::月.]\d{1,2}/g;
    var _find_flag = false;
    if (node.childNodes!=null){
        for(var i=0;i<node.childNodes.length;i++){
            var childNode = node.childNodes[i];
            var _innerText = childNode.innerText;
            if (_innerText!=null && _innerText.search(pattern_time)>=0){
                statistic_time(childNode,_array);
                _find_flag = true;
            }
        }
    }
    if (!_find_flag && node!=document){
        _array_fontSize = new Array();
        getListFontSize(node,_array_fontSize);
        // Math.min over an array needs apply; Math.min(_array_fontSize) returns NaN
        _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min.apply(null,_array_fontSize)]);
    }
    return _array;
}
function search(){
    statistic(document,1);
    var objs = document.all;
    var data = new Array();
    for(var i=0;i<objs.length;i++){
        obj = objs[i];
        if (obj.offsetWidth>100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
            maxArea = 0;
            child_maxArea = null;
            secondmaxArea = 0;
            child_secondmaxArea = null;
            for(var j=0;j<obj.childNodes.length;j++){
                if(obj.childNodes[j].offsetWidth!=null && obj.childNodes[j].offsetHeight!=null){
                    if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>maxArea){
                        maxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
                        child_maxArea = obj.childNodes[j];
                    }
                    if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>secondmaxArea && obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight<maxArea){
                        secondmaxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
                        child_secondmaxArea = obj.childNodes[j];
                    }
                }
            }
            _item = new Array();
            _item.push(getOffsetLeft(obj),getOffsetTop(obj),obj.offsetWidth,obj.offsetHeight,obj.deepth,obj.counts_communicateTags,obj.counts_lines,obj.counts_text,obj.counts_punctuations,
                       getOffsetLeft(obj.parentNode),getOffsetTop(obj.parentNode),obj.parentNode.offsetWidth,obj.parentNode.offsetHeight,obj.parentNode.deepth,obj.parentNode.counts_communicateTags,obj.parentNode.counts_lines,obj.parentNode.counts_text,obj.parentNode.counts_punctuations);
            if(child_maxArea!=null){
                _item.push(getOffsetLeft(child_maxArea),getOffsetTop(child_maxArea),child_maxArea.offsetWidth,child_maxArea.offsetHeight,child_maxArea.deepth,child_maxArea.counts_communicateTags,child_maxArea.counts_lines,child_maxArea.counts_text,child_maxArea.counts_punctuations);
            }else{
                _item.push(-1,-1,-1,-1,-1,-1,-1,-1,-1);
            }
            if(child_secondmaxArea!=null){
                _item.push(getOffsetLeft(child_secondmaxArea),getOffsetTop(child_secondmaxArea),child_secondmaxArea.offsetWidth,child_secondmaxArea.offsetHeight,child_secondmaxArea.deepth,child_secondmaxArea.counts_communicateTags,child_secondmaxArea.counts_lines,child_secondmaxArea.counts_text,child_secondmaxArea.counts_punctuations);
            }else{
                _item.push(-1,-1,-1,-1,-1,-1,-1,-1,-1);
            }
            data.push([_item,obj.innerHTML,getListXpath(obj,new Array(),true)]);
        }
    }
    var data_time = statistic_time(document,new Array());
    return([data,data_time]);
}
return (search());
'''
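# The script above returns [data, data_time]. Each entry of `data` is
# [feature_vector, innerHTML, xpath], where the 36-dim feature vector packs
# 9 values (left, top, width, height, deepth, counts_communicateTags,
# counts_lines, counts_text, counts_punctuations) for the node itself, its
# parent, its largest child and its second-largest child (-1 placeholders
# when a child is missing). data_time rows are [left, top, xpath, min font
# size] for the innermost nodes whose text matches a date pattern.
# getOffsetLeft/getOffsetTop/getListXpath are expected to come from
# scripts_common.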
def statisticCommunicateTags(element):
    count = 0
    soup = BeautifulSoup(element.get_attribute("innerHTML"), "lxml")
    childs = soup.find_all(recursive=True)
    for child in childs:
        # interactive tags, or anything carrying an onclick handler
        if child.name in ["a", "input", "select"] or "onclick" in child.attrs:
            count += 1
    return count

def statisticPunctuationAndWords(element, punctuationWords_pattern=re.compile("[;,。:、]")):
    text = element.text
    text = re.sub("\r|\n|\s", "", text)
    words_len = len(text)
    punctuation_len = len(re.findall(punctuationWords_pattern, text))
    return punctuation_len, words_len

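# Example (hypothetical element): if element.text == "a;b,c。", the
# whitespace-stripped length is 6 and three characters match the punctuation
# class [;,。:、], so statisticPunctuationAndWords returns (3, 6).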
def encodeInput(url, target_source):
    def _method(args):
        try:
            url = args["url"]
            target_source = args["target_source"]
            browser = args["browser"]
            start_time = time.time()
            browser.get(url)
            print("get", time.time() - start_time)
            start_time = time.time()
            browser.maximize_window()
            findTags = ["div", "table", "tbody", "tr", "td", "form", "li", "span"]
            MIN_WIDTH = 400
            MIN_HEIGHT = 400
            list_input = []
            input_x = []
            label_y = []
            for tag in findTags:
                for element in browser.find_elements_by_tag_name(tag):
                    rect = element.rect
                    if rect["width"] >= MIN_WIDTH and rect["height"] >= MIN_HEIGHT:
                        list_input.append(element)
            print("search", time.time() - start_time)
            start_time = time.time()
            for element in list_input:
                communicateTags = statisticCommunicateTags(element)
                punctuation, words = statisticPunctuationAndWords(element)
                input_x.append([element.rect["x"], element.rect["y"], element.rect["width"], element.rect["height"], communicateTags, punctuation, words])
                label_y.append(labelElement(element, target_source))
            print("encode", time.time() - start_time)
            # scale x,y by the maximum width/height, the other features by their own maxima
            the_max = np.max(input_x, axis=0)
            the_max = np.array(list(the_max)[2:4] + list(the_max)[2:])
            input_x = np.array(input_x / the_max)
            if len(label_y) > 0 and np.max(label_y) == 1:
                return input_x, np.array(label_y)
            else:
                return None
        except Exception as e:
            print(e)
            return None
    args = {"url": url, "target_source": target_source}
    # hd.executeMethod is assumed to inject args["browser"] and to pass
    # _method's return value back to the caller
    return hd.executeMethod(_method, args)

def getInput(url):
    def _method(args):
        try:
            url = args["url"]
            browser = args["browser"]
            start_time = time.time()
            browser.get(url)
            print("get", time.time() - start_time)
            start_time = time.time()
            browser.maximize_window()
            findTags = ["div", "table", "tbody", "tr", "td", "form", "li", "span"]
            MIN_WIDTH = 400
            MIN_HEIGHT = 400
            list_input = []
            input_x = []
            for tag in findTags:
                for element in browser.find_elements_by_tag_name(tag):
                    rect = element.rect
                    if rect["width"] >= MIN_WIDTH and rect["height"] >= MIN_HEIGHT:
                        list_input.append(element)
            print("search", time.time() - start_time)
            start_time = time.time()
            for element in list_input:
                communicateTags = statisticCommunicateTags(element)
                punctuation, words = statisticPunctuationAndWords(element)
                input_x.append([element.rect["x"], element.rect["y"], element.rect["width"], element.rect["height"], communicateTags, punctuation, words])
            print("encode", time.time() - start_time)
            the_max = np.max(input_x, axis=0)
            the_max = np.array(list(the_max)[2:4] + list(the_max)[2:])
            input_x = np.array(input_x / the_max)
            return [np.expand_dims(input_x, 0)]
        except Exception as e:
            print(e)
            return None
    args = {"url": url}
    return hd.executeMethod(_method, args)

def encodeInput_byJS(url, targethtml):
    def label(innerhtml, target_source):
        # compare the first 60 whitespace-stripped characters of the candidate
        # innerHTML against the labeled target source
        target_source = re.sub("[\r\n\s]", "", str(target_source))
        pattern = ">(.*)<"
        target_source = re.findall(re.compile(pattern), target_source)[0]
        innerhtml = re.sub("[\r\n\s]", "", str(innerhtml))
        if target_source[0:60] == innerhtml[0:60]:
            return 1
        return 0

    def _method(args):
        try:
            url = args["url"]
            targethtml = args["targethtml"]
            browser = args["browser"]
            start = time.time()
            browser.get(url)
            _log = CLog()
            _log.write("get" + str(time.time() - start))
            browser.maximize_window()
            start = time.time()
            data, data_time = get_js_rs(browser, scripts_common + scripts)
            input_x, list_inner, list_xpath = dealWithScriptOut(data)
            list_label = []
            for item in list_inner:
                list_label.append(label(item, targethtml))
            print("cost", time.time() - start)
            if len(list_label) > 0 and np.max(list_label) == 1:
                return input_x, np.array(list_label)
        except Exception as e:
            print(e)
        return None
    args = {"url": url, "targethtml": targethtml}
    return hd.executeMethod(_method, args)

def getInput_byJS(browser, url):
    # `url` is kept for API compatibility: the original page-loading calls
    # (hd.getdriver()/hd.loadPage(browser, url)) are disabled, so the page is
    # assumed to be already loaded in `browser`
    try:
        data, data_time = get_js_rs(browser, scripts_common + scripts)
        log("content/time extraction script finished")
        input_x, list_inner, list_xpath = dealWithScriptOut(data)
        if input_x is not None:
            return True, [[np.expand_dims(input_x, 0)], list_inner, list_xpath, data_time]
        else:
            return False, ""
    except Exception as e:
        error(str(e))
        err_msg = ""
        if re.search("frame", str(e)) is not None:
            err_msg = "#iframe#"
        return None, err_msg

def dealWithScriptOut(data):
    list_input = []
    list_inner = []
    list_xpath = []
    # replace None/NaN features with -1
    for index in range(len(data)):
        for i in range(len(data[index][0])):
            if data[index][0][i] is None or math.isnan(data[index][0][i]):
                data[index][0][i] = -1
    # order by area (width*height), largest first
    data.sort(key=lambda x: x[0][2] * x[0][3], reverse=True)
    for item in data:
        list_input.append(item[0])
        list_inner.append(item[1])
        list_xpath.append(item[2])
    if len(list_input) > 0:
        the_max = np.max(list_input, axis=0)
        the_max = np.array([x if x > 0 else 1 for x in the_max])
        the_max = np.array((list(the_max)[2:4] + list(the_max)[2:9]) * 4)
        input_x = np.array(list_input / the_max)
        return input_x, list_inner, list_xpath
    else:
        return None, None, None

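# Note on the normalization above: the_max repeats the 9-value pattern
# [max_width, max_height, max_width, max_height, max_deepth, max_comm,
# max_lines, max_text, max_punc] four times (node, parent, largest child,
# second-largest child), so left/top are scaled by the page-wide maximum
# width/height and every other statistic by its own maximum.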
def getResponseHeaders(browser):
    har = json.loads(browser.get_log('har')[0]['message'])
    print(har['log']['entries'])
    return OrderedDict(sorted([(header["name"], header["value"]) for header in har['log']['entries'][0]['General']], key=lambda x: x[0]))

def getHttpStatus(browser):
    for responseReceived in browser.get_log('performance'):
        try:
            response = json.loads(responseReceived['message'])['message']['params']['response']
            if response['url'] == browser.current_url:
                return (response['status'], response['statusText'])
        except:
            pass
    return None

def getHttpResponseHeader(browser):
    for responseReceived in browser.get_log('performance'):
        try:
            response = json.loads(responseReceived['message'])['message']['params']['response']
            if response['url'] == browser.current_url:
                return response['headers']
        except:
            pass
    return None

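# Note: browser.get_log('performance') only yields entries when the driver was
# created with performance logging enabled. A minimal sketch (an assumption,
# for Chrome with Selenium 3; not part of this module):
#
#     from selenium import webdriver
#     caps = webdriver.DesiredCapabilities.CHROME.copy()
#     caps["goog:loggingPrefs"] = {"performance": "ALL"}
#     browser = webdriver.Chrome(desired_capabilities=caps)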
def labelElement(element, target_source):
    # strip whitespace, keep the content between the first '>' and the last
    # '<', then compare the first 60 characters
    target_source = re.sub("[\r\n\s]", "", str(target_source))
    pattern = ">(.*)<"
    target_source = re.findall(re.compile(pattern), target_source)[0]
    element_source = element.get_attribute("innerHTML")
    element_source = re.sub("[\r\n\s]", "", str(element_source))
    if target_source[0:60] == element_source[0:60]:
        return 1
    return 0

def padding(all_data, pad=True):
    max_len = np.max([len(data[1]) for data in all_data])
    print("max_len", max_len)
    list_x = []
    list_y = []
    list_url = []
    for data in all_data:
        input_x = data[0]
        label_y = data[1]
        url = data[2]
        if pad:
            input_x = np.transpose(pad_sequences(np.transpose(input_x, (1, 0)), max_len, padding="post", truncating="post", value=0, dtype="float32"), (1, 0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y], max_len, padding="post", truncating="post", value=-1)[0]
            # one-hot encode; the -1 padding positions map to [0, 0]
            list_y.append([(np.arange(2) == i).astype(int) for i in label_y])
        else:
            list_x.append([input_x])
            list_y.append([(np.arange(2) == i).astype(int) for i in label_y])
        list_url.append(url)
    return [np.array(list_x), np.array(list_y), list_url]
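# Minimal usage sketch (assumes entries shaped like [input_x, label_y, url],
# as produced in getAllData below):
#     list_x, list_y, list_url = padding(all_data)
# list_x then has shape (n_pages, max_len, n_features) and list_y is one-hot
# with the -1 padding rows mapped to [0, 0].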
def getAllData():
    all_data = load("Link_Content.pk")
    data = []
    temp_file = "temp_data.pk"
    count = 0
    label = 0
    data_len = len(all_data)
    for row in all_data:
        count += 1
        print(str(label) + "/" + str(count) + "/" + str(data_len), row[0])
        if count % 100 == 0:
            save(data, temp_file)
        encode = encodeInput_byJS(row[0], row[1])
        if encode:
            label += 1
            x, y = encode
            data.append([x, y, row[0]])
        else:
            print("None")
    data = padding(data)
    return data

def augmentation(data, times=100):
    aug_data = []
    for item in data:
        x, y = item[0], item[1]
        new_item = []
        for i_x, i_y in zip(list(x), list(y)):
            new_item.append([i_x, i_y])
        aug_data.append(item)
        for _ in range(times):
            new_x = []
            new_y = []
            np.random.shuffle(new_item)
            for new_i in new_item:
                new_x.append(new_i[0])
                new_y.append(new_i[1])
            aug_data.append([new_x, new_y])
    return aug_data

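# Usage sketch: augmentation(data, times=10) keeps each original sample and
# adds 10 shuffled copies of its (box, label) pairs, presumably to make the
# downstream model less sensitive to candidate-box order within a page.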
def dumpLinkContent():
    def trytosave(d):
        # keep only rows that can actually be pickled
        try:
            save(d, "1.pk")
            return 1
        except Exception as e:
            return 0
    import cx_Oracle
    conn = cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')  # connect to the database
    cursor = conn.cursor()
    sql = " select page_link,page_content from detail_content "
    cursor.execute(sql)
    rows = cursor.fetchall()
    data = []
    for row in rows:
        if trytosave(row) == 1:
            data.append(row)
    save(data, "Link_Content.pk")

def relabel(file_data="sourceData_36Input_28849_sort.pk"):
    '''
    @summary: adjust the labeled data to fix the roll-up problem: the label
    sits on an enclosing box instead of the deepest box that still covers
    the labeled content
    '''
    data = load(file_data)
    set_1 = set()
    for page in data:
        _feature = page[0]
        _label = page[1]
        _url = page[2]
        _label_index = np.argmax(_label)
        _label_left = _feature[_label_index][0]
        _label_top = _feature[_label_index][1]
        _label_width = _feature[_label_index][2]
        _label_height = _feature[_label_index][3]
        _label_deepth = _feature[_label_index][4]
        _label_text = _feature[_label_index][7]
        _index = 0
        _re_deepth = 0
        _re_index = -1
        for _box in _feature:
            _left = _box[0]
            _top = _box[1]
            _width = _box[2]
            _height = _box[3]
            _deepth = _box[4]
            _text = _box[7]
            if _deepth > _label_deepth:
                # a deeper box nested inside the labeled one that keeps most of
                # its area, or more than half the area and nearly all its text
                if (_left >= _label_left and _top >= _label_top
                        and (_left + _width) <= (_label_left + _label_width)
                        and (_top + _height) <= (_label_top + _label_height)
                        and (_width * _height / (_label_width * _label_height) > 0.7
                             or (_width * _height / (_label_width * _label_height) > 0.5
                                 and _text / _label_text > 0.9))):
                    set_1.add(_url)
                    if _deepth > _re_deepth:
                        _re_deepth = _deepth
                        _re_index = _index
            _index += 1
        if _re_index > -1:
            _label[_label_index] = 0
            _label[_re_index] = 1
            print(_url)
            print(_label_index, _re_index)
    data.sort(key=lambda x: x[2])
    print(len(list(set_1)))
    save(data, "sourceData_36Input_" + str(len(data)) + "_relabel.pk")
    data = padding(data)
    save(data, "data_" + str(len(data[1])) + "_relabel.pk")
    return data

if __name__ == "__main__":
    # dumpLinkContent()
    # relabel()
    # getInput_byJS needs a live driver; hd.getdriver() (taken from the
    # disabled code in getInput_byJS) is assumed to supply one
    browser = hd.getdriver()
    _flag, result = getInput_byJS(browser, "http://hailing.taizhou.gov.cn/art/2019/5/23/art_50810_2498758.html")
    if _flag:
        # result = [[input_x], list_inner, list_xpath, data_time]
        for item in result[3]:
            print(item)