import re
import math
import json
import time
import os
from collections import OrderedDict

import numpy as np
from bs4 import BeautifulSoup
from keras.preprocessing.sequence import pad_sequences

import module.htmlDrawing as hd
from module.Utils import *

# JavaScript injected into the page: walks the DOM and collects geometry,
# xpath and font-size statistics for candidate content nodes.
scripts = '''
function statisticIframe(nodes){
    var counts_communicateTags = 0;
    for(var i=0;i=0){
        stastic_time(childNode,_array);
        _find_flag = true;
    }}}
    if (!_find_flag && node!=document && node.tagName.toLowerCase()!='script'){
        _array_fontSize = new Array();
        getListFontSize(node,_array_fontSize);
        _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min(_array_fontSize)]);
    }
    return _array;
}

function search(){
    statistic(document,1);
    var objs = document.all;
    var data = new Array();
    for(var i=0;i<objs.length;i++){
        var obj = objs[i];
        if(obj.offsetWidth>100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
            maxArea = 0;
            child_maxArea = null;
            secondmaxArea = 0;
            child_secondmaxArea = null;
            for(var j=0;j<obj.childNodes.length;j++){
                if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>maxArea){
                    maxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
                    child_maxArea = obj.childNodes[j];
                }
                if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>secondmaxArea && obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight<maxArea){
                    secondmaxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
                    child_secondmaxArea = obj.childNodes[j];
                }
            }
        }
    }
    return data;
}
'''

def encodeInput(url, target_source):

    def _method(args):
        try:
            url = args["url"]
            target_source = args["target_source"]
            browser = args["browser"]
            start_time = time.time()
            browser.get(url)
            print("get", time.time()-start_time)

            start_time = time.time()
            browser.maximize_window()
            findTags = ["div","table","tbody","tr","td","form","li","span"]
            MIN_WIDTH = 400
            MIN_HEIGHT = 400
            list_input = []
            input_x = []
            label_y = []
            for tag in findTags:
                for element in browser.find_elements_by_tag_name(tag):
                    rect = element.rect
                    x = rect["x"]
                    y = rect["y"]
                    width = rect["width"]
                    height = rect["height"]
                    if width>=MIN_WIDTH and height>=MIN_HEIGHT:
                        list_input.append(element)
            print("search", time.time()-start_time)

            start_time = time.time()
            for element in list_input:
                communicateTags = statisticCommunicateTags(element)
                punctuation,words = statisticPunctuationAndWords(element)
                input_x.append([element.rect["x"],element.rect["y"],element.rect["width"],element.rect["height"],communicateTags,punctuation,words])
                label_y.append(labelElement(element, target_source))
            print("encode", time.time()-start_time)

            the_max = np.max(input_x, axis=0)
            # reuse the width/height maxima for the x/y columns
            the_max = np.array(list(the_max)[2:4]+list(the_max)[2:])
            input_x = np.array(input_x/the_max)
            if len(label_y)>0 and np.max(label_y)==1:
                return input_x, np.array(label_y)
            else:
                return None
        except Exception as e:
            print(e)
            return None

    args = {"url":url, "target_source":target_source}
    hd.executeMethod(_method, args)

def getInput(url):

    def _method(args):
        try:
            url = args["url"]
            browser = args["browser"]
            start_time = time.time()
            browser.get(url)
            print("get", time.time()-start_time)

            start_time = time.time()
            #browser.refresh()
            #time.sleep(1)
            browser.maximize_window()
            #elements = browser.find_elements_by_xpath("//*")
            findTags = ["div","table","tbody","tr","td","form","li","span"]
            MIN_WIDTH = 400
            MIN_HEIGHT = 400
            list_input = []
            input_x = []
            label_y = []
            for tag in findTags:
                for element in browser.find_elements_by_tag_name(tag):
                    rect = element.rect
                    x = rect["x"]
                    y = rect["y"]
                    width = rect["width"]
                    height = rect["height"]
                    if width>=MIN_WIDTH and height>=MIN_HEIGHT:
                        list_input.append(element)
            print("search", time.time()-start_time)

            start_time = time.time()
            for element in list_input:
                communicateTags = statisticCommunicateTags(element)
                punctuation,words = statisticPunctuationAndWords(element)
                input_x.append([element.rect["x"],element.rect["y"],element.rect["width"],element.rect["height"],communicateTags,punctuation,words])
            print("encode", time.time()-start_time)

            the_max = np.max(input_x, axis=0)
            the_max = np.array(list(the_max)[2:4]+list(the_max)[2:])
            input_x = np.array(input_x/the_max)
            return [np.expand_dims(input_x,0)]
        except Exception as e:
            print(e)
            return None

    args = {"url":url}
    hd.executeMethod(_method, args)
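# A minimal, self-contained sketch of the normalization used by encodeInput/
# getInput above: each feature column is divided by a column maximum, except
# that the x/y columns are scaled by the width/height maxima (columns 2:4),
# so coordinates and sizes share the same scale. The helper and sample values
# are illustrative only, not part of the original module.
def _normalize_features_demo():
    # rows: [x, y, width, height, communicateTags, punctuation, words]
    feats = np.array([[10., 20., 400., 600., 3., 5., 120.],
                      [ 0., 80., 800., 900., 1., 2.,  40.]])
    the_max = np.max(feats, axis=0)
    # [w_max, h_max] for x/y, then [w_max, h_max, tags_max, punct_max, words_max]
    the_max = np.array(list(the_max)[2:4] + list(the_max)[2:])
    return feats / the_max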
def encodeInput_byJS(url, targethtml):

    def label(innerhtml, target_source):
        target_source = re.sub("[\r\n\s]","",str(target_source))
        pattern = ">(.*)<"
        target_source = re.findall(re.compile(pattern), target_source)[0]
        innerhtml = re.sub("[\r\n\s]","",str(innerhtml))
        if target_source[0:60]==innerhtml[0:60]:
            return 1
        return 0

    def _method(args):
        try:
            url = args["url"]
            targethtml = args["targethtml"]
            browser = args["browser"]
            start = time.time()
            browser.get(url)
            _log = CLog()
            _log.write("get"+str(time.time()-start))
            browser.maximize_window()

            start = time.time()
            # data = browser.execute_script(scripts_common+scripts)
            data = get_js_rs(browser, scripts_common+scripts)
            input_x,list_inner,list_xpath = dealWithScriptOut(data)
            list_label = []
            for item in list_inner:
                list_label.append(label(item, targethtml))
            print("cost", time.time()-start)
            if len(list_label)>0 and np.max(list_label)==1:
                return input_x, np.array(list_label)
            else:
                return None
        except Exception as e:
            print(e)
        return None

    args = {"url":url, "targethtml":targethtml}
    hd.executeMethod(_method, args)

def getInput_byJS(browser, url):
    try:
        # browser = hd.getdriver()
        # debug("get driver")
        # hd.loadPage(browser, url)
        # browser.maximize_window()
        # data,data_time = browser.execute_script(scripts_common+scripts)
        data,data_time = get_js_rs(browser, scripts_common+scripts)
        log('content/time extraction script finished')
        input_x,list_inner,list_xpath = dealWithScriptOut(data)
        if input_x is not None:
            return True,[[np.expand_dims(input_x,0)],list_inner,list_xpath,data_time]
        else:
            return False,""
    except Exception as e:
        error(str(e))
        err_msg = ""
        if re.search("frame",str(e)) is not None:
            err_msg = "#iframe#"
        return None,err_msg
    # finally:
    #     hd.adddriver(browser)
    #     debug("release driver")

def dealWithScriptOut(data, key_index=4):
    list_input = []
    list_inner = []
    list_xpath = []
    for index in range(len(data)):
        # clean nan
        for i in range(len(data[index][0])):
            if data[index][0][i] is None or math.isnan(data[index][0][i]):
                data[index][0][i] = -1
    # order by area (width*height), largest first
    data.sort(key=lambda x:x[0][2]*x[0][3], reverse=True)
    for item in data:
        list_input.append(item[0])
        list_inner.append(item[1])
        list_xpath.append(item[2])
    if len(list_input)>0:
        the_max = np.max(list_input, axis=0)
        the_max = np.array([x if x>0 else 1 for x in the_max])
        the_max = np.array((list(the_max)[2:4]+list(the_max)[2:9])*4)
        input_x = np.array(list_input/the_max)
        return input_x,list_inner,list_xpath
    else:
        return None,None,None

def getResponseHeaders(browser):
    har = json.loads(browser.get_log('har')[0]['message'])
    print(har['log']['entries'])
    return OrderedDict(sorted([(header["name"], header["value"]) for header in har['log']['entries'][0]['General']], key=lambda x: x[0]))

def getHttpStatus(browser):
    for responseReceived in browser.get_log('performance'):
        try:
            response = json.loads(responseReceived[u'message'])[u'message'][u'params'][u'response']
            if response[u'url'] == browser.current_url:
                return (response[u'status'], response[u'statusText'])
        except:
            pass
    return None
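# getHttpStatus/getHttpResponseHeader above read the Chrome DevTools
# performance log, which Selenium only records when logging is switched on
# at driver creation. A minimal setup sketch (Selenium 4 API, chromedriver
# assumed to be on PATH; not part of the original module):
def _make_logging_browser_demo():
    from selenium import webdriver
    options = webdriver.ChromeOptions()
    options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
    return webdriver.Chrome(options=options)

# usage sketch:
#   browser = _make_logging_browser_demo()
#   browser.get("https://example.com")
#   print(getHttpStatus(browser))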
def getHttpResponseHeader(browser):
    for responseReceived in browser.get_log('performance'):
        try:
            response = json.loads(responseReceived[u'message'])[u'message'][u'params'][u'response']
            if response[u'url'] == browser.current_url:
                print(response)
                return response[u'headers']
        except:
            pass
    return None

def labelElement(element, target_source):
    target_source = re.sub("[\r\n\s]","",str(target_source))
    pattern = ">(.*)<"
    target_source = re.findall(re.compile(pattern), target_source)[0]
    element_source = element.get_attribute("innerHTML")
    element_source = re.sub("[\r\n\s]","",str(element_source))
    if target_source[0:60]==element_source[0:60]:
        return 1
    return 0

def padding(all_data, pad=True):
    max_len = np.max([len(data[1]) for data in all_data])
    print("max_len", max_len)
    list_x = []
    list_y = []
    list_url = []
    for data in all_data:
        input_x = data[0]
        label_y = data[1]
        url = data[2]
        if pad:
            input_x = np.transpose(pad_sequences(np.transpose(input_x,(1,0)), max_len, padding="post", truncating="post", value=0, dtype="float32"),(1,0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y], max_len, padding="post", truncating="post", value=-1)[0]
            list_y.append([(np.arange(2)==i).astype(np.integer) for i in label_y])
        else:
            list_x.append([input_x])
            list_y.append([(np.arange(2)==i).astype(np.integer) for i in label_y])
        list_url.append(url)
    return [np.array(list_x), np.array(list_y), list_url]

def getAllData():
    all_data = load("Link_Content.pk")
    data = []
    temp_file = "temp_data.pk"
    count = 0
    label = 0
    data_len = len(all_data)
    for row in all_data:
        count += 1
        print(str(label)+"/"+str(count)+"/"+str(data_len), row[0])
        if count%100==0:
            save(data, temp_file)
        encode = encodeInput_byJS(row[0], row[1])
        if encode:
            label += 1
            x,y = encode
            data.append([x,y,row[0]])
        else:
            print("None")
    data = padding(data)
    return data

def augmentation(data, times=100):
    aug_data = []
    for item in data:
        x,y = item[0],item[1]
        new_item = []
        for i_x,i_y in zip(list(x),list(y)):
            new_item.append([i_x,i_y])
        aug_data.append(item)
        for _ in range(times):
            new_x = []
            new_y = []
            np.random.shuffle(new_item)
            for new_i in new_item:
                new_x.append(new_i[0])
                new_y.append(new_i[1])
            aug_data.append([new_x,new_y])
    return aug_data

def dumpLinkContent():

    def trytosave(d):
        try:
            save(d,"1.pk")
            return 1
        except Exception as e:
            return 0

    import cx_Oracle
    conn = cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')  # connect to the database
    cursor = conn.cursor()
    sql = " select page_link,page_content from detail_content "
    cursor.execute(sql)
    rows = cursor.fetchall()
    data = []
    for row in rows:
        if trytosave(row)==1:
            data.append(row)
    save(data,"Link_Content.pk")
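# A toy illustration of what padding() above does to one page: the element
# axis is padded/truncated to a fixed length with pad_sequences (features get
# 0, labels get -1, and -1 one-hot encodes to [0, 0], i.e. "no class").
# Hypothetical helper with made-up shapes, not part of the original module.
def _padding_demo(max_len=5):
    input_x = np.array([[1., 2.], [3., 4.], [5., 6.]])   # 3 elements, 2 features
    label_y = [0, 1, 0]
    input_x = np.transpose(pad_sequences(np.transpose(input_x,(1,0)), max_len,
                                         padding="post", truncating="post",
                                         value=0, dtype="float32"),(1,0))
    label_y = pad_sequences([label_y], max_len, padding="post",
                            truncating="post", value=-1)[0]
    one_hot = [(np.arange(2)==i).astype(int) for i in label_y]
    return input_x, one_hot                              # shapes (5, 2) and (5, 2)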
def relabel(file_data="sourceData_36Input_28849_sort.pk"):
    '''
    @summary: adjust the labelled data, fixing the roll-up problem
              (the label sits on an ancestor box instead of the content node itself)
    '''
    data = load(file_data)
    count = 0
    set_1 = set()
    set_2 = set()
    for page in data:
        _feature = page[0]
        _label = page[1]
        _url = page[2]
        _label_index = np.argmax(_label)
        _label_left = _feature[_label_index][0]
        _label_top = _feature[_label_index][1]
        _label_width = _feature[_label_index][2]
        _label_height = _feature[_label_index][3]
        _label_deepth = _feature[_label_index][4]
        _label_text = _feature[_label_index][7]
        _index = 0
        _re_deepth = 0
        _re_index = -1
        for _box in _feature:
            _left = _box[0]
            _top = _box[1]
            _width = _box[2]
            _height = _box[3]
            _deepth = _box[4]
            _text = _box[7]
            if _deepth>_label_deepth:
                # deeper box fully inside the labelled box, covering most of its area
                # (or most of its area and most of its text)
                if _left>=_label_left and _top>=_label_top and (_left+_width)<=(_label_left+_label_width) and (_top+_height)<=(_label_top+_label_height) and (_width*_height/(_label_width*_label_height)>0.7 or (_width*_height/(_label_width*_label_height)>0.5 and _text/_label_text>0.9)):
                    set_1.add(_url)
                    if _deepth>_re_deepth:
                        _re_deepth = _deepth
                        _re_index = _index
            _index += 1
        if _re_index>-1:
            _label[_label_index] = 0
            _label[_re_index] = 1
            print(_url)
            print(_label_index, _re_index)
    data.sort(key=lambda x:x[2])
    print(len(list(set_1)))
    save(data, "sourceData_36Input_"+str(len(data))+"_relabel.pk")
    data = padding(data)
    save(data, "data_"+str(len(data[1]))+"_relabel.pk")
    return data

if __name__=="__main__":
    #dumpLinkContent()
    '''
    relabel()
    '''
    browser = hd.getdriver()
    _flag,_data = getInput_byJS(browser, "http://hailing.taizhou.gov.cn/art/2019/5/23/art_50810_2498758.html")
    if _flag:
        # _data = [[input_x], list_inner, list_xpath, data_time]
        for item in _data[3]:
            print(item)
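# A standalone restatement of the geometric test relabel() above applies: a
# deeper box replaces the labelled box when it is fully contained in it and
# covers most of its area (or half its area and most of its text). This is a
# hypothetical helper using the same box layout
# [left, top, width, height, deepth, _, _, text]; not part of the original module.
def _should_relabel_demo(label_box, box):
    _ll,_lt,_lw,_lh = label_box[0:4]
    _l,_t,_w,_h = box[0:4]
    if box[4] <= label_box[4]:      # candidate must be deeper than the label
        return False
    inside = _l>=_ll and _t>=_lt and (_l+_w)<=(_ll+_lw) and (_t+_h)<=(_lt+_lh)
    area_ratio = (_w*_h)/(_lw*_lh)
    text_ratio = box[7]/label_box[7] if label_box[7] else 0
    return inside and (area_ratio>0.7 or (area_ratio>0.5 and text_ratio>0.9))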