123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178 |
- '''
- Created on 2019-02-27
- @author: User
- '''
- import module.htmlDrawing as hd
- import time
- import math
- import numpy as np
def dealWithScriptOut(data, key_index=4):
    """Clean, sort and normalise the rows returned by the in-page script.

    Every element of ``data`` is indexed as ``item[0]`` (a flat list of
    numeric features) and ``item[1]`` (a payload that is handed back
    untouched; the caller in this file passes the element's ``innerHTML``).
    The page script in this file emits the features in groups of nine:
    left, top, width, height, depth, interactive-tag count, line count,
    text length and punctuation count.

    Processing steps:

    1. ``None`` and NaN feature values are replaced with ``-1``.
    2. ``data`` is sorted ascending by ``item[0][key_index]`` (index 4 is the
       depth of the node itself).
    3. Each group of nine columns is divided by the same scale vector, built
       from the column maxima of the *first* group: left/top are divided by
       the maximum width/height, the remaining seven columns by their own
       maximum.  Maxima that are not strictly positive are replaced by 1 so
       the division is always defined.

    The number of nine-column groups is derived from the row width, so rows
    of 27 features and rows of 36 features (as produced by the script in this
    file) are both accepted.

    Note: ``data`` is modified in place (cleaned and sorted), and every item
    plus the item count are printed to stdout.

    Args:
        data: list of ``[features, payload]`` items.
        key_index: index into ``features`` used as the sort key.

    Returns:
        A tuple ``(input_x, list_inner)`` where ``input_x`` is a float array
        of shape ``(len(data), n_features)`` and ``list_inner`` is the list
        of payloads in sorted order.  Empty ``data`` yields an empty array
        and an empty list.

    Raises:
        ValueError: if the feature rows are not a positive multiple of nine
            columns wide (or are ragged).
    """
    group_width = 9  # features emitted per DOM node by the page script

    if not data:
        # A max-reduction over zero rows would raise; nothing to normalise.
        return np.empty((0, 0)), []

    # Replace missing values so sorting and the max-reduction are well defined.
    for item in data:
        features = item[0]
        for i, value in enumerate(features):
            if value is None or math.isnan(value):
                features[i] = -1

    # Order by depth (or whichever feature key_index selects).
    data.sort(key=lambda item: item[0][key_index])

    list_input = []
    list_inner = []
    for item in data:
        list_input.append(item[0])
        list_inner.append(item[1])
        print(item)
    print(len(data))

    matrix = np.asarray(list_input, dtype=float)
    if matrix.ndim != 2 or matrix.shape[1] == 0 or matrix.shape[1] % group_width:
        raise ValueError(
            "feature rows must have a positive multiple of %d columns, got shape %s"
            % (group_width, matrix.shape)
        )

    col_max = matrix.max(axis=0)
    col_max = np.where(col_max > 0, col_max, 1)

    # Scale for one group: [max_w, max_h] for left/top, then the seven own maxima.
    group_scale = np.concatenate((col_max[2:4], col_max[2:group_width]))
    # The first group's scale is reused for every group, however many there are.
    scale = np.tile(group_scale, matrix.shape[1] // group_width)

    input_x = matrix / scale
    return input_x, list_inner
if __name__ == "__main__":
    # Manual smoke run: open a page, run the feature-extraction script inside
    # it, print what comes back and feed it through dealWithScriptOut().
    browser = hd.getdriver()
    hd.loadPage(browser, "http://www.beian.miit.gov.cn/")
    # NOTE(review): adddriver() is defined in module.htmlDrawing and not
    # visible here -- confirm what it does with the driver handle.
    hd.adddriver(browser)
    browser.maximize_window()
    start = time.time()

    # JavaScript executed in the page.  For every element larger than
    # 100x100 px that has child nodes, search() builds a row of 4 x 9 numbers
    # (the element, its parent, its largest child and a second child by
    # area; -1 placeholders when a child is missing) and pairs it with the
    # element's innerHTML.
    #
    # Backslashes in the JavaScript regexes are doubled because this is a
    # regular (non-raw) Python literal: the browser receives /\s/g and
    # /.{10}\n/g.
    #
    # NOTE(review): in search(), the "second largest" child is only recorded
    # when it comes after a larger child in childNodes; a previous maximum is
    # not demoted when a new maximum is found.  Confirm whether any consumer
    # of these features depends on that before changing it.
    scripts = '''
    function statisticIframe(nodes){
        var counts_communicateTags = 0;
        for(var i=0;i<nodes.length;i++){
            child = nodes[i]
            if (child.tagName!=null){
                if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                    counts_communicateTags += 1;
                }
                if(child.tagName.toLowerCase()=="iframe"){
                    if(child.contentWindow.document!=null){
                        counts_communicateTags += statisticIframe(child.contentWindow.document.all);
                    }
                }
            }
        }
        return counts_communicateTags;
    }
    function statistic(node,deepth){
        if(node.childNodes==null){
            node.counts_communicateTags = 0;
            return node.counts_communicateTags;
        }
        node.counts_communicateTags = 0;
        for(var i=0;i<node.childNodes.length;i++){
            child = node.childNodes[i];
            //删除标签
            /*
            if (child.tagName!=null){
                if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
                    node.removeChild(child);
                    continue;
                }
            }
            */
            if (child.tagName!=null){
                if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                    node.counts_communicateTags += 1;
                }
            }
            if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
                node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
            }else{
                node.counts_communicateTags += statistic(child,deepth+1);
            }

        }
        var innertext = node.innerText;
        if(innertext){
            var text = innertext.replace(/\\s/g,'');
            //var text = innertext;
            node.counts_text = text.length;
            var punc = text.match(/;|,|。|:|、/g);
            var lines = innertext.match(/.{10}\\n/g);
            if(lines){
                node.counts_lines = lines.length;
            }else{
                node.counts_lines = 0;
            }
            if(punc){
                node['counts_punctuations']= punc.length;
            }else{
                node.counts_punctuations = 0;
            }

        }else{
            node.counts_lines = 0;
            node.counts_text = 0;
            node.counts_punctuations=0;
        }
        node.deepth = deepth;
        return node.counts_communicateTags;
    }
    function label(node,targethtml){
        var innerhtml = node.innerHTML;
        if(innerhtml){
            innerhtml = innerhtml.replace(/\\s/g,'');
            sub_innerhtml = innerhtml.substring(0,40);
            if (sub_innerhtml==targethtml.substring(0,40)){
                return 1;
            }else{
                return 0;
            }
        }else{
            return 0;
        }
    }
    function search(){
        statistic(document,1);
        var objs = document.all;
        var data = new Array();
        for(var i=0;i<objs.length;i++){
            obj = objs[i];
            if (obj.offsetWidth>100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
                maxArea = 0;
                child_maxArea = null;
                secondmaxArea = 0;
                child_secondmaxArea = null;
                for(var j =0;j<obj.childNodes.length;j++){
                    if(obj.childNodes[j].offsetWidth!=null && obj.childNodes[j].offsetHeight!=null){
                        if( obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>maxArea){
                            maxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
                            child_maxArea = obj.childNodes[j];
                        }
                        if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>secondmaxArea && obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight<maxArea){
                            secondmaxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
                            child_secondmaxArea = obj.childNodes[j];
                        }
                    }
                }
                _item = new Array();
                _item.push(obj.offsetLeft,obj.offsetTop,obj.offsetWidth,obj.offsetHeight,obj.deepth,obj.counts_communicateTags,obj.counts_lines,obj.counts_text,obj.counts_punctuations,
                obj.parentNode.offsetLeft,obj.parentNode.offsetTop,obj.parentNode.offsetWidth,obj.parentNode.offsetHeight,obj.parentNode.deepth,obj.parentNode.counts_communicateTags,obj.parentNode.counts_lines,obj.parentNode.counts_text,obj.parentNode.counts_punctuations)
                if(child_maxArea!=null){
                    _item.push(child_maxArea.offsetLeft,child_maxArea.offsetTop,child_maxArea.offsetWidth,child_maxArea.offsetHeight,child_maxArea.deepth,child_maxArea.counts_communicateTags,child_maxArea.counts_lines,child_maxArea.counts_text,child_maxArea.counts_punctuations)
                }else{
                    _item.push(-1,-1,-1,-1,-1,-1,-1,-1,-1)
                }

                if(child_secondmaxArea!=null){
                    _item.push(child_secondmaxArea.offsetLeft,child_secondmaxArea.offsetTop,child_secondmaxArea.offsetWidth,child_secondmaxArea.offsetHeight,child_secondmaxArea.deepth,child_secondmaxArea.counts_communicateTags,child_secondmaxArea.counts_lines,child_secondmaxArea.counts_text,child_secondmaxArea.counts_punctuations)
                }else{
                    _item.push(-1,-1,-1,-1,-1,-1,-1,-1,-1)
                }
                data.push([_item,obj.innerHTML])

            }
        }
        return(data);
    }
    return (search());
    '''

    data = browser.execute_script(scripts)
    for item in data:
        print(item)
    # Elapsed wall-clock time since the window was maximised, i.e. the cost
    # of running the script in the page and printing its result.
    print("cost", time.time() - start)
    dealWithScriptOut(data)
|