test.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
'''
Created on 2019-02-27
@author: User
'''
  5. import module.htmlDrawing as hd
  6. import time
  7. import math
  8. import numpy as np
  9. def dealWithScriptOut(data,key_index=4):
  10. list_input = []
  11. list_inner = []
  12. for index in range(len(data)):
  13. #clean nan
  14. for i in range(len(data[index][0])):
  15. if data[index][0][i] is None or math.isnan(data[index][0][i]):
  16. data[index][0][i] = -1
  17. #order by deepth
  18. data.sort(key=lambda x:x[0][key_index])
  19. for item in data:
  20. list_input.append(item[0])
  21. list_inner.append(item[1])
  22. print(item)
  23. print(len(data))
  24. the_max = np.max(list_input,axis=0)
  25. the_max = np.array([x if x>0 else 1 for x in the_max])
  26. the_max = np.array((list(the_max)[2:4]+list(the_max)[2:9])*3)
  27. input_x = np.array(list_input/the_max)
  28. return input_x,list_inner
  29. if __name__=="__main__":
  30. browser = hd.getdriver()
  31. hd.loadPage(browser, "http://www.beian.miit.gov.cn/")
  32. hd.adddriver(browser)
  33. browser.maximize_window()
  34. start = time.time()
  35. scripts = '''
  36. function statisticIframe(nodes){
  37. var counts_communicateTags = 0;
  38. for(var i=0;i<nodes.length;i++){
  39. child = nodes[i]
  40. if (child.tagName!=null){
  41. if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
  42. counts_communicateTags += 1;
  43. }
  44. if(child.tagName.toLowerCase()=="iframe"){
  45. if(child.contentWindow.document!=null){
  46. counts_communicateTags += statisticIframe(child.contentWindow.document.all);
  47. }
  48. }
  49. }
  50. }
  51. return counts_communicateTags;
  52. }
  53. function statistic(node,deepth){
  54. if(node.childNodes==null){
  55. node.counts_communicateTags = 0;
  56. return node.counts_communicateTags;
  57. }
  58. node.counts_communicateTags = 0;
  59. for(var i=0;i<node.childNodes.length;i++){
  60. child = node.childNodes[i];
  61. //删除标签
  62. /*
  63. if (child.tagName!=null){
  64. if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
  65. node.removeChild(child);
  66. continue;
  67. }
  68. }
  69. */
  70. if (child.tagName!=null){
  71. if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
  72. node.counts_communicateTags += 1;
  73. }
  74. }
  75. if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
  76. node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
  77. }else{
  78. node.counts_communicateTags += statistic(child,deepth+1);
  79. }
  80. }
  81. var innertext = node.innerText;
  82. if(innertext){
  83. var text = innertext.replace(/\s/g,'');
  84. //var text = innertext;
  85. node.counts_text = text.length;
  86. var punc = text.match(/;|,|。|:|、/g);
  87. var lines = innertext.match(/.{10}\\n/g);
  88. if(lines){
  89. node.counts_lines = lines.length;
  90. }else{
  91. node.counts_lines = 0;
  92. }
  93. if(punc){
  94. node['counts_punctuations']= punc.length;
  95. }else{
  96. node.counts_punctuations = 0;
  97. }
  98. }else{
  99. node.counts_lines = 0;
  100. node.counts_text = 0;
  101. node.counts_punctuations=0;
  102. }
  103. node.deepth = deepth;
  104. return node.counts_communicateTags;
  105. }
  106. function label(node,targethtml){
  107. var innerhtml = node.innerHTML;
  108. if(innerhtml){
  109. innerhtml = innerhtml.replace(/\s/g,'');
  110. sub_innerhtml = innerhtml.substring(0,40);
  111. if (sub_innerhtml==targethtml.substring(0,40)){
  112. return 1;
  113. }else{
  114. return 0;
  115. }
  116. }else{
  117. return 0;
  118. }
  119. }
  120. function search(){
  121. statistic(document,1);
  122. var objs = document.all;
  123. var data = new Array();
  124. for(var i=0;i<objs.length;i++){
  125. obj = objs[i];
  126. if (obj.offsetWidth>100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
  127. maxArea = 0;
  128. child_maxArea = null;
  129. secondmaxArea = 0;
  130. child_secondmaxArea = null;
  131. for(var j =0;j<obj.childNodes.length;j++){
  132. if(obj.childNodes[j].offsetWidth!=null && obj.childNodes[j].offsetHeight!=null){
  133. if( obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>maxArea){
  134. maxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
  135. child_maxArea = obj.childNodes[j];
  136. }
  137. if(obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight>secondmaxArea && obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight<maxArea){
  138. secondmaxArea = obj.childNodes[j].offsetWidth*obj.childNodes[j].offsetHeight;
  139. child_secondmaxArea = obj.childNodes[j];
  140. }
  141. }
  142. }
  143. _item = new Array();
  144. _item.push(obj.offsetLeft,obj.offsetTop,obj.offsetWidth,obj.offsetHeight,obj.deepth,obj.counts_communicateTags,obj.counts_lines,obj.counts_text,obj.counts_punctuations,
  145. obj.parentNode.offsetLeft,obj.parentNode.offsetTop,obj.parentNode.offsetWidth,obj.parentNode.offsetHeight,obj.parentNode.deepth,obj.parentNode.counts_communicateTags,obj.parentNode.counts_lines,obj.parentNode.counts_text,obj.parentNode.counts_punctuations)
  146. if(child_maxArea!=null){
  147. _item.push(child_maxArea.offsetLeft,child_maxArea.offsetTop,child_maxArea.offsetWidth,child_maxArea.offsetHeight,child_maxArea.deepth,child_maxArea.counts_communicateTags,child_maxArea.counts_lines,child_maxArea.counts_text,child_maxArea.counts_punctuations)
  148. }else{
  149. _item.push(-1,-1,-1,-1,-1,-1,-1,-1,-1)
  150. }
  151. if(child_secondmaxArea!=null){
  152. _item.push(child_secondmaxArea.offsetLeft,child_secondmaxArea.offsetTop,child_secondmaxArea.offsetWidth,child_secondmaxArea.offsetHeight,child_secondmaxArea.deepth,child_secondmaxArea.counts_communicateTags,child_secondmaxArea.counts_lines,child_secondmaxArea.counts_text,child_secondmaxArea.counts_punctuations)
  153. }else{
  154. _item.push(-1,-1,-1,-1,-1,-1,-1,-1,-1)
  155. }
  156. data.push([_item,obj.innerHTML])
  157. }
  158. }
  159. return(data);
  160. }
  161. return (search());
  162. '''
  163. #print(browser.execute_script("return document.getElementsByClassName('detail')[0].offsetWidth;"))
  164. #print(browser.execute_script(scripts))
  165. data = browser.execute_script(scripts)
  166. for item in data:
  167. print(item)
  168. #print(browser.execute_script(scripts))
  169. print("cost",time.time()-start)
  170. dealWithScriptOut(data)