# featureEngine.py
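'''
Feature engineering for web-page title detection: runs an in-browser script
that collects candidate title nodes with layout/text features, and turns the
results into normalised, padded arrays for model training.

Created on 2019-08-08

@author: User
'''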
  5. import re
  6. import time
  7. from keras.preprocessing.sequence import pad_sequences
  8. scripts_title = '''
  9. function statisticIframe(nodes){
  10. var counts_communicateTags = 0;
  11. for(var i=0;i<nodes.length;i++){
  12. child = nodes[i]
  13. if (child.tagName!=null){
  14. if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
  15. counts_communicateTags += 1;
  16. }
  17. if(child.tagName.toLowerCase()=="iframe"){
  18. if(child.contentWindow.document!=null){
  19. counts_communicateTags += statisticIframe(child.contentWindow.document.all);
  20. }
  21. }
  22. }
  23. }
  24. return counts_communicateTags;
  25. }
  26. function statistic(node,deepth){
  27. if(node.childNodes==null){
  28. node.counts_communicateTags = 0;
  29. return node.counts_communicateTags;
  30. }
  31. node.counts_communicateTags = 0;
  32. for(var i=0;i<node.childNodes.length;i++){
  33. child = node.childNodes[i];
  34. //删除标签
  35. /*
  36. if (child.tagName!=null){
  37. if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
  38. node.removeChild(child);
  39. continue;
  40. }
  41. }
  42. */
  43. if (child.tagName!=null){
  44. if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
  45. node.counts_communicateTags += 1;
  46. }
  47. }
  48. /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
  49. node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
  50. }else{
  51. node.counts_communicateTags += statistic(child,deepth+1);
  52. }*/
  53. node.counts_communicateTags += statistic(child,deepth+1);
  54. }
  55. var innertext = node.innerText;
  56. if(innertext){
  57. var text = innertext.replace(/\s/g,'');
  58. //var text = innertext;
  59. node.counts_text = text.length;
  60. var punc = text.match(/;|,|。|:|、/g);
  61. var lines = innertext.match(/.{10}\\n/g);
  62. if(lines){
  63. node.counts_lines = lines.length;
  64. }else{
  65. node.counts_lines = 0;
  66. }
  67. if(punc){
  68. node['counts_punctuations']= punc.length;
  69. }else{
  70. node.counts_punctuations = 0;
  71. }
  72. }else{
  73. node.counts_lines = 0;
  74. node.counts_text = 0;
  75. node.counts_punctuations=0;
  76. }
  77. node.deepth = deepth;
  78. return node.counts_communicateTags;
  79. }
  80. function recursive_candidate_title(node,list_candidate_title,maxWidth,maxHeight){
  81. if(node==document){
  82. var _flag = true;
  83. var list_node_true = new Array();
  84. for(var i=0;i<node.childNodes.length;i++){
  85. child = node.childNodes[i];
  86. if(child.offsetWidth>maxWidth){
  87. maxWidth = child.offsetWidth;
  88. }
  89. if(child.offsetHeight>maxHeight){
  90. maxHeight = child.offsetHeight;
  91. }
  92. }
  93. for(var i=0;i<node.childNodes.length;i++){
  94. var child = node.childNodes[i];
  95. var _result = recursive_candidate_title(child,list_candidate_title,maxWidth,maxHeight);
  96. if(_result!=null){
  97. if(!_result[1]){
  98. _flag = false;
  99. }else{
  100. list_node_true.push(child);
  101. }
  102. }
  103. }
  104. if(_flag){
  105. }else{
  106. for(var i=0;i<list_node_true.length;i++){
  107. list_candidate_title.push([node,node.innerHTML]);
  108. }
  109. }
  110. }else{
  111. if(node.nodeType!=1){
  112. return null;
  113. }
  114. if(node.innerText==null || node.innerText==""){
  115. return null;
  116. }
  117. var _node_fontSize = window.getComputedStyle(node).fontSize;
  118. if(node.childNodes==null){
  119. return [_node_fontSize,true];
  120. }else{
  121. for(var i=0;i<node.childNodes.length;i++){
  122. child = node.childNodes[i];
  123. if(child.offsetWidth>maxWidth){
  124. maxWidth = child.offsetWidth;
  125. }
  126. if(child.offsetHeight>maxHeight){
  127. maxHeight = child.offsetHeight;
  128. }
  129. }
  130. var _flag = true;
  131. var list_node_true = new Array();
  132. for(var i=0;i<node.childNodes.length;i++){
  133. var child = node.childNodes[i];
  134. var _result = recursive_candidate_title(child,list_candidate_title,maxWidth,maxHeight);
  135. if(_result!=null){
  136. if(!_result[1]){
  137. _flag = false;
  138. }else{
  139. list_node_true.push(child);
  140. }
  141. if(_node_fontSize!=_result[0]){
  142. _flag = false;
  143. }
  144. }
  145. }
  146. if(_flag){
  147. return [_node_fontSize,true];
  148. }else{
  149. for(var i=0;i<list_node_true.length;i++){
  150. var child_true = list_node_true[i]
  151. if(child_true.offsetWidth>100 && getOffsetTop(child_true)>0){
  152. var _fontWeight = window.getComputedStyle(child_true).fontWeight
  153. var _weight = 400;
  154. if(_fontWeight=="normal"){
  155. _weight = 400;
  156. }else if(_fontWeight=="bold"){
  157. _weight = 700;
  158. }else if(_fontWeight=="lighter"){
  159. _weight = 200;
  160. }else if(_fontWeight=="bolder"){
  161. _weight = 600;
  162. }else{
  163. _weight = parseInt(_fontWeight)
  164. }
  165. var _fontSize = parseInt(window.getComputedStyle(child_true).fontSize.match(/\d+/)[0])
  166. list_candidate_title.push([[maxWidth,maxHeight,getOffsetLeft(child_true),getOffsetTop(child_true),child_true.offsetWidth,child_true.offsetHeight,_fontSize,_weight,child_true.counts_text,child_true.counts_lines,child_true.counts_punctuations,child_true.counts_communicateTags],child_true.innerHTML,getListXpath(child_true,new Array())]);
  167. }
  168. }
  169. return [_node_fontSize,false];
  170. }
  171. }
  172. }
  173. }
  174. var list_candidate_title = new Array();
  175. statistic(document,1);
  176. recursive_candidate_title(document,list_candidate_title,0,0);
  177. return list_candidate_title;
  178. '''
  179. import module.htmlDrawing as hd
  180. import numpy as np
  181. import math
  182. from module.Utils import *
  183. def dealWithScriptOut(data,sort_index=3):
  184. list_input = []
  185. list_inner = []
  186. list_xpath = []
  187. list_top = []
  188. for index in range(len(data)):
  189. #clean nan
  190. for i in range(len(data[index][0])):
  191. if data[index][0][i] is None or math.isnan(data[index][0][i]):
  192. data[index][0][i] = -1
  193. data.sort(key=lambda x:x[0][sort_index])
  194. for item in data:
  195. list_input.append(item[0])
  196. list_inner.append(item[1])
  197. list_xpath.append(item[2])
  198. list_top.append(item[0][3])
  199. #print(len(data))
  200. if len(list_input)>0:
  201. the_max = np.max(list_input,axis=0)
  202. the_max = np.array([x if x>0 else 1 for x in the_max])
  203. the_max = np.array(list(the_max)[0:2]*3+[16,400,20,20,20,20])
  204. input_x = np.array(list_input/the_max)
  205. return input_x,list_inner,list_xpath,list_top
  206. else:
  207. return None
  208. def getInput_byJS(browser,url):
  209. try:
  210. # browser = hd.getdriver()
  211. # debug("get driver")
  212. # hd.loadPage(browser, url)
  213. # data = browser.execute_script(scripts_common+scripts_title)
  214. data = get_js_rs(browser, scripts_common+scripts_title)
  215. deal_data = dealWithScriptOut(data)
  216. if deal_data is None:
  217. return False,""
  218. else:
  219. input_x,list_inner,list_xpath,list_height = deal_data
  220. return True,[[np.expand_dims(input_x,0)],list_inner,list_xpath,list_height]
  221. except Exception as e:
  222. error(str(e))
  223. err_msg = ""
  224. if re.search("frame",str(e)) is not None:
  225. err_msg = "#iframe#"
  226. return None,err_msg
  227. # finally:
  228. # hd.adddriver(browser)
  229. # debug("release driver")
  230. def encodeInput_byJS(url,targethtml):
  231. def label(innerhtml,target_source):
  232. target_source =re.sub("[\r\n\s]","",str(target_source))
  233. pattern = ">(.*)<"
  234. target_source = re.findall(re.compile(pattern), target_source)[0]
  235. innerhtml = re.sub("[\r\n\s]","",str(innerhtml))
  236. #print(target_source[0:40])
  237. #print(element_source[0:40])
  238. #if target_source[0:10]==element_source[0:10] and target_source[-10:]==element_source[-10]:
  239. if target_source==innerhtml:
  240. return 1
  241. return 0
  242. try:
  243. browser = hd.getdriver()
  244. debug("get driver")
  245. start = time.time()
  246. hd.loadPage(browser, url)
  247. print("get",time.time()-start)
  248. browser.maximize_window()
  249. start = time.time()
  250. # data = browser.execute_script(scripts_common+scripts_title)
  251. data = get_js_rs(browser, scripts_common+scripts_title)
  252. input_x,list_inner,_,_ = dealWithScriptOut(data)
  253. list_label = []
  254. for item in list_inner:
  255. list_label.append(label(item, targethtml))
  256. if len(list_label)>0 and np.sum(list_label)==1:
  257. return input_x,np.array(list_label)
  258. else:
  259. return None
  260. print("cost",time.time()-start)
  261. except Exception as e:
  262. print(e)
  263. finally:
  264. hd.adddriver(browser)
  265. debug("release driver")
  266. return None
  267. def dumpLinkTitle():
  268. def trytosave(d):
  269. try:
  270. save(d,"1.pk")
  271. return 1
  272. except Exception as e:
  273. return 0
  274. import cx_Oracle as cx_Oracle
  275. conn=cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl') #连接数据库
  276. cursor=conn.cursor()
  277. sql = " select page_link,page_title from DETAIL_CONTENT_HTML where page_link is not null and page_type=1 and page_title like '<%' and page_title not like '<a%' "
  278. cursor.execute(sql)
  279. data = []
  280. while(True):
  281. try:
  282. rows = cursor.fetchmany(10)
  283. if not rows:
  284. break
  285. for row in rows:
  286. if trytosave(row)==1:
  287. data.append(row)
  288. except Exception as e:
  289. print(e)
  290. save(data,"Link_Title.pk")
  291. def getAllData():
  292. all_data = load("Link_Title.pk")
  293. data = []
  294. temp_file ="temp_data.pk"
  295. count = 0
  296. label = 0
  297. data_len = len(all_data)
  298. for row in all_data:
  299. count += 1
  300. print(str(label)+"/"+str(count)+"/"+str(data_len),row[0])
  301. #encode = encodeInput(row[0], row[1])
  302. if count%100==0:
  303. save(data,temp_file)
  304. encode = encodeInput_byJS(row[0], row[1])
  305. if encode:
  306. label += 1
  307. x,y = encode
  308. data.append([x,y,row[0]])
  309. else:
  310. print("None")
  311. save(data,"data_done.pk")
  312. return data
  313. def filter():
  314. list_length = []
  315. data = load("temp_data.pk")
  316. print(data[0])
  317. data.sort(key = lambda x:x[2])
  318. new_data = []
  319. for item in data:
  320. list_length.append(len(item[0]))
  321. if len(item[0])<100:
  322. new_data.append(item)
  323. print(max(list_length))
  324. print(len(data))
  325. print(len(new_data))
  326. save(new_data,"source_12input.pk")
  327. def paddinig(all_data,pad=True):
  328. max_len = np.max([len(data[1]) for data in all_data])
  329. print("max_len",max_len)
  330. #max_len = 200
  331. list_x = []
  332. list_y = []
  333. list_url = []
  334. for data in all_data:
  335. input_x = data[0]
  336. label_y = data[1]
  337. url = data[2]
  338. if pad:
  339. input_x = np.transpose(pad_sequences(np.transpose(input_x,(1,0)), max_len,padding="post", truncating="post", value=0,dtype="float32"),(1,0))
  340. list_x.append(input_x)
  341. label_y = pad_sequences([label_y],max_len,padding="post", truncating="post", value=-1)[0]
  342. #list_y.append(label_y)
  343. list_y.append([(np.arange(2)==i).astype(np.integer) for i in label_y])
  344. else:
  345. #input_x = np.array(input_x)
  346. list_x.append([input_x])
  347. list_y.append([(np.arange(2)==i).astype(np.integer) for i in label_y])
  348. list_url.append(url)
  349. return [np.array(list_x),np.array(list_y),list_url]
  350. if __name__=="__main__":
  351. #data = getInput_byJS("http://www.tonghua.gov.cn/cjj/zbtb/201908/t20190802_360119.html")
  352. #dumpLinkTitle()
  353. #getAllData()
  354. #filter()
  355. data = paddinig(load("source_12input.pk"))
  356. save(data,"source_12input_padding.pk")