'''
Created on 2019-08-12
@author: User
'''
import module.htmlDrawing as hd
import math
import numpy as np
from module.Utils import *
from keras.preprocessing.sequence import pad_sequences
import re
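# script_content is JavaScript injected into a loaded page (prepended with
# scripts_common from module.Utils, which appears to also supply the helpers
# check/getOffsetLeft/getOffsetTop/getListXpath and the Set implementation
# used below). It attaches layout/text statistics to every DOM node and
# returns one feature row per sufficiently large element; the row layout is
# documented after the script.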
script_content = '''
function label(node,set_url){
    var node_flag = check(node,set_url);
    var child_flag = false;
    if(node.childNodes!=null){
        for(var i=0;i<node.childNodes.length;i++){
            var child = node.childNodes[i];
            if(check(child,set_url)){
                child_flag = true;
            }
        }
    }
    if(node_flag && !child_flag){
        return 1;
    }else{
        return 0;
    }
}
function statisticIframe(nodes){
    var counts_communicateTags = 0;
    for(var i=0;i<nodes.length;i++){
        var child = nodes[i];
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                counts_communicateTags += 1;
            }
            if(child.tagName.toLowerCase()=="iframe"){
                if(child.contentWindow.document!=null){
                    counts_communicateTags += statisticIframe(child.contentWindow.document.all);
                }
            }
        }
    }
    return counts_communicateTags;
}
function statistic(node,deepth){
    if(node.childNodes==null){
        node.counts_communicateTags = 0;
        node.counts_tag = 0;
        node.entropy_width = 0;
        node.entropy_height = 0;
        return node.counts_communicateTags;
    }
    node.counts_communicateTags = 0;
    node.counts_tag = 0;
    var set_tag = new Set();
    var list_width = [];
    var list_height = [];
    for(var i=0;i<node.childNodes.length;i++){
        var child = node.childNodes[i];
        //remove tags
        /*
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
                node.removeChild(child);
                continue;
            }
        }
        */
        if(child.offsetWidth){
            list_width.push(child.offsetWidth);
        }
        if(child.offsetHeight){
            list_height.push(child.offsetHeight);
        }
        node.counts_tag += 1;
        if (child.tagName!=null){
            set_tag.add(child.tagName.toLowerCase());
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                node.counts_communicateTags += 1;
            }
        }
        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
            node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
        }else{
            node.counts_communicateTags += statistic(child,deepth+1);
        }*/
        node.counts_communicateTags += statistic(child,deepth+1);
    }
    //size() as a method suggests the Set here is the polyfill from scripts_common, not the native Set
    node.counts_tagType = set_tag.size();
    var sum_width = 0;
    var sum_height = 0;
    var avg_width = 0;
    var avg_height = 0;
    var entropy_width = 0;
    var entropy_height = 0;
    if(list_width.length>0){
        for(var i=0;i<list_width.length;i++){
            sum_width += list_width[i];
        }
        for(var i=0;i<list_height.length;i++){
            sum_height += list_height[i];
        }
        avg_width = sum_width/list_width.length;
        avg_height = sum_height/list_height.length;
        //"entropy" here is really the variance of the children's sizes, capped at 1000 below
        for(var i=0;i<list_width.length;i++){
            entropy_width += Math.pow(list_width[i]-avg_width,2);
        }
        for(var i=0;i<list_height.length;i++){
            entropy_height += Math.pow(list_height[i]-avg_height,2);
        }
        entropy_width /= list_width.length;
        entropy_height /= list_height.length;
    }
    entropy_width = entropy_width>1000?1000:entropy_width;
    entropy_height = entropy_height>1000?1000:entropy_height;
    node.entropy_width = entropy_width;
    node.entropy_height = entropy_height;
    var innertext = node.innerText;
    if(innertext){
        var pattern_time = /([^\d]?(\d{4}|\d{2})\s*[-\/::年.]\s*\d{1,2}\s*[-\/::月.]\s*\d{1,2}[^\d]?)|([^\d]?\d{2,4}\s*[-\/月年]\s*\d{1,2}[^\d]?)/;
        var text = innertext.replace(/\s/g,'');
        //var text = innertext;
        node.counts_text = text.length;
        var punc = text.match(/;|,|。|:|、/g);
        var lines = innertext.match(/.{10}\\n/g);
        var times = innertext.match(pattern_time);
        if(lines){
            node.counts_lines = lines.length;
        }else{
            node.counts_lines = 0;
        }
        if(punc){
            node.counts_punctuations = punc.length;
        }else{
            node.counts_punctuations = 0;
        }
        if(times){
            node.counts_times = times.length;
        }else{
            node.counts_times = 0;
        }
    }else{
        node.counts_lines = 0;
        node.counts_text = 0;
        node.counts_punctuations = 0;
        node.counts_times = 0;
    }
    node.deepth = deepth;
    return node.counts_communicateTags;
}
function search(str_url){
    statistic(document,1);
    var objs = document.all;
    var set_url = new Set();
    //str_url is the set of target hrefs joined with the separator "比地"
    var list_url = str_url.split("比地");
    for(var i=0;i<list_url.length;i++){
        if(list_url[i]!=""){
            set_url.add(list_url[i]);
        }
    }
    var data = new Array();
    for(var i=0;i<objs.length;i++){
        var obj = objs[i];
        if (obj.offsetWidth>100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
            var _item = new Array();
            _item.push(getOffsetLeft(obj),getOffsetTop(obj),obj.offsetWidth,obj.offsetHeight,obj.deepth,obj.counts_text,obj.counts_times,obj.counts_tagType,obj.counts_tag,obj.entropy_width,obj.entropy_height);
            data.push([_item,label(obj,set_url),obj.innerHTML,getListXpath(obj,new Array())]);
        }
    }
    return data;
}
return search(arguments[0])
'''
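# Each row returned by script_content is
#   [[left, top, width, height, deepth, counts_text, counts_times,
#     counts_tagType, counts_tag, entropy_width, entropy_height],
#    label, innerHTML, list_xpath]
# which is the 11-feature vector consumed by dealWithScriptOut below.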
script_get_A_Date = '''
function is_similar(source,target){
    var diff_index = -1;
    var source_split = source.split(/(\d+)/);
    var target_split = target.split(/(\d+)/);
    if(source_split.length==target_split.length){
        var diff_count = 0;
        for(var i=0;i<source_split.length;i++){
            if(source_split[i]!=target_split[i]){
                if(diff_index==-1){
                    if(source_split[i].search(/^\d+$/)>=0 && target_split[i].search(/^\d+$/)>=0){
                        diff_index = i;
                    }else{
                        //the differing segment must be numeric
                        return -1;
                    }
                }
                diff_count += 1;
            }
        }
        if(diff_count==1){
            return diff_index;
        }else{
            return -1;
        }
    }else{
        return -1;
    }
}
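//e.g. is_similar("/div[1]/a","/div[2]/a") returns 1 (the index of the single
//differing numeric segment after the split), while
//is_similar("/div[1]/a","/span[2]/a") returns -1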
function getNode_listContent(xpath){
    /*
    var objs = document.all;
    for(var i=0;i<objs.length;i++){
        var obj = objs[i];
        if(obj!=null && getXpath(obj,[])==xpath){
            return obj;
        }
    }
    return null;
    */
    var objs = findElements_byXpath(xpath);
    if(objs.length>0){
        return objs[0];
    }
    return null;
}
function statistic_time(node,_array){
    var pattern_time = /([^\d]?(\d{4}|\d{2})\s*[-\/::年.]\s*\d{1,2}\s*[-\/::月.]\s*\d{1,2}[^\d]?)|([^\d]?\d{2,4}\s*[-\/月年]\s*\d{1,2}[^\d]?)/;
    //descend into children whose text still contains a date and collect the
    //xpaths of the deepest such nodes
    var _find_flag = false;
    if (node.childNodes!=null){
        for(var i=0;i<node.childNodes.length;i++){
            var childNode = node.childNodes[i];
            var _innerText = childNode.innerText;
            if(childNode!=null && childNode.tagName!=null && childNode.tagName.toLowerCase()=="script"){
                continue;
            }
            if (_innerText!=null && _innerText.search(pattern_time)>=0){
                statistic_time(childNode,_array);
                _find_flag = true;
            }
        }
    }
    if (!_find_flag){
        _array.push(getXpath(node,["tr","li"],true));
    }
    return _array;
}
function padding_href(href){
    //resolve a (possibly relative) href against window.location.href
    var baseUrl = window.location.href;
    var baseUrl_split = baseUrl.split("/");
    var join_flag = true;
    var href_padded = "";
    var level_nums = 1;
    var filename = "";
    if(href==null){
        join_flag = false;
    }else if(href.indexOf("javascript")>-1){
        join_flag = false;
    }else if(href.indexOf("http")>-1){
        join_flag = false;
        href_padded = href;
    }else if(href.indexOf("./")==0){
        filename = href.substring(2);
    }else if(href.indexOf("../")==0){
        level_nums ++;
        var _substr = href.substring(3);
        while(true){
            if(_substr.indexOf("../")==0){
                level_nums ++;
                _substr = _substr.substring(3);
            }else{
                filename = _substr;
                break;
            }
        }
    }else if(href.indexOf("/")==0){
        //root-relative link: keep protocol and host from the base url
        level_nums = baseUrl_split.length-3;
        filename = href.substring(1);
    }else if(href.indexOf("?")==0){
        var _href = baseUrl.split("?")[0]+href;
        return _href;
    }else{
        filename = href;
    }
    if(join_flag){
        for(var i=0;i<baseUrl_split.length-level_nums;i++){
            href_padded += baseUrl_split[i]+"/";
        }
        href_padded += filename;
    }
    return href_padded;
}
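//e.g. with location "http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm",
//padding_href("../a/b.htm") yields "http://www.dp.gov.cn/dpxw/a/b.htm"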
function statistic_A(node){
    var list_a = node.getElementsByTagName("a");
    //clustering_turnPage is expected from scripts_common; it clusters paging links
    var clustered_turnPage = clustering_turnPage();
    var array_xpath_turnPage = new Set();
    for(var i=0;i<clustered_turnPage.length;i++){
        array_xpath_turnPage.add(padding_href(clustered_turnPage[i][0].href));
    }
    var set_aXpath = new Set();
    var set_href = new Set();
    for(var i=0;i<list_a.length;i++){
        var _href = padding_href(list_a[i].href);
        var is_turnPage = false;
        var _xpath = getXpath(list_a[i],["tr","li"],true);
        if(array_xpath_turnPage.contains(_href)){
            is_turnPage = true;
        }
        if(!is_turnPage){
            set_aXpath.add(_xpath);
            if(_href!=""){
                set_href.add(_href);
            }
        }
    }
    //contains() and dataStore belong to the Set implementation in scripts_common
    return [set_aXpath.dataStore,set_href.dataStore];
}
function similar_all(_xpath,array_xpath){
    var similar_index = -1;
    for(var h=0;h<array_xpath.length;h++){
        var diff_index = is_similar(_xpath,array_xpath[h]);
        if( similar_index>-1 && similar_index!=diff_index){
            return -1;
        }
        similar_index = diff_index;
        if(diff_index<=-1){
            return -1;
        }
    }
    return similar_index;
}
function clustering_xpath(array_xpath){
    var array_class = new Array();
    for(var i=0;i<array_xpath.length;i++){
        var matched = false;
        for(var j=0;j<array_class.length;j++){
            //the xpath must be similar to every xpath already in the class
            var diff_index = similar_all(array_xpath[i],array_class[j][1]);
            if(diff_index>-1){
                matched = true;
                if(array_class[j][0].indexOf(diff_index)==-1){
                    array_class[j][0].push(diff_index);
                }
                if(array_class[j][1].indexOf(array_xpath[i])<0){
                    array_class[j][1].push(array_xpath[i]);
                }
            }
        }
        //open a new class only when the xpath joined no existing one
        if(!matched){
            array_class.push([[],[array_xpath[i]]]);
        }
    }
    //return the largest class
    var _max_length = 0;
    var _max_index = -1;
    for(var i=0;i<array_class.length;i++){
        if(array_class[i][1].length>_max_length){
            _max_length = array_class[i][1].length;
            _max_index = i;
        }
    }
    return array_class[_max_index];
}
function search(content_xpath){
    try{
        var content_node = getNode_listContent(content_xpath); //locate the list block on the list page
        if(content_node!=null){
            var array_a_href = statistic_A(content_node);
            var array_a = array_a_href[0];
            var array_href = new Array();
            var array_date = new Array();
            statistic_time(content_node,array_date);
            var _clustered_a = clustering_xpath(array_a);
            var _clustered_date = clustering_xpath(array_date);
            //keep only the hrefs whose <a> xpath made it into the main cluster
            for(var i=0;i<array_a.length;i++){
                if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
                    array_href.push(array_a_href[1][i]);
                }
            }
            return [_clustered_a,_clustered_date,array_href];
        }
        return null;
    }
    catch(e){
        return null;
    }
}
return search(arguments[0]);
'''
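# For the block located by content_xpath, script_get_A_Date returns
#   [clustered_a, clustered_date, hrefs]
# where clustered_a/clustered_date are [diff_digit_indices, member_xpaths]
# pairs from clustering_xpath and hrefs are the resolved links of the
# clustered <a> tags.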
def dealWithScriptOut(data):
    '''Clean, sort and normalize the feature rows returned by script_content.'''
    list_input = []
    list_label = []
    list_inner = []
    list_xpath = []
    for index in range(len(data)):
        # clean nan
        for i in range(len(data[index][0])):
            if data[index][0][i] is None or math.isnan(data[index][0][i]):
                data[index][0][i] = -1
    # order by area (width*height), largest block first
    data.sort(key=lambda x: x[0][2]*x[0][3], reverse=True)
    for item in data:
        list_input.append(item[0])
        list_label.append(item[1])
        list_inner.append(item[2])
        list_xpath.append(item[3])
    #print(len(data))
    if len(list_input)>0:
        the_max = np.max(list_input, axis=0)
        the_max = np.array([x if x>0 else 1 for x in the_max])
        # left/top are scaled by the max width/height; counts_tagType and
        # counts_tag use the fixed divisors 10 and 20
        the_max = np.array(list(the_max)[2:4]*2+list(the_max)[4:7]+[10,20]+list(the_max)[9:])
        input_x = np.array(list_input/the_max)
        return input_x, list_label, list_inner, list_xpath
    else:
        return None
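# Sketch with synthetic numbers: for the single row
#   [[0,0,800,600,2,120,3,5,12,40.0,25.0], 1, "<div>...</div>", ["/html/body/div"]]
# dealWithScriptOut scales the feature vector elementwise by
#   [800,600,800,600,2,120,3,10,20,40.0,25.0]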
def encodeInput_byJS(url,str_href):
    browser = None
    try:
        browser = hd.getdriver()
        debug("get driver")
        hd.loadPage(browser, url)
        # data = browser.execute_script(scripts_common+script_content,str_href)
        data = get_js_rs(browser, scripts_common+script_content, str_href)
        deal_data = dealWithScriptOut(data)
        if deal_data is None:
            return None
        input_x,list_label,list_inner,list_xpath = deal_data
        # keep the sample only when exactly one element is labeled positive
        if np.sum(list_label)==1:
            return input_x, np.array(list_label)
        else:
            return None
    except Exception as e:
        log(str(e))
    finally:
        if browser is not None:
            hd.adddriver(browser)
            debug("release driver")
    return None
def getInput_byJS(browser,url,str_href):
    try:
        # hd.loadPage(browser,url)
        # data = browser.execute_script(scripts_common+script_content,str_href)
        data = get_js_rs(browser, scripts_common+script_content, str_href)
        deal_data = dealWithScriptOut(data)
        if deal_data is None:
            return None
        else:
            input_x,_,list_inner,list_xpath = deal_data
            return [np.expand_dims(input_x,0)], list_inner, list_xpath
    except Exception as e:
        error(str(e))
        return None
def getRule_A_Date(browser, url, content_xpath):
    '''Derive xpath rules for the links (A) and dates (Date) of a list page.'''
    def appendXpath(list_xpath,_xpath):
        if len(list_xpath)==0:
            list_xpath.append(_xpath)
        else:
            list_xpath.append(list_xpath[-1].split("/")[-1]+"/"+_xpath)
    dict_Rule_A_Date = {"listpage_A":None,
                        "listpage_Date":None,
                        "flag":True,
                        "hasDrew":False}
    # try:
    #     browser = hd.getdriver()
    #     debug("get driver")
    #     hd.loadPage(browser,url)
    list_a = None
    for _content_xpath in [content_xpath,"/html"]:
        # data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
        data = get_js_rs(browser, scripts_common+script_get_A_Date, _content_xpath)
        if data is None:
            log("A_Date not found with xpath:"+_content_xpath)
            continue
        if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
            list_a = data[0]
            list_date = data[1]
            list_hrefs = data[2]
            if list_a is not None and len(list_a[1])==len(list_date[1]):
                log('list_a is not None and len(list_a[1])==len(list_date[1])')
                break
            else:
                log("different length of A and Date:with xpath:"+_content_xpath)
    if list_a is None:
        log("A_Date not found with all xpath")
        return None
    log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
    log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
    log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
    if len(list_a[1])!=len(list_date[1]):
        dict_Rule_A_Date["flag"] = False
        add_err_msg(dict_Rule_A_Date, "#mismatched counts of list-page links and titles#")
        return dict_Rule_A_Date,list_hrefs
    else:
        list_diffindex = list_a[0]
        _xpath = list_a[1][0]
        listpage_a = []
        begin = 0
        list_diffindex.sort(key=lambda x:x)
        _jump_flag = False
        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
        # cut the link xpath at the digit segments that vary across the cluster
        _xpath_split = re.split(r"(\d+)",_xpath)
        for i in range(len(list_diffindex)):
            _index = list_diffindex[i]
            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
                add_err_msg(dict_Rule_A_Date, "#list-page link xpath cannot be split#")
                dict_Rule_A_Date["flag"] = False
                return dict_Rule_A_Date,list_hrefs
            else:
                if i==0:
                    appendXpath(listpage_a,re.search(r"(.*)\[","".join(_xpath_split[:_index])).group(1))
                    begin = _index+1
                elif i<len(list_diffindex):
                    appendXpath(listpage_a,re.search(r"/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
                    begin = _index+1
                else:
                    appendXpath(listpage_a,re.search(r"/(.*)","".join(_xpath_split[begin:])).group(1))
                if i==len(list_diffindex)-1:
                    _group = re.search(r"/(.*)","".join(_xpath_split[begin:]))
                    if _group is not None:
                        appendXpath(listpage_a,_group.group(1))
        for i in range(len(listpage_a)):
            if len(listpage_a[i].split("/"))>6:
                # listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
                listpage_a[i] = get_js_rs(browser, scripts_replaceXpath, listpage_a[i])
        dict_Rule_A_Date["listpage_A"] = listpage_a
        # same splitting, now for the date xpaths
        list_diffindex = list_date[0]
        _xpath = list_date[1][0]
        listpage_date = []
        begin = 0
        list_diffindex.sort(key=lambda x:x)
        _jump_flag = False
        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
        _xpath_split = re.split(r"(\d+)",_xpath)
        for i in range(len(list_diffindex)):
            _index = list_diffindex[i]
            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
                add_err_msg(dict_Rule_A_Date, "#list-page date xpath cannot be split#")
                dict_Rule_A_Date["flag"] = False
                return dict_Rule_A_Date,list_hrefs
            else:
                if i==0:
                    appendXpath(listpage_date,re.search(r"(.*)\[","".join(_xpath_split[:_index])).group(1))
                    begin = _index+1
                elif i<len(list_diffindex):
                    appendXpath(listpage_date,re.search(r"/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
                    begin = _index+1
                else:
                    appendXpath(listpage_date,re.search(r"/(.*)","".join(_xpath_split[begin:])).group(1))
                if i==len(list_diffindex)-1:
                    _group = re.search(r"/(.*)","".join(_xpath_split[begin:]))
                    if _group is not None:
                        appendXpath(listpage_date,_group.group(1))
        for i in range(len(listpage_date)):
            if len(listpage_date[i].split("/"))>6:
                # listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])
                listpage_date[i] = get_js_rs(browser, scripts_replaceXpath, listpage_date[i])
        dict_Rule_A_Date["listpage_Date"] = listpage_date
        return dict_Rule_A_Date,list_hrefs
    # except Exception as e:
    #     error(str(e))
    # finally:
    #     # hd.adddriver(browser)
    #     # debug("release driver")
    #     log('getRule_A_Date done')
    return None
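# Sketch of the splitting above: if the clustered xpath is
# "/html/body/div[2]/ul/li[3]/a" and the varying digit index is 3 (the li
# index), re.split(r"(\d+)", xpath) gives
#   ['/html/body/div[', '2', ']/ul/li[', '3', ']/a']
# and the loop yields listpage_A = ['/html/body/div[2]/ul/li', 'li/a'].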
def dumpLinkContent():
    def trytosave(d):
        try:
            save(d,"1.pk")
            return 1
        except Exception as e:
            return 0
    import cx_Oracle
    conn = cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl') # connect to the database
    cursor = conn.cursor()
    sql = " select page_link,page_content from BXKC.DETAIL_CONTENT_HTML where page_type=0 "
    cursor.execute(sql)
    data = []
    while(True):
        try:
            rows = cursor.fetchmany(10)
            if not rows:
                break
            for row in rows:
                if trytosave(row)==1:
                    data.append(row)
        except Exception as e:
            print(e)
    save(data,"Link_Content.pk")
def getAllData():
    all_data = load("Link_Content.pk")
    data = []
    temp_file = "temp_data.pk"
    count = 0
    label = 0
    data_len = len(all_data)
    for row in all_data:
        count += 1
        print(str(label)+"/"+str(count)+"/"+str(data_len),row[0])
        #encode = encodeInput(row[0], row[1])
        if count%100==0:
            save(data,temp_file)
        encode = encodeInput_byJS(row[0], row[1])
        if encode:
            label += 1
            x,y = encode
            data.append([x,y,row[0]])
        else:
            print("None")
    save(data,"data_done.pk")
    return data
def filter():
    list_length = []
    data = load("temp_data.pk")
    print(data[0])
    data.sort(key=lambda x:x[2])
    new_data = []
    for item in data:
        list_length.append(len(item[0]))
        if len(item[0])<100:
            new_data.append(item)
    print(max(list_length))
    print(len(data))
    print(len(new_data))
    save(new_data,"source_11input.pk")
def padding(all_data,pad=True):
    max_len = np.max([len(data[1]) for data in all_data])
    print("max_len",max_len)
    #max_len = 200
    list_x = []
    list_y = []
    list_url = []
    for data in all_data:
        input_x = data[0]
        label_y = data[1]
        url = data[2]
        if pad:
            input_x = np.transpose(pad_sequences(np.transpose(input_x,(1,0)), max_len, padding="post", truncating="post", value=0, dtype="float32"),(1,0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y], max_len, padding="post", truncating="post", value=-1)[0]
            #list_y.append(label_y)
            list_y.append([(np.arange(2)==i).astype(np.integer) for i in label_y])
        else:
            #input_x = np.array(input_x)
            list_x.append([input_x])
            list_y.append([(np.arange(2)==i).astype(np.integer) for i in label_y])
        list_url.append(url)
    return [np.array(list_x),np.array(list_y),list_url]
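# The transpose trick in padding(): pad_sequences pads each inner sequence to
# max_len, so the (elements, 11) matrix is flipped to (11, elements), padded
# to (11, max_len), then flipped back to (max_len, 11). Labels are padded
# with -1, which one-hot encodes to [0,0] and so marks the padded positions.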
if __name__=="__main__":
    #dumpLinkContent()
    #getAllData()
    #filter()
    #data = padding(load("source_11input.pk"))
    #save(data,"source_11input_padding.pk")
    # getRule_A_Date takes the driver as its first argument; acquire and
    # release it here the same way encodeInput_byJS does
    browser = hd.getdriver()
    try:
        getRule_A_Date(browser, url="http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm", content_xpath='//*[@class="yaowen_list"]')
    finally:
        hd.adddriver(browser)