123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700 |
'''
Created on 2019-08-12
@author: User
'''
- import module.htmlDrawing as hd
- import math
- import numpy as np
- from module.Utils import *
- from keras.preprocessing.sequence import pad_sequences
- import re
# JavaScript payload executed in the page (prepended with scripts_common, which
# supplies helpers such as check(), getOffsetLeft/Top(), getListXpath() and a
# custom Set implementation).  It walks the DOM computing per-node statistics
# (interactive-tag counts, width/height variance, text/punctuation/date counts,
# depth) and returns, for every element larger than 100x100 px, an 11-value
# feature vector, a 0/1 label derived from the URL set passed in arguments[0]
# (URLs joined by the separator "比地"), the node's innerHTML and its xpath list.
script_content = '''
function label(node,set_url){
    var node_flag = check(node,set_url);
    var child_flag = false;
    if(node.childNodes!=null){
        for(var i=0;i<node.childNodes.length;i++){
            var child = node.childNodes[i];
            if(check(child,set_url)){
                child_flag = true;
            }
        }
    }
    if(node_flag && !child_flag){
        return 1;
    }else{
        return 0;
    }
}
function statisticIframe(nodes){
    var counts_communicateTags = 0;
    for(var i=0;i<nodes.length;i++){
        child = nodes[i]
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                counts_communicateTags += 1;
            }
            if(child.tagName.toLowerCase()=="iframe"){
                if(child.contentWindow.document!=null){
                    counts_communicateTags += statisticIframe(child.contentWindow.document.all);
                }
            }
        }
    }
    return counts_communicateTags;
}
function statistic(node,deepth){
    if(node.childNodes==null){
        node.counts_communicateTags = 0;
        node.counts_tag = 0;
        node.entropy_width = 0;
        node.entropy_height = 0;
        return node.counts_communicateTags;
    }
    node.counts_communicateTags = 0;
    node.counts_tag = 0;
    var set_tag = new Set();
    var list_width = []
    var list_height = []
    for(var i=0;i<node.childNodes.length;i++){
        child = node.childNodes[i];
        //删除标签
        /*
        if (child.tagName!=null){
            if (child.tagName.toLowerCase() in {head:"",script:"",meta:"",link:"",style:""} || child.nodeType==8 ){
                node.removeChild(child);
                continue;
            }
        }
        */
        if(child.offsetWidth){
            list_width.push(child.offsetWidth);
        }
        if(child.offsetHeight){
            list_height.push(child.offsetHeight);
        }
        node.counts_tag += 1;
        if (child.tagName!=null){
            set_tag.add(child.tagName.toLowerCase())
            if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                node.counts_communicateTags += 1;
            }
        }
        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
            node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
        }else{
            node.counts_communicateTags += statistic(child,deepth+1);
        }*/
        node.counts_communicateTags += statistic(child,deepth+1);
    }
    node.counts_tagType = set_tag.size();
    var sum_width = 0;
    var sum_height = 0;
    var avg_width = 0;
    var avg_height = 0;
    var entropy_width = 0;
    var entropy_height = 0;
    if(list_width.length>0){
        for(var i=0;i<list_width.length;i++){
            sum_width += list_width[i];
        }
        for(var i=0;i<list_height.length;i++){
            sum_height += list_height[i];
        }

        avg_width = sum_width/list_width.length;
        avg_height = sum_height/list_height.length;

        for(var i=0;i<list_width.length;i++){
            entropy_width += Math.pow(list_width[i]-avg_width,2);
        }
        for(var i=0;i<list_height.length;i++){
            entropy_height += Math.pow(list_height[i]-avg_height,2);
        }

        entropy_width /= list_width.length;
        entropy_height /= list_height.length;
    }

    entropy_width = entropy_width>1000?1000:entropy_width;
    entropy_height = entropy_height>1000?1000:entropy_height;
    node.entropy_width = entropy_width;
    node.entropy_height = entropy_height;

    var innertext = node.innerText;
    if(innertext){
        var pattern_time = /([^\d]?(\d{4}|\d{2})\s*[-\/::年.]\s*\d{1,2}\s*[-\/::月.]\s*\d{1,2}[^\d]?)|([^\d]?\d{2,4}\s*[-\/月年]\s*\d{1,2}[^\d]?)/
        var text = innertext.replace(/\s/g,'');
        //var text = innertext;
        node.counts_text = text.length;
        var punc = text.match(/;|,|。|:|、/g);
        var lines = innertext.match(/.{10}\\n/g);
        var times = innertext.match(pattern_time);
        if(lines){
            node.counts_lines = lines.length;
        }else{
            node.counts_lines = 0;
        }
        if(punc){
            node['counts_punctuations']= punc.length;
        }else{
            node.counts_punctuations = 0;
        }
        if(times){
            node.counts_times = times.length;
        }else{
            node.counts_times = 0;
        }

    }else{
        node.counts_lines = 0;
        node.counts_text = 0;
        node.counts_punctuations=0;
        node.counts_times = 0;
    }
    node.deepth = deepth;
    return node.counts_communicateTags;
}
function search(str_url){
    statistic(document,1);
    var objs = document.all;
    var set_url = new Set();
    list_url = str_url.split("比地");
    for(var i=0;i<list_url.length;i++){
        if(list_url[i]!=""){
            set_url.add(list_url[i]);
        }
    }
    var data = new Array();
    for(var i=0;i<objs.length;i++){
        obj = objs[i];
        if (obj.offsetWidth>100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
            _item = new Array();
            _item.push(getOffsetLeft(obj),getOffsetTop(obj),obj.offsetWidth,obj.offsetHeight,obj.deepth,obj.counts_text,obj.counts_times,obj.counts_tagType,obj.counts_tag,obj.entropy_width,obj.entropy_height)
            data.push([_item,label(obj,set_url),obj.innerHTML,getListXpath(obj,new Array())])
        }
    }
    return(data);
}
return search(arguments[0])
'''
# JavaScript payload executed in the page (prepended with scripts_common, which
# supplies findElements_byXpath(), getXpath(), clustering_turnPage() and a
# custom Set with add()/contains()/dataStore).  Given a content-area xpath in
# arguments[0], it collects the xpaths of non-pagination <a> links and of
# date-bearing leaf nodes, clusters each xpath list by single-numeric-index
# similarity, and returns [clustered_a, clustered_date, hrefs] or null.
script_get_A_Date = '''
function is_similar(source,target){
    var diff_index = -1;
    var source_split = source.split(/(\d+)/)
    var target_split = target.split(/(\d+)/)
    if(source_split.length==target_split.length){
        var diff_count = 0;
        for(var i=0;i<source_split.length;i++){
            if(source_split[i]!=target_split[i]){
                if(diff_index==-1){
                    if(source_split[i].search(/^\d+$/)>=0 && target_split[i].search(/^\d+$/)>=0){
                        diff_index = i;
                    }else{
                        //不同的部分一定要是数字
                        return -1;
                    }
                }
                diff_count += 1;
            }
        }
        if(diff_count==1){
            return diff_index;
        }else{
            return -1;
        }
    }else{
        return -1;
    }
}
function getNode_listContent(xpath){
    /*
    var objs = document.all;
    for(var i=0;i<objs.length;i++){
        var obj = objs[i];
        if(obj!=null && getXpath(obj,[])==xpath){
            return obj;
        }
    }
    return null;
    */
    var objs = findElements_byXpath(xpath);
    if(objs.length>0){
        return objs[0];
    }
    return null;
}
function statistic_time(node,_array){
    var pattern_time = /([^\d]?(\d{4}|\d{2})\s*[-\/::年.]\s*\d{1,2}\s*[-\/::月.]\s*\d{1,2}[^\d]?)|([^\d]?\d{2,4}\s*[-\/月年]\s*\d{1,2}[^\d]?)/
    var _find_flag = false;
    if (node.childNodes==null){
    }else{
        for(var i=0;i<node.childNodes.length;i++){
            var childNode = node.childNodes[i];
            var _innerText = childNode.innerText;
            if(childNode!=null && childNode.tagName!=null && childNode.tagName.toLowerCase()=="script"){
                continue;
            }
            if (_innerText!=null && _innerText.search(pattern_time)>=0){
                statistic_time(childNode,_array);
                _find_flag = true;
            }
        }
    }
    if (!_find_flag){
        _array.push(getXpath(node,["tr","li"],true));
    }
    return _array;
}
function padding_href(href){
    var baseUrl = window.location.href;
    var baseUrl_split = baseUrl.split("/");
    var join_flag = true;
    var href_padded = "";
    var level_nums = 1;
    var filename = "";
    if(href==null){
        join_flag = false;
    }else if(href.indexOf("javascript")>-1){
        join_flag = false;
    }else if(href.indexOf("http")>-1){
        join_flag = false;
        href_padded = href;
    }else if(href.indexOf("./")==0){
        filename = href.substring(2);
    }else if(href.indexOf("../")==0){
        level_nums ++;
        _substr = href.substring(3)
        while(true){
            if(_substr.indexOf("../")==0){
                level_nums ++;
                _substr = _substr.substring(3);
            }else{
                filename = _substr;
                break;
            }
        }
    }else if(href.indexOf("./")==0){
        level_nums = baseUrl_split.length-3;
        filename = href.substring(1);
    }else if(href.indexOf("?")==0){
        _href = baseUrl.split("?")[0]+href;
        return _href;
    }else{
        filename = href;
    }
    if(join_flag){
        for(var i=0;i<baseUrl_split.length-level_nums;i++){
            href_padded += baseUrl_split[i]+"/";
        }
        href_padded += filename;
    }
    return href_padded;
}
function statistic_A(node){
    var list_a = node.getElementsByTagName("a");
    var clustered_turnPage = clustering_turnPage();
    var array_xpath_turnPage = new Set();
    for(var i=0;i<clustered_turnPage.length;i++){
        array_xpath_turnPage.add(padding_href(clustered_turnPage[i][0].href));
    }
    var set_aXpath = new Set();
    var set_href = new Set();
    for(var i=0;i<list_a.length;i++){
        _href = padding_href(list_a[i].href);
        var is_turnPage = false;
        _xpath = getXpath(list_a[i],["tr","li"],true);
        if(array_xpath_turnPage.contains(_href)){
            is_turnPage = true;
        }
        if(!is_turnPage){
            set_aXpath.add(_xpath);
            if(_href!=""){
                set_href.add(_href);
            }

        }
    }
    return [set_aXpath.dataStore,set_href.dataStore];
}
function similar_all(_xpath,array_xpath){
    var similar_index = -1;
    for(var h=0;h<array_xpath.length;h++){
        diff_index = is_similar(_xpath,array_xpath[h]);
        if( similar_index>-1 && similar_index!=diff_index){
            return -1;
        }
        similar_index = diff_index;
        if(diff_index<=-1){
            return -1;
        }
    }
    return similar_index;
}
function clustering_xpath(array_xpath){
    var array_class = new Array();
    for(var i=0;i<array_xpath.length;i++){
        for(var j=0;j<array_class.length;j++){
            //与此类中所有xpath都要一样
            var diff_index = similar_all(array_xpath[i],array_class[j][1])
            if(diff_index>-1){
                if(array_class[j][0].indexOf(diff_index)==-1){
                    array_class[j][0].push(diff_index);
                }
                if(array_class[j][1].indexOf(array_xpath[i])<0){
                    array_class[j][1].push(array_xpath[i]);
                }

            }
        }
        array_class.push([[],[array_xpath[i]]]);
    }
    var _max_length = 0;
    var _max_index = -1;
    for(var i=0;i<array_class.length;i++){
        if(array_class[i][1].length>_max_length){
            _max_length = array_class[i][1].length;
            _max_index = i;
        }
    }
    return array_class[_max_index];
}
function search(content_xpath){
    try{
        content_node = getNode_listContent(content_xpath) //获取列表页标签节点
        if(content_node!=null){
            var array_a_href = statistic_A(content_node);
            var array_a = array_a_href[0];
            var array_href = new Array();
            var array_date = new Array();
            statistic_time(content_node,array_date);
            var _clustered_a = clustering_xpath(array_a);
            var _clustered_date = clustering_xpath(array_date);
            for(var i=0;i<array_a.length;i++){
                if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
                    array_href.push(array_a_href[1][i]);
                }
            }
            return [_clustered_a,_clustered_date,array_href]
        }
        return null;
    }
    catch(e){
        return null
    }
}
return search(arguments[0]);
'''
def dealWithScriptOut(data):
    """
    Normalize the raw rows returned by ``script_content``.

    Each row of *data* is ``[features, label, innerHTML, xpath]`` where
    *features* is an 11-value numeric list (some entries may be None/NaN).

    Cleans None/NaN features to -1, sorts rows by element area
    (width*height, features[2]*features[3]) descending, and scales each
    feature column by a per-column divisor built from the column maxima
    (width/height maxima reused for the left/top offsets, fixed divisors
    10 and 20 for the counts_tagType/counts_tag columns).

    Returns ``(input_x, list_label, list_inner, list_xpath)`` or ``None``
    when *data* is empty.
    """
    # Replace missing feature values in place so max/scaling below are safe.
    for row in data:
        features = row[0]
        for i, value in enumerate(features):
            if value is None or math.isnan(value):
                features[i] = -1
    # Largest (by rendered area) elements first.
    data.sort(key=lambda x: x[0][2] * x[0][3], reverse=True)
    list_input = [row[0] for row in data]
    list_label = [row[1] for row in data]
    list_inner = [row[2] for row in data]
    list_xpath = [row[3] for row in data]
    if not list_input:
        return None
    the_max = np.max(list_input, axis=0)
    # Avoid division by zero/negative maxima.
    the_max = np.array([x if x > 0 else 1 for x in the_max])
    # Per-column divisors: [w,h] maxima reused for the offset columns,
    # fixed 10/20 for the two count columns at indices 7 and 8.
    the_max = np.array(list(the_max)[2:4] * 2 + list(the_max)[4:7] + [10, 20] + list(the_max)[9:])
    input_x = np.array(list_input / the_max)
    return input_x, list_label, list_inner, list_xpath
-
def encodeInput_byJS(url, str_href):
    """
    Load *url* in a pooled browser, run the feature-extraction script with
    *str_href* (candidate URLs joined by "比地") and return
    ``(input_x, labels)`` when exactly one element was labelled positive,
    otherwise ``None``.  Errors are logged and swallowed (best-effort).
    """
    browser = None
    try:
        browser = hd.getdriver()
        debug("get driver")
        hd.loadPage(browser, url)
        data = get_js_rs(browser, scripts_common + script_content, str_href)
        deal_data = dealWithScriptOut(data)
        if deal_data is None:
            return None
        input_x, list_label, list_inner, list_xpath = deal_data
        # A usable training sample has exactly one positively-labelled node.
        if np.sum(list_label) == 1:
            return input_x, np.array(list_label)
        return None
    except Exception as e:
        log(str(e))
    finally:
        # Bug fix: if getdriver() itself raised, `browser` was unbound and the
        # old finally-block raised NameError, masking the original error.
        if browser is not None:
            hd.adddriver(browser)
            debug("release driver")
    return None
-
def getInput_byJS(browser, url, str_href):
    """
    Run the feature-extraction script on an already-loaded *browser* page and
    return ``([input_x with batch dim], innerHTMLs, xpaths)``, or ``None``
    when the script yields no usable rows or an error occurs (logged).
    """
    try:
        script_result = get_js_rs(browser, scripts_common + script_content, str_href)
        processed = dealWithScriptOut(script_result)
        if processed is None:
            return None
        feature_matrix, _, inner_htmls, xpaths = processed
        return [np.expand_dims(feature_matrix, 0)], inner_htmls, xpaths
    except Exception as e:
        error(str(e))
        return None
-
def getRule_A_Date(browser, url, content_xpath):
    """
    Derive list-page extraction rules (link xpaths and date xpaths) for *url*.

    Runs ``script_get_A_Date`` first with *content_xpath*, then falls back to
    "/html".  The script returns ``[clustered_a, clustered_date, hrefs]``
    where each cluster is ``[diff_indices, xpath_list]``.

    Returns ``None`` when nothing is found, otherwise
    ``(dict_Rule_A_Date, list_hrefs)``; ``dict_Rule_A_Date["flag"]`` is set
    False (with an error message) when the A/Date counts disagree or an
    xpath cannot be split on its numeric indices.
    """
    def appendXpath(list_xpath, _xpath):
        # First fragment is kept whole; later fragments are prefixed with the
        # tail segment of the previous one so fragments chain together.
        if len(list_xpath)==0:
            list_xpath.append(_xpath)
        else:
            list_xpath.append(list_xpath[-1].split("/")[-1]+"/"+_xpath)

    dict_Rule_A_Date = {"listpage_A":None,
                        "listpage_Date":None,
                        "flag":True,
                        "hasDrew":False}

    list_a = None
    # Try the supplied content xpath first, then the whole document.
    for _content_xpath in [content_xpath,"/html"]:
        data = get_js_rs(browser, scripts_common+script_get_A_Date,_content_xpath)
        if data is None:
            log("A_Date not found with xpath:"+_content_xpath)
            continue
        # For the fallback xpath only accept a result whose A and Date
        # cluster sizes already match.
        if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
            list_a = data[0]
            list_date = data[1]
            list_hrefs = data[2]
        if list_a is not None and len(list_a[1])==len(list_date[1]):
            log('list_a is not None and len(list_a[1])==len(list_date[1])')
            break
        else:
            log("different length of A and Date:with xpath:"+_content_xpath)
    if list_a is None:
        log("A_Date not found with all xpath")
        return None;
    log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
    log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
    log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
    if len(list_a[1])!=len(list_date[1]):
        dict_Rule_A_Date["flag"] = False
        add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
        return dict_Rule_A_Date,list_hrefs
    else:
        # ---- build the link (A) rule from the first clustered xpath ----
        list_diffindex = list_a[0]
        _xpath = list_a[1][0]
        listpage_a = []
        begin = 0
        list_diffindex.sort(key=lambda x:x)
        _jump_flag = False
        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
        # Split the xpath on numeric runs; diff indices point at the numbers
        # that vary between list items (e.g. the [n] predicates).
        _xpath_split = re.split("(\d+)",_xpath)
        for i in range(len(list_diffindex)):
            _index = list_diffindex[i]
            # A varying number must sit inside a [...] predicate.
            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
                dict_Rule_A_Date["flag"] = False
                return dict_Rule_A_Date,list_hrefs
            else:
                if i==0:
                    appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
                    begin = _index+1
                elif i<len(list_diffindex):
                    # NOTE(review): this condition is always true here, so the
                    # branch below is unreachable — kept as in the original.
                    appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
                    begin = _index+1
                else:
                    appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
                if i==len(list_diffindex)-1:
                    # Append the fixed tail after the last varying index.
                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
                    if _group is not None:
                        appendXpath(listpage_a,_group.group(1))
        for i in range(len(listpage_a)):
            # Overly deep fragments are generalized in the browser.
            if len(listpage_a[i].split("/"))>6:
                listpage_a[i] = get_js_rs(browser, scripts_replaceXpath,listpage_a[i])
        dict_Rule_A_Date["listpage_A"] = listpage_a
        # ---- build the date rule: same procedure applied to list_date ----
        list_diffindex = list_date[0]
        _xpath = list_date[1][0]
        listpage_date = []
        begin = 0
        list_diffindex.sort(key=lambda x:x)
        _jump_flag = False
        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
        _xpath_split = re.split("(\d+)",_xpath)
        for i in range(len(list_diffindex)):
            _index = list_diffindex[i]
            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
                dict_Rule_A_Date["flag"] = False
                return dict_Rule_A_Date,list_hrefs
            else:
                if i==0:
                    appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
                    begin = _index+1
                elif i<len(list_diffindex):
                    appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
                    begin = _index+1
                else:
                    appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
                if i==len(list_diffindex)-1:
                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
                    if _group is not None:
                        appendXpath(listpage_date,_group.group(1))
        for i in range(len(listpage_date)):
            if len(listpage_date[i].split("/"))>6:
                listpage_date[i] = get_js_rs(browser, scripts_replaceXpath,listpage_date[i])
        dict_Rule_A_Date["listpage_Date"] = listpage_date
        return dict_Rule_A_Date,list_hrefs

    # NOTE(review): unreachable — both branches above return; kept as-is.
    return None
-
def dumpLinkContent():
    """
    Dump (page_link, page_content) rows from BXKC.DETAIL_CONTENT_HTML
    (page_type=0) into ``Link_Content.pk``, keeping only rows that can be
    pickled.
    """
    def trytosave(d):
        # Probe that the row is picklable; unpicklable rows are skipped.
        try:
            save(d, "1.pk")
            return 1
        except Exception:
            return 0
    import cx_Oracle
    # NOTE(review): hard-coded DB credentials — should come from config/env.
    conn = cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')
    cursor = conn.cursor()
    try:
        sql = " select page_link,page_content from BXKC.DETAIL_CONTENT_HTML where page_type=0 "
        cursor.execute(sql)
        data = []
        while True:
            try:
                rows = cursor.fetchmany(10)
            except Exception as e:
                # Bug fix: a failing fetch used to print and retry forever
                # (infinite loop); now we stop and keep what we have.
                print(e)
                break
            if not rows:
                break
            for row in rows:
                if trytosave(row) == 1:
                    data.append(row)
        save(data, "Link_Content.pk")
    finally:
        # Bug fix: cursor/connection were never closed (resource leak).
        cursor.close()
        conn.close()
-
def getAllData():
    """
    Encode every (link, content) pair previously dumped to Link_Content.pk,
    periodically checkpointing to temp_data.pk, and save the final list of
    ``[x, y, link]`` samples to data_done.pk.  Returns the sample list.
    """
    all_data = load("Link_Content.pk")
    temp_file = "temp_data.pk"
    encoded_rows = []
    positives = 0
    total = len(all_data)
    for index, row in enumerate(all_data, start=1):
        print(str(positives) + "/" + str(index) + "/" + str(total), row[0])
        # Periodic checkpoint so a crash does not lose all progress.
        if index % 100 == 0:
            save(encoded_rows, temp_file)
        encoded = encodeInput_byJS(row[0], row[1])
        if encoded:
            positives += 1
            x, y = encoded
            encoded_rows.append([x, y, row[0]])
        else:
            print("None")
    save(encoded_rows, "data_done.pk")
    return encoded_rows
def filter():
    """
    Load checkpointed samples from temp_data.pk, keep only those with fewer
    than 100 feature rows, print simple statistics and save the filtered
    list to source_11input.pk.  (Name shadows the builtin; kept for
    compatibility with existing callers.)
    """
    samples = load("temp_data.pk")
    print(samples[0])
    samples.sort(key=lambda item: item[2])
    lengths = [len(sample[0]) for sample in samples]
    kept = [sample for sample in samples if len(sample[0]) < 100]
    print(max(lengths))
    print(len(samples))
    print(len(kept))
    save(kept, "source_11input.pk")
-
def padding(all_data, pad=True):
    """
    Convert loaded ``[input_x, label_y, url]`` samples into model-ready arrays.

    When *pad* is True, feature matrices and label sequences are padded /
    truncated ("post") to the longest label sequence; labels are one-hot
    encoded over 2 classes, so padded positions (value -1) become [0, 0].
    When *pad* is False, samples are wrapped unpadded (assumes equal lengths
    if the result is to be a dense array).

    Returns ``[np.array(list_x), np.array(list_y), list_url]``.
    """
    max_len = np.max([len(data[1]) for data in all_data])
    print("max_len", max_len)
    list_x = []
    list_y = []
    list_url = []
    for data in all_data:
        input_x = data[0]
        label_y = data[1]
        url = data[2]
        if pad:
            # pad_sequences works on the first axis, so transpose features
            # to (features, time), pad, and transpose back.
            input_x = np.transpose(pad_sequences(np.transpose(input_x, (1, 0)), max_len, padding="post", truncating="post", value=0, dtype="float32"), (1, 0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y], max_len, padding="post", truncating="post", value=-1)[0]
            # Bug fix: np.integer is an abstract scalar class and is not a
            # valid dtype in recent NumPy; use a concrete dtype instead.
            list_y.append([(np.arange(2) == i).astype(np.int64) for i in label_y])
        else:
            list_x.append([input_x])
            list_y.append([(np.arange(2) == i).astype(np.int64) for i in label_y])
        list_url.append(url)
    return [np.array(list_x), np.array(list_y), list_url]
-
- if __name__=="__main__":
- #dumpLinkContent()
- #getAllData()
- #filter()
- #data = padding(load("source_11input.pk"))
- #save(data,"source_11input_padding.pk")
- getRule_A_Date(url="http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm",content_xpath='//*[@class="yaowen_list"]')
|