'''
Created on 2019-08-12

@author: User
'''
import module.htmlDrawing as hd
import math
import numpy as np
from module.Utils import *
from keras.preprocessing.sequence import pad_sequences
import re

script_content = '''
function label(node,set_url){
    var node_flag = check(node,set_url);
    var child_flag = false;
    if(node.childNodes!=null){
        for(var i=0;i0){
        for(var i=0;i1000?1000:entropy_width;
    entropy_height = entropy_height>1000?1000:entropy_height;
    node.entropy_width = entropy_width;
    node.entropy_height = entropy_height;
    var innertext = node.innerText;
    if(innertext){
        var pattern_time = /([^\d]?(\d{4}|\d{2})\s*[-\/::年.]\s*\d{1,2}\s*[-\/::月.]\s*\d{1,2}[^\d]?)|([^\d]?\d{2,4}\s*[-\/月年]\s*\d{1,2}[^\d]?)/
        var text = innertext.replace(/\s/g,'');
        //var text = innertext;
        node.counts_text = text.length;
        var punc = text.match(/;|,|。|:|、/g);
        var lines = innertext.match(/.{10}\\n/g);
        var times = innertext.match(pattern_time);
        if(lines){
            node.counts_lines = lines.length;
        }else{
            node.counts_lines = 0;
        }
        if(punc){
            node['counts_punctuations']= punc.length;
        }else{
            node.counts_punctuations = 0;
        }
        if(times){
            node.counts_times = times.length;
        }else{
            node.counts_times = 0;
        }
    }else{
        node.counts_lines = 0;
        node.counts_text = 0;
        node.counts_punctuations=0;
        node.counts_times = 0;
    }
    node.deepth = deepth;
    return node.counts_communicateTags;
}
function search(str_url){
    statistic(document,1);
    var objs = document.all;
    var set_url = new Set();
    list_url = str_url.split("比地");
    for(var i=0;i100 && obj.offsetHeight>100 && obj.parentNode.tagName!=null && obj.childNodes.length>0){
        _item = new Array();
        _item.push(getOffsetLeft(obj),getOffsetTop(obj),obj.offsetWidth,obj.offsetHeight,obj.deepth,obj.counts_text,obj.counts_times,obj.counts_tagType,obj.counts_tag,obj.entropy_width,obj.entropy_height)
        data.push([_item,label(obj,set_url),obj.innerHTML,getListXpath(obj,new Array())])
    }
    }
    return(data);
}
return search(arguments[0])
'''

script_get_A_Date = '''
function is_similar(source,target){
    var diff_index = -1;
    var source_split = source.split(/(\d+)/)
    var target_split = target.split(/(\d+)/)
    if(source_split.length==target_split.length){
        var diff_count = 0;
        for(var i=0;i=0 && target_split[i].search(/^\d+$/)>=0){
            diff_index = i;
        }else{
            //the differing parts must be digits
            return -1;
        }
        }
        diff_count += 1;
        }
        }
        if(diff_count==1){
            return diff_index;
        }else{
            return -1;
        }
    }else{
        return -1;
    }
}
function getNode_listContent(xpath){
    /*
    var objs = document.all;
    for(var i=0;i0){
        return objs[0];
    }
    return null;
}
function statistic_time(node,_array){
    var pattern_time = /([^\d]?(\d{4}|\d{2})\s*[-\/::年.]\s*\d{1,2}\s*[-\/::月.]\s*\d{1,2}[^\d]?)|([^\d]?\d{2,4}\s*[-\/月年]\s*\d{1,2}[^\d]?)/
    var _find_flag = false;
    if (node.childNodes==null){
    }else{
        for(var i=0;i=0){
            statistic_time(childNode,_array);
            _find_flag = true;
        }
        }
    }
    if (!_find_flag){
        _array.push(getXpath(node,["tr","li"],true));
    }
    return _array;
}
function padding_href(href){
    var baseUrl = window.location.href;
    var baseUrl_split = baseUrl.split("/");
    var join_flag = true;
    var href_padded = "";
    var level_nums = 1;
    var filename = "";
    if(href==null){
        join_flag = false;
    }else if(href.indexOf("javascript")>-1){
        join_flag = false;
    }else if(href.indexOf("http")>-1){
        join_flag = false;
        href_padded = href;
    }else if(href.indexOf("./")==0){
        filename = href.substring(2);
    }else if(href.indexOf("../")==0){
        level_nums ++;
        _substr = href.substring(3)
        while(true){
            if(_substr.indexOf("../")==0){
                level_nums ++;
                _substr = _substr.substring(3);
            }else{
                filename = _substr;
                break;
            }
        }
    }else if(href.indexOf("./")==0){
        level_nums = baseUrl_split.length-3;
        filename = href.substring(1);
    }else if(href.indexOf("?")==0){
        _href = baseUrl.split("?")[0]+href;
        return _href;
    }else{
        filename = href;
    }
    if(join_flag){
        for(var i=0;i-1 && similar_index!=diff_index){
            return -1;
        }
        similar_index = diff_index;
        if(diff_index<=-1){
            return -1;
        }
    }
    return similar_index;
}
function clustering_xpath(array_xpath){
    var array_class = new Array();
    for(var i=0;i-1){
        if(array_class[j][0].indexOf(diff_index)==-1){
            array_class[j][0].push(diff_index);
        }
        if(array_class[j][1].indexOf(array_xpath[i])<0){
            array_class[j][1].push(array_xpath[i]);
        }
    }
    }
    array_class.push([[],[array_xpath[i]]]);
    }
    var _max_length = 0;
    var _max_index = -1;
    for(var i=0;i_max_length){
        _max_length = array_class[i][1].length;
        _max_index = i;
    }
    }
    return array_class[_max_index];
}
function search(content_xpath){
    try{
        content_node = getNode_listContent(content_xpath)  //get the content node of the list page
        if(content_node!=null){
            var array_a_href = statistic_A(content_node);
            var array_a = array_a_href[0];
            var array_href = new Array();
            var array_date = new Array();
            statistic_time(content_node,array_date);
            var _clustered_a = clustering_xpath(array_a);
            var _clustered_date = clustering_xpath(array_date);
            for(var i=0;i=0){
                array_href.push(array_a_href[1][i]);
            }
            }
            return [_clustered_a,_clustered_date,array_href]
        }
        return null;
    }
    catch(e){
        return null
    }
}
return search(arguments[0]);
'''

def dealWithScriptOut(data):
    list_input = []
    list_label = []
    list_inner = []
    list_xpath = []
    #clean nan
    for index in range(len(data)):
        for i in range(len(data[index][0])):
            if data[index][0][i] is None or math.isnan(data[index][0][i]):
                data[index][0][i] = -1
    #order by area (offsetWidth*offsetHeight), largest first
    data.sort(key=lambda x:x[0][2]*x[0][3],reverse=True)
    for item in data:
        list_input.append(item[0])
        list_label.append(item[1])
        list_inner.append(item[2])
        list_xpath.append(item[3])
    #print(len(data))
    if len(list_input)>0:
        the_max = np.max(list_input,axis=0)
        the_max = np.array([x if x>0 else 1 for x in the_max])
        the_max = np.array((list(the_max)[2:4]*2+list(the_max)[4:7]+[10,20]+list(the_max)[9:]))
        input_x = np.array(list_input/the_max)
        return input_x,list_label,list_inner,list_xpath
    else:
        return None

def encodeInput_byJS(url,str_href):
    try:
        browser = hd.getdriver()
        debug("get driver")
        hd.loadPage(browser, url)
        # data = browser.execute_script(scripts_common+script_content,str_href)
        data = get_js_rs(browser, scripts_common+script_content,str_href)
        deal_data = dealWithScriptOut(data)
        if deal_data is None:
            return None
        input_x,list_label,list_inner,list_xpath = deal_data
        if np.sum(list_label)==1:
            return input_x,np.array(list_label)
        else:
            return None
    except Exception as e:
        log(str(e))
    finally:
        hd.adddriver(browser)
        debug("release driver")
    return None

def getInput_byJS(browser,url,str_href):
    try:
        # hd.loadPage(browser,url)
        # data = browser.execute_script(scripts_common+script_content,str_href)
        data = get_js_rs(browser, scripts_common+script_content,str_href)
        deal_data = dealWithScriptOut(data)
        if deal_data is None:
            return None
        else:
            input_x,_,list_inner,list_xpath = deal_data
            return [np.expand_dims(input_x,0)],list_inner,list_xpath
    except Exception as e:
        error(str(e))
    return None

def getRule_A_Date(browser, url,content_xpath):
    def appendXpath(list_xpath,_xpath):
        if len(list_xpath)==0:
            list_xpath.append(_xpath)
        else:
            list_xpath.append(list_xpath[-1].split("/")[-1]+"/"+_xpath)

    dict_Rule_A_Date = {"listpage_A":None,
                        "listpage_Date":None,
                        "flag":True,
                        "hasDrew":False}
    # try:
    # browser = hd.getdriver()
    # debug("get driver")
    # hd.loadPage(browser,url)
    list_a = None
    for _content_xpath in [content_xpath,"/html"]:
        # data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
        data = get_js_rs(browser, scripts_common+script_get_A_Date,_content_xpath)
        if data is None:
            log("A_Date not found with xpath:"+_content_xpath)
            continue
        if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
            list_a = data[0]
            list_date = data[1]
            list_hrefs = data[2]
            if list_a is not None and len(list_a[1])==len(list_date[1]):
                log('list_a is not None and len(list_a[1])==len(list_date[1])')
                break
            else:
                log("different length of A and Date:with xpath:"+_content_xpath)
    if list_a is None:
        log("A_Date not found with all xpath")
        return None
    log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
    log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
    log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
    if len(list_a[1])!=len(list_date[1]) and len(list_hrefs)>2 and len(set(list_hrefs[0])-set(list_hrefs[1]))>1:
        dict_Rule_A_Date["flag"] = False
        add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
        return dict_Rule_A_Date,list_hrefs
    else:
        list_diffindex = list_a[0]
        _xpath = list_a[1][0]
        listpage_a = []
        begin = 0
        list_diffindex.sort(key=lambda x:x)
        _jump_flag = False
        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
        _xpath_split = re.split("(\d+)",_xpath)
        for i in range(len(list_diffindex)):
            _index = list_diffindex[i]
            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
                dict_Rule_A_Date["flag"] = False
                return dict_Rule_A_Date,list_hrefs
            else:
                if i==0:
                    appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
                    begin = _index+1
                elif i6:
                    # listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
                    listpage_a[i] = get_js_rs(browser, scripts_replaceXpath,listpage_a[i])
        dict_Rule_A_Date["listpage_A"] = listpage_a
        list_diffindex = list_date[0]
        _xpath = list_date[1][0]
        listpage_date = []
        begin = 0
        list_diffindex.sort(key=lambda x:x)
        _jump_flag = False
        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
        _xpath_split = re.split("(\d+)",_xpath)
        for i in range(len(list_diffindex)):
            _index = list_diffindex[i]
            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
                dict_Rule_A_Date["flag"] = False
                return dict_Rule_A_Date,list_hrefs
            else:
                if i==0:
                    appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
                    begin = _index+1
                elif i6:
                    # listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])
                    listpage_date[i] = get_js_rs(browser, scripts_replaceXpath,listpage_date[i])
        dict_Rule_A_Date["listpage_Date"] = listpage_date
        return dict_Rule_A_Date,list_hrefs
    # except Exception as e:
    #     error(str(e))
    # finally:
    #     # hd.adddriver(browser)
    #     # debug("release driver")
    #     log('getRule_A_Date done')
    return None

def dumpLinkContent():
    def trytosave(d):
        try:
            save(d,"1.pk")
            return 1
        except Exception as e:
            return 0

    import cx_Oracle as cx_Oracle
    conn=cx_Oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')  #connect to the database
    cursor=conn.cursor()
    sql = " select page_link,page_content from BXKC.DETAIL_CONTENT_HTML where page_type=0 "
    cursor.execute(sql)
    data = []
    while(True):
        try:
            rows = cursor.fetchmany(10)
            if not rows:
                break
            for row in rows:
                if trytosave(row)==1:
                    data.append(row)
        except Exception as e:
            print(e)
    save(data,"Link_Content.pk")

def getAllData():
    all_data = load("Link_Content.pk")
    data = []
    temp_file ="temp_data.pk"
    count = 0
    label = 0
    data_len = len(all_data)
    for row in all_data:
        count += 1
        print(str(label)+"/"+str(count)+"/"+str(data_len),row[0])
        #encode = encodeInput(row[0], row[1])
        if count%100==0:
            save(data,temp_file)
        encode = encodeInput_byJS(row[0], row[1])
        if encode:
            label += 1
            x,y = encode
            data.append([x,y,row[0]])
        else:
            print("None")
    save(data,"data_done.pk")
    return data

def filter():
    list_length = []
    data = load("temp_data.pk")
    print(data[0])
    data.sort(key = lambda x:x[2])
    new_data = []
    for item in data:
        list_length.append(len(item[0]))
        if len(item[0])<100:
            new_data.append(item)
    print(max(list_length))
    print(len(data))
    print(len(new_data))
    save(new_data,"source_11input.pk")

def padding(all_data,pad=True):
    max_len = np.max([len(data[1]) for data in all_data])
    print("max_len",max_len)
    #max_len = 200
    list_x = []
    list_y = []
    list_url = []
    for data in all_data:
        input_x = data[0]
        label_y = data[1]
        url = data[2]
        if pad:
            input_x = np.transpose(pad_sequences(np.transpose(input_x,(1,0)), max_len,padding="post", truncating="post", value=0,dtype="float32"),(1,0))
            list_x.append(input_x)
            label_y = pad_sequences([label_y],max_len,padding="post", truncating="post", value=-1)[0]
            #list_y.append(label_y)
            list_y.append([(np.arange(2)==i).astype(np.integer) for i in label_y])
        else:
            #input_x = np.array(input_x)
            list_x.append([input_x])
            list_y.append([(np.arange(2)==i).astype(np.integer) for i in label_y])
        list_url.append(url)
    return [np.array(list_x),np.array(list_y),list_url]

if __name__=="__main__":
    #dumpLinkContent()
    #getAllData()
    #filter()
    #data = padding(load("source_11input.pk"))
    #save(data,"source_11input_padding.pk")
    _url = "http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm"
    browser = hd.getdriver()  #getRule_A_Date expects a live webdriver; load the page first, as encodeInput_byJS does
    hd.loadPage(browser, _url)
    getRule_A_Date(browser,url=_url,content_xpath='//*[@class="yaowen_list"]')
    hd.adddriver(browser)  #release the driver
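# Illustrative driver lifecycle for getInput_byJS (a sketch, not part of the original pipeline):
# it reuses the hd.getdriver/hd.loadPage/hd.adddriver helpers already used above, and the URLs
# below are placeholders only.
#
# browser = hd.getdriver()
# try:
#     hd.loadPage(browser, "http://www.example.com/list.htm")
#     result = getInput_byJS(browser, "http://www.example.com/list.htm", "http://www.example.com/detail/1.html")
#     if result is not None:
#         list_input_x, list_inner, list_xpath = result
# finally:
#     hd.adddriver(browser)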