123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404 |
- '''
- Created on 2019年8月5日
- @author: User
- '''
- import module.htmlDrawing as hd
- import time
- from selenium.webdriver.common.action_chains import ActionChains
- import re
- from module.Utils import *
# JavaScript appended to ``scripts_common`` and executed in the page to find
# and click a paging control.  ``arguments[0]`` is the kind of control wanted
# ("nextPage", "tailPage", "lastPage"); it is compared with the type that
# ``clustering_turnPage()`` assigns to each candidate node.
#
# The script returns
#   [clicked, [pageNum regex source, pageNum xpath, pageNum jsoup],
#    [clicked-node xpath, clicked-node jsoup], message]
# where the xpath/jsoup of the clicked node are only filled in when the node
# has a real (non-empty, non-"#", non-javascript) href.
#
# Notes on this version:
#   * raw string: the regex literals contain backslashes (\s, \d, \/ ...) that
#     are invalid escape sequences in a normal Python string literal;
#   * ``pattern_tailPage`` used to end in ``.?s\s*$`` which demanded a literal
#     "s" after the label, so labels such as "最后一页" never matched; it now
#     mirrors ``pattern_nextPage`` (``.?\s*$``);
#   * loop variables are declared with ``var`` so they no longer leak into the
#     page's global scope;
#   * the parent node is checked for null before it is inspected.
# ``getXpath``, ``getJsoup`` and ``clustering_turnPage`` are expected to come
# from ``scripts_common``.
script = r'''
function click_bt(type_click){
    var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
    var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/

    var pattern_tailPage = /^\s*(最[尾末后]一?页|tail|>\|).?\s*$/
    var list_cluster = clustering_turnPage();
    var pageNum = null;
    var pageNum_jsoup = null;
    var _node_xpath = null;
    var _node_jsoup = null;
    var _node_click = null;
    var click_message = '';
    var _node = null;
    var _type = null;
    var _href = null;
    var _parent = null;
    for(var i=0;i<list_cluster.length;i++){
        _node = list_cluster[i][0]
        _type = list_cluster[i][1]

        if(_node.innerText!=null){
            var _match_num = _node.innerText.match(pattern_pageNum);
            if(pageNum==null && _match_num!=null){
                /*
                for(var j=1;j<_match_num.length;j++){
                    if(_match_num[j]!=null){
                        pageNum = _match_num[j]
                    }
                }
                */
                //改为获取规则
                if(pageNum==null){
                    pageNum = getXpath(_node);
                }
                if(pageNum_jsoup==null){
                    pageNum_jsoup = getJsoup(_node);
                }
            }
        }

        if(_type==type_click){
            _parent = _node.parentNode;
            if(_node.tagName.toLowerCase() in {a:"",button:""} || _node.onclick!=null){
                //return getXpath(_node);
                _href = _node.getAttribute("href")
                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
                    if(_node_xpath==null){
                        _node_xpath = getXpath(_node);
                    }
                    if(_node_jsoup==null){
                        _node_jsoup = getJsoup(_node);
                    }

                }
                if(_href==null || _href=="" || _href=="#"){
                    click_message = '翻页链接为空或#异常';
                }
                if(_href!=null && _href.indexOf('javascript')>=0){
                    click_message = '翻页链接为javascript';
                }
                if(_node_click==null){
                    _node_click = _node;
                }

            }
            else if(_node.getAttribute("type")=='button'){
                _node_click = _node;
                click_message = '标签属性type为button的翻页';
            }
            else if(_parent!=null && _parent.tagName!=null && ((_parent.tagName.toLowerCase() in {a:"",button:""}) || _parent.onclick!=null)){
                _href = _parent.getAttribute("href")
                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
                    if(_node_xpath==null){
                        _node_xpath = getXpath(_parent);
                    }
                    if(_node_jsoup==null){
                        _node_jsoup = getJsoup(_parent);
                    }

                }
                if(_node_click==null){
                    _node_click = _parent;
                }
                click_message = '父节点为翻页链接';
            }
        }
    }
    if(_node_click!=null){
        _node_click.click();
        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
    }else{
        var _pattern = null;
        if(type_click=="nextPage"){
            _pattern = pattern_nextPage;
        }else{
            _pattern = pattern_tailPage;
        }
        var list_a = document.getElementsByTagName("a");
        for(var i=0;i<list_a.length;i++){
            _node = list_a[i];
            if(_node!=null && _node.innerText!=null && _node.innerText.match(_pattern)!=null){
                _href = _node.getAttribute("href")
                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
                    _node_xpath = getXpath(_node);
                    _node_jsoup = getJsoup(_node);
                }
                _node.click();
                click_message = '找不到翻页按钮,a标签为翻页链接';
                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
            }
        }
    }
    if(click_message==''){click_message = '最终没找到翻页按钮';}
    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
}
return click_bt(arguments[0]);
'''
# JavaScript appended to ``scripts_common`` and executed in the page to click
# the first clustered paging node that is an <a> element and whose innerText
# matches the regular expression source passed as ``arguments[0]``.
# Returns true when such a node was clicked, false otherwise.
# Variables are declared with ``var`` so nothing leaks into the page's global
# scope; ``clustering_turnPage`` is expected to come from ``scripts_common``.
script_pattern = '''
function turnpage_by_pattern(pattern_s){
    var list_cluster = clustering_turnPage();
    var pattern = new RegExp(pattern_s);
    for(var i=0;i<list_cluster.length;i++){
        var _node = list_cluster[i][0];
        if(_node!=null && _node.innerText!=null && _node.innerText.match(pattern)!=null){
            if((_node.tagName!=null && _node.tagName.toLowerCase()=="a")){
                _node.click();
                return true;
            }
        }
    }
    return false;
}
return turnpage_by_pattern(arguments[0]);
'''
def click_bt_lastPage(browser):
    """Click the "lastPage" paging control and wait briefly for navigation.

    Runs ``scripts_common + script`` in the page with the click type
    ``"lastPage"``.  When the script reports a click, the function switches
    to a newly opened window (if any) and polls up to four times, one second
    apart, until the current URL differs from the starting URL and does not
    end in ``#``.

    :param browser: Selenium WebDriver showing the list page.
    :return: whatever ``get_js_rs`` returned -- presumably the script's
        ``[clicked, pageNum info, node info, message]`` list, or ``None``
        when no result was obtained (TODO confirm against ``get_js_rs``).
    """
    _url = browser.current_url
    _window_handles = len(browser.window_handles)
    _result = get_js_rs(browser, scripts_common + script, "lastPage")
    # Guard against a missing result before indexing, as the sibling
    # click_bt_* helpers do.
    if _result is not None and _result[0]:
        if len(browser.window_handles) > _window_handles:
            switch_window(browser)
        for _ in range(4):
            current_url = browser.current_url
            # endswith() instead of [-1] so an empty URL cannot raise.
            if current_url and current_url != _url and not current_url.endswith("#"):
                return _result
            time.sleep(1)
    return _result
def click_bt_nextPage(browser):
    """Click the "nextPage" paging control and wait briefly for navigation.

    Runs ``scripts_common + script`` in the page with the click type
    ``"nextPage"`` (passing ``timeout=30`` to ``get_js_rs``).  When the
    script reports a click, the function switches to a newly opened window
    (if any) and polls up to four times, 1.5 seconds apart, until the current
    URL differs from the starting URL and does not end in ``#``.

    :param browser: Selenium WebDriver showing the list page.
    :return: whatever ``get_js_rs`` returned -- presumably the script's
        ``[clicked, pageNum info, node info, message]`` list, or ``None``
        when no result was obtained (TODO confirm against ``get_js_rs``).
    """
    _url = browser.current_url
    _window_handles = len(browser.window_handles)
    _result = get_js_rs(browser, scripts_common + script, "nextPage", timeout=30)
    if _result is not None and _result[0]:
        if len(browser.window_handles) > _window_handles:
            switch_window(browser)
        for _ in range(4):
            current_url = browser.current_url
            # endswith() instead of [-1] so an empty URL cannot raise.
            if current_url and current_url != _url and not current_url.endswith("#"):
                return _result
            time.sleep(1.5)
    return _result
def click_bt_tailPage(browser):
    """Click the "tailPage" paging control and wait briefly for navigation.

    Runs ``scripts_common + script`` in the page with the click type
    ``"tailPage"``.  When the script reports a click, the function switches
    to a newly opened window (if any) and polls up to four times, one second
    apart, until the current URL differs from the starting URL and does not
    end in ``#``.

    :param browser: Selenium WebDriver showing the list page.
    :return: whatever ``get_js_rs`` returned -- presumably the script's
        ``[clicked, pageNum info, node info, message]`` list, or ``None``
        when no result was obtained (TODO confirm against ``get_js_rs``).
    """
    _url = browser.current_url
    _window_handles = len(browser.window_handles)
    _result = get_js_rs(browser, scripts_common + script, "tailPage")
    if _result is not None and _result[0]:
        if len(browser.window_handles) > _window_handles:
            switch_window(browser)
        for _ in range(4):
            current_url = browser.current_url
            # endswith() instead of [-1] so an empty URL cannot raise.
            if current_url and current_url != _url and not current_url.endswith("#"):
                return _result
            time.sleep(1)
    return _result
def click_bt_pattern(browser,pattern):
    """Click the paging link whose text matches ``pattern`` and wait briefly.

    Runs ``scripts_common + script_pattern`` in the page; that script clicks
    the first clustered ``<a>`` node whose innerText matches the regular
    expression.  When the script reports a click, the function switches to a
    newly opened window (if any) and polls up to four times, one second
    apart, until the current URL differs from the starting URL and does not
    end in ``#``.

    :param browser: Selenium WebDriver showing the list page.
    :param pattern: regular-expression source handed to JavaScript
        ``new RegExp``, e.g. ``"^\\\\s*2\\\\s*$"`` for the page-2 link.
    :return: whatever ``get_js_rs`` returned -- truthy when a link was
        clicked, falsy (``False`` or ``None``) otherwise.
    """
    _url = browser.current_url
    _window_handles = len(browser.window_handles)
    _result = get_js_rs(browser, scripts_common + script_pattern, pattern)
    if _result:
        if len(browser.window_handles) > _window_handles:
            switch_window(browser)
        for _ in range(4):
            current_url = browser.current_url
            # endswith() instead of [-1] so an empty URL cannot raise.
            if current_url and current_url != _url and not current_url.endswith("#"):
                return _result
            time.sleep(1)
    return _result
def switch_window(browser):
    """Switch to the most recently listed window, falling back if it stays blank.

    Focus moves to the last entry of ``browser.window_handles``.  The new
    window is polled up to ten times, one second apart, while its URL is
    still ``about:blank``; if it is still blank afterwards, focus returns to
    the window that was current on entry.

    Uses ``browser.switch_to.window`` because the older
    ``browser.switch_to_window`` is deprecated and no longer exists in
    Selenium 4.

    :param browser: Selenium WebDriver with at least one open window.
    :return: None
    """
    _current_handle = browser.current_window_handle
    browser.switch_to.window(browser.window_handles[-1])
    for _ in range(10):
        if browser.current_url != "about:blank":
            return
        time.sleep(1)
    # One last look after the final sleep before giving up on the new window.
    if browser.current_url == "about:blank":
        browser.switch_to.window(_current_handle)
def getRuleOfUrl(first_url,second_url):
    """Derive a URL-concatenation paging rule from two list-page URLs.

    Both URLs are split on runs of digits.  A rule is produced when the two
    splits have the same number of parts and differ in exactly one part, and
    that part is purely numeric in both URLs: the text before it becomes
    ``listpage_turn_before``, the text after it ``listpage_turn_after``, the
    number in ``first_url`` ``listpage_pageBegin`` and the absolute
    difference of the two numbers ``listpage_pageStep``.

    Special case: when the splits differ in length and the last path segment
    of ``second_url`` is exactly ``index_2.html``, the rule
    ``<dir>/index_`` + page + ``.html`` starting at page 2 is returned.

    In every other case ``add_err_msg`` is called with "#翻页链接不匹配#" and
    ``flag`` is set to ``False``.

    :param first_url: URL of the earlier list page (str).
    :param second_url: URL of the later list page (str).
    :return: dict with keys ``flag``, ``listpage_turn_before``,
        ``listpage_pageBegin``, ``listpage_pageStep``, ``listpage_turn_after``
        (plus whatever ``add_err_msg`` records on failure).
    """
    dict_rule = {"flag":True,"listpage_turn_before":None,"listpage_pageBegin":None,"listpage_pageStep":1,"listpage_turn_after":None}
    # Raw strings: "\d" is an invalid escape sequence in a normal literal.
    # The capturing group keeps the digit runs in the split result.
    split_all_first = re.split(r"(\d+)", first_url)
    split_all_second = re.split(r"(\d+)", second_url)
    log("pageTurn first_url:\t"+first_url)
    log("pageTurn second_url:\t"+second_url)

    def _mismatch():
        # Shared failure path: record the error and mark the rule unusable.
        add_err_msg(dict_rule, "#翻页链接不匹配#")
        dict_rule["flag"] = False
        return dict_rule

    if len(split_all_first) != len(split_all_second):
        split_url = second_url.split('/')
        if split_url[-1] == 'index_2.html':
            dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
            dict_rule["listpage_turn_after"] = '.html'
            dict_rule["listpage_pageBegin"] = 2
            dict_rule["listpage_pageStep"] = 1
            return dict_rule
        return _mismatch()

    list_diff_index = [
        _index
        for _index, (_first, _second) in enumerate(zip(split_all_first, split_all_second))
        if _first != _second
    ]
    if len(list_diff_index) != 1:
        return _mismatch()

    diff_index = list_diff_index[0]
    diff_first = split_all_first[diff_index]
    diff_second = split_all_second[diff_index]
    # The single differing part must be a page number on both sides.
    if re.search(r"^\d+$", diff_first) is None or re.search(r"^\d+$", diff_second) is None:
        return _mismatch()

    _begin = int(diff_first)
    _end = int(diff_second)
    dict_rule["listpage_turn_before"] = "".join(split_all_first[:diff_index])
    dict_rule["listpage_turn_after"] = "".join(split_all_first[diff_index+1:])
    dict_rule["listpage_pageBegin"] = _begin
    dict_rule["listpage_pageStep"] = abs(_end - _begin)
    return dict_rule
- def getTurnRule(browser,listpage_url):
- '''
- Work out the paging rule of a list page by clicking "next page" twice and, when the first click fails, by clicking the numeric page links instead.
- NOTE(review): indentation is missing in this copy of the file, so the nesting implied by the comments below is inferred from the statements themselves - confirm against the original file.
- :param browser: browser object; the function starts from whatever page it currently shows (the loadPage call below is commented out)
- :param listpage_url: list page url; not read by the active code of this function
- :return: tuple (dict_rule, list_listpage_url): dict_rule is the result of getRuleOfUrl extended with "listpage_pageNum", "listpage_pageNum_pattern" and "listpage_nextPage"; list_listpage_url is [url1, url2]
- '''
- # try:
- # hd.loadPage(browser,listpage_url)
- first_url = browser.current_url
- list_listpage_url = []
- click_flag = True
- # First click on "next page"
- # click_next_1 = click_bt_nextPage(browser)
- # thread_run is defined outside this file; a None result is handled below
- click_next_1 = thread_run(click_bt_nextPage, browser)
- # url1 is only filled in by the numeric-paging branch (click on page "1")
- url1 = ''
- url2 = browser.current_url
- log("click next bt:"+str(click_next_1))
- # Second click on "next page"
- # click_next_2 = click_bt_nextPage(browser)
- click_next_2 = thread_run(click_bt_nextPage, browser)
- # A missing result is replaced by a "nothing clicked" placeholder with the same pageNum pattern text that the click script reports
- if click_next_1==None:
- click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
- [None, None]]
- if click_next_2==None:
- click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
- [None, None]]
- log("click next bt:"+str(click_next_2))
- # Index 1 holds [pattern text, pageNum xpath, pageNum jsoup]; index 2 holds [clicked-node xpath, clicked-node jsoup]
- list_pageNum1 = click_next_1[1]
- list_node1 = click_next_1[2]
- list_pageNum2 = click_next_2[1]
- list_node2 = click_next_2[2]
- dict_rule = None
- url3 = browser.current_url
- # Whether "next page" was actually clicked (only the second click counts)
- #click_flag = click_next_1[0] or click_next_2[0]
- click_flag = click_next_2[0]
- # Numeric paging
- # if not click_flag:
- # # First "next page" click succeeded but the second did not
- # log('开始数字翻页')
- # if click_next_1[0]:
- # click_last_1 = click_bt_lastPage(browser)
- # url2 = browser.current_url
- # log('第一次翻页成功,最后一页作为第二页')
- # Fall back to the numeric links when the first "next page" click did not succeed
- if not click_next_1[0]: # or not click_last_1[0]
- log('开始数字翻页')
- # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
- click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
- if click_pattern_2:
- url2 = browser.current_url
- log('数字翻页第二页%s'%url2)
- # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
- click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
- if click_pattern_3:
- url3 = browser.current_url
- log('数字翻页第三页%s'%url3)
- # NOTE(review): with the indentation lost it is unclear whether this else pairs with "if click_pattern_2" or "if click_pattern_3" - confirm before reformatting
- else:
- # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
- click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
- if click_pattern_1:
- url1 = browser.current_url
- log('数字翻页第一页%s'%url1)
- # Pick the first pair of differing URLs to derive the URL-concatenation rule from
- if url2 != url3:
- dict_rule = getRuleOfUrl(url2, url3)
- elif url1!='' and url2 != url1:
- dict_rule = getRuleOfUrl(url1, url2)
- else:
- dict_rule = getRuleOfUrl(first_url, url2)
- # A real script result has a 4th element, the click message; the placeholder above has only 3
- if click_next_1 != None and len(click_next_1)==4:
- click_message = click_next_1[3]
- if click_message!="":
- add_err_msg(dict_rule, '#%s#'%click_message)
- if not click_flag:
- add_err_msg(dict_rule, "#进行数字翻页#")
- list_listpage_url.append(url1)
- list_listpage_url.append(url2)
- # The page-count locator is accepted only when both clicks reported the same one; jsoup is preferred over xpath
- if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
- dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
- elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
- dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
- else:
- dict_rule["listpage_pageNum"] = None
- dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
- '''
- #若是未识别到pageNum则flag为False
- if dict_rule["listpage_pageNum"] is None:
- dict_rule["flag"] = False
- '''
- # Next-page locator: jsoup first, then xpath
- if list_node1[1]==list_node2[1] and list_node1[1] is not None:
- dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
- # Adaptation for lists that have only 2 pages: the second click yields no locator
- elif list_node1[1] is not None and list_node2[1] is None:
- log('只有两页更新适配 ')
- dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
- elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
- dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
- else:
- dict_rule["listpage_nextPage"] = None
- # Either a next-page button locator or a URL-concatenation rule is enough
- if dict_rule["listpage_nextPage"] is not None:
- dict_rule["flag"] = True
- else:
- add_err_msg(dict_rule, "#下一页规则未获取#")
- return dict_rule,list_listpage_url
- # except Exception as e:
- # error(str(e))
if __name__=="__main__":
    # Manual smoke test: open a sample list page, dump the clustered paging
    # nodes (text, type, left/top offset) and save a screenshot.
    browser = hd.getBrowser()
    try:
        #browser.get("http://www.jltc.edu.cn/xwdt/ggtz.htm")
        #browser.get("https://www.sdju.edu.cn/zb_3104/list.htm")
        browser.get("http://www.gzsmzmuseum.cn/list-7.html")
        #print(browser.page_source)
        # Variables are declared with ``var`` so they do not leak into the
        # page's global scope.
        script1 = '''
        var list_cluster = clustering_turnPage();
        var _array = new Array();
        for(var i=0;i<list_cluster.length;i++){
            _array.push([list_cluster[i][0].innerText,list_cluster[i][1],getOffsetLeft(list_cluster[i][0]),getOffsetTop(list_cluster[i][0])])
        }
        return _array
        '''

        # data = browser.execute_script(scripts_common+script1)
        data = get_js_rs(browser, scripts_common+script1)
        #browser.maximize_window()
        browser.save_screenshot("112.png")
        # get_js_rs may yield None (getTurnRule guards against that), so
        # fall back to an empty list instead of failing on iteration.
        for item in data or []:
            print(item)
        #print(getTurnRule(browser))
    finally:
        # Always release the driver; assumes hd.getBrowser() returns a
        # Selenium WebDriver (it is already used through get/save_screenshot).
        browser.quit()
|