''' Created on 2019年8月5日 @author: User ''' import module.htmlDrawing as hd import time from selenium.webdriver.common.action_chains import ActionChains import re from module.Utils import * script = ''' function click_bt(type_click){ var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/ var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/ var pattern_tailPage = /^\s*(最[尾末后]一?页|tail|>\|).?s\s*$/ list_cluster = clustering_turnPage(); var pageNum = null; var pageNum_jsoup = null; var _node_xpath = null; var _node_jsoup = null; var _node_click = null; var click_message = ''; for(var i=0;i=0){ click_message = '翻页链接为javascript'; } if(_node_click==null){ _node_click = _node; } } else if(_node.getAttribute("type")=='button'){ _node_click = _node; click_message = '标签属性type为button的翻页'; } else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){ _href = _node.parentNode.getAttribute("href") if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){ if(_node_xpath==null){ _node_xpath = getXpath(_node.parentNode); } if(_node_jsoup==null){ _node_jsoup = getJsoup(_node.parentNode); } } if(_node_click==null){ _node_click = _node.parentNode; } click_message = '父节点为翻页链接'; } } } if(_node_click!=null){ _node_click.click(); return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message]; }else{ var _pattern = null; if(type_click=="nextPage"){ _pattern = pattern_nextPage; }else{ _pattern = pattern_tailPage; } var list_a = document.getElementsByTagName("a"); for(var i=0;i_window_handles: switch_window(browser) for i in range(4): if _url!=browser.current_url and browser.current_url[-1]!="#": return _result else: time.sleep(1) return _result def click_bt_nextPage(browser): _url = browser.current_url _window_handles = len(browser.window_handles) # _result = browser.execute_script(scripts_common+script,"nextPage") _result = get_js_rs(browser, 
def _settle_after_click(browser, previous_url, previous_handle_count, poll_delay):
    """Follow a click: adopt a newly opened window, then wait for the URL to change.

    :param browser: Selenium WebDriver-like object
    :param previous_url: value of ``browser.current_url`` before the click
    :param previous_handle_count: number of window handles before the click
    :param poll_delay: seconds to sleep between URL checks
    """
    if len(browser.window_handles) > previous_handle_count:
        switch_window(browser)
    # Poll at most four times; give up silently if the page never navigates
    # (for example when paging is done purely through AJAX).
    for _ in range(4):
        current_url = browser.current_url
        # endswith() is used instead of [-1] so an empty URL cannot raise IndexError.
        if current_url != previous_url and not current_url.endswith("#"):
            return
        time.sleep(poll_delay)


def click_bt_nextPage(browser):
    """Click the "next page" control of the current page through injected JavaScript.

    :param browser: Selenium WebDriver-like object
    :return: whatever ``get_js_rs`` returns for the script (may be None);
        element 0 is treated as the "clicked" flag
    """
    _url = browser.current_url
    _window_handles = len(browser.window_handles)
    _result = get_js_rs(browser, scripts_common+script, "nextPage", timeout=30)
    if _result is not None and _result[0]:
        _settle_after_click(browser, _url, _window_handles, 1.5)
    return _result


def click_bt_tailPage(browser):
    """Click the "last page" control of the current page through injected JavaScript.

    :param browser: Selenium WebDriver-like object
    :return: whatever ``get_js_rs`` returns for the script (may be None);
        element 0 is treated as the "clicked" flag
    """
    _url = browser.current_url
    _window_handles = len(browser.window_handles)
    _result = get_js_rs(browser, scripts_common+script, "tailPage")
    if _result is not None and _result[0]:
        _settle_after_click(browser, _url, _window_handles, 1)
    return _result


def click_bt_pattern(browser,pattern):
    """Click the paging control selected by ``pattern`` through injected JavaScript.

    :param browser: Selenium WebDriver-like object
    :param pattern: regular-expression source passed to the page script
    :return: whatever ``get_js_rs`` returns for the script; a truthy value is
        treated as "clicked"
    """
    _url = browser.current_url
    _window_handles = len(browser.window_handles)
    _result = get_js_rs(browser, scripts_common+script_pattern, pattern)
    if _result:
        _settle_after_click(browser, _url, _window_handles, 1)
    return _result


def switch_window(browser):
    """Switch to the last window handle, falling back if it never loads.

    The last handle is given up to ten seconds to leave ``about:blank``; if it
    is still blank afterwards, focus returns to the window that was current
    when this function was called.

    :param browser: Selenium WebDriver-like object
    """
    _current_handle = browser.current_window_handle
    # switch_to.window replaces the deprecated switch_to_window, which was
    # removed in Selenium 4.
    browser.switch_to.window(browser.window_handles[-1])
    for _ in range(10):
        if browser.current_url != "about:blank":
            break
        time.sleep(1)
    if browser.current_url == "about:blank":
        browser.switch_to.window(_current_handle)
def getRuleOfUrl(first_url,second_url):
    """Derive a URL-concatenation paging rule from two consecutive list-page URLs.

    Both URLs are split into alternating text / digit-run segments. A rule is
    produced only when exactly one segment differs and that segment is a
    number in both URLs. As a special case, when the segment counts differ
    but the second URL ends in ``index_1.<ext>`` or ``index_2.<ext>``, an
    ``index_<n>.<ext>`` rule is produced.

    :param first_url: URL of the earlier list page
    :param second_url: URL of the following list page
    :return: dict with keys ``flag`` (False when no rule could be derived),
        ``listpage_turn_before``, ``listpage_pageBegin``,
        ``listpage_pageStep`` and ``listpage_turn_after``
    """
    dict_rule = {"flag":True,"listpage_turn_before":None,"listpage_pageBegin":None,"listpage_pageStep":1,"listpage_turn_after":None}

    def _mismatch():
        # Record the failure on the rule and mark it unusable.
        add_err_msg(dict_rule, "#翻页链接不匹配#")
        dict_rule["flag"] = False
        return dict_rule

    # The capturing group makes re.split keep the digit runs as their own segments.
    split_all_first = re.split(r"(\d+)", first_url)
    split_all_second = re.split(r"(\d+)", second_url)
    log("pageTurn first_url:\t"+first_url)
    log("pageTurn second_url:\t"+second_url)

    if len(split_all_first) != len(split_all_second):
        split_url = second_url.split('/')
        # The dot is escaped so only a real "index_<n>.<ext>" file name matches;
        # the groups give the page number and extension directly.
        index_page = re.search(r'^index_([12])\.(\w{3,5})$', split_url[-1])
        if index_page is None:
            return _mismatch()
        dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
        dict_rule["listpage_turn_after"] = '.'+index_page.group(2)
        dict_rule["listpage_pageBegin"] = int(index_page.group(1))
        dict_rule["listpage_pageStep"] = 1
        return dict_rule

    list_diff_index = [
        _index
        for _index, (_first, _second) in enumerate(zip(split_all_first, split_all_second))
        if _first != _second
    ]
    if len(list_diff_index) != 1:
        return _mismatch()

    diff_at = list_diff_index[0]
    diff_first = split_all_first[diff_at]
    diff_second = split_all_second[diff_at]
    if re.search(r"^\d+$", diff_first) is None or re.search(r"^\d+$", diff_second) is None:
        return _mismatch()

    _begin = int(diff_first)
    _end = int(diff_second)
    dict_rule["listpage_turn_before"] = "".join(split_all_first[:diff_at])
    dict_rule["listpage_turn_after"] = "".join(split_all_first[diff_at+1:])
    dict_rule["listpage_pageBegin"] = _begin
    # The step is stored as a magnitude regardless of paging direction.
    dict_rule["listpage_pageStep"] = abs(_end - _begin)
    return dict_rule
def getTurnRule(browser,listpage_url):
    '''
    Derive the page-turn rule of a list page by clicking "next page" twice and,
    when the first click fails, by clicking the numeric page links instead.

    NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
    source; confirm the indentation of the numeric-paging branch against the
    original file.

    :param browser: browser object; the list page is expected to be loaded already
    :param listpage_url: URL of the list page (only used by the commented-out loadPage call)
    :return: tuple (dict_rule, list_listpage_url) - the rule dict built by
        getRuleOfUrl and extended here, and the list [url1, url2]
    '''
    # try:
    # hd.loadPage(browser,listpage_url)
    first_url = browser.current_url
    list_listpage_url = []
    click_flag = True
    # First click on "next page"
    # click_next_1 = click_bt_nextPage(browser)
    click_next_1 = thread_run(click_bt_nextPage, browser)
    url1 = ''
    url2 = browser.current_url
    log("click next bt:"+str(click_next_1))
    # Second click on "next page"
    # click_next_2 = click_bt_nextPage(browser)
    click_next_2 = thread_run(click_bt_nextPage, browser)
    # When a click returned nothing, substitute a "not clicked" result with the
    # same shape: [clicked, [pattern, pageNum, pageNum_jsoup], [xpath, jsoup]].
    if click_next_1==None:
        click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None], [None, None]]
    if click_next_2==None:
        click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None], [None, None]]
    log("click next bt:"+str(click_next_2))
    list_pageNum1 = click_next_1[1]
    list_node1 = click_next_1[2]
    list_pageNum2 = click_next_2[1]
    list_node2 = click_next_2[2]
    dict_rule = None
    url3 = browser.current_url
    # Whether "next page" was actually clicked
    #click_flag = click_next_1[0] or click_next_2[0]
    click_flag = click_next_2[0]
    # Click the numeric page links
    # if not click_flag:
    # # the first "next page" click succeeded but the second did not
    # log('开始数字翻页')
    # if click_next_1[0]:
    # click_last_1 = click_bt_lastPage(browser)
    # url2 = browser.current_url
    # log('第一次翻页成功,最后一页作为第二页')
    if not click_next_1[0]: # or not click_last_1[0]
        log('开始数字翻页')
        # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
        click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
        if click_pattern_2:
            url2 = browser.current_url
            log('数字翻页第二页%s'%url2)
            # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
            click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
            if click_pattern_3:
                url3 = browser.current_url
                log('数字翻页第三页%s'%url3)
            else:
                # No link "3": go back to page 1 so pages 1 and 2 can be compared.
                # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
                click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
                if click_pattern_1:
                    url1 = browser.current_url
                    log('数字翻页第一页%s'%url1)
    # Pick the first pair of distinct URLs to derive the concatenation rule from.
    if url2 != url3:
        dict_rule = getRuleOfUrl(url2, url3)
    elif url1!='' and url2 != url1:
        dict_rule = getRuleOfUrl(url1, url2)
    else:
        dict_rule = getRuleOfUrl(first_url, url2)
    # A 4-element click result carries a diagnostic message at index 3.
    if click_next_1 != None and len(click_next_1)==4:
        click_message = click_next_1[3]
        if click_message!="":
            add_err_msg(dict_rule, '#%s#'%click_message)
    if not click_flag:
        add_err_msg(dict_rule, "#进行数字翻页#")
    list_listpage_url.append(url1)
    list_listpage_url.append(url2)
    # Keep a page-count locator only when both clicks reported the same one;
    # index 2 is tagged "jsoup" and index 1 is tagged "xpath".
    if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
        dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
    elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
        dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
    else:
        dict_rule["listpage_pageNum"] = None
    dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
    ''' #若是未识别到pageNum则flag为False if dict_rule["listpage_pageNum"] is None: dict_rule["flag"] = False '''
    # Prefer the jsoup locator, then fall back to xpath
    if list_node1[0]is not None and hd.hasDrew(first_url, [{"rule":list_node1[0],"type":"xpath"}])==True:
        log('翻页链接经过渲染')
        dict_rule["listpage_nextPage"] = None
    elif list_node1[1]==list_node2[1] and list_node1[1] is not None:
        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
    # Adaptation for lists that have only two pages
    elif list_node1[1] is not None and list_node2[1] is None:
        log('只有两页更新适配 ')
        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
    elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
        dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
    else:
        dict_rule["listpage_nextPage"] = None
    # Either a next-page button or a URL concatenation rule is sufficient
    if dict_rule["listpage_nextPage"] is not None:
        dict_rule["flag"] = True
    else:
        add_err_msg(dict_rule, "#下一页规则未获取#")
    return dict_rule,list_listpage_url
    # except Exception as e:
    # error(str(e))