# Source repository: luojiehua/ContentExtract
'''
Created on 2019-08-05

Pagination ("turn page") rule detection for list pages, driven through a
Selenium browser and injected JavaScript.

@author: User
'''

import module.htmlDrawing as hd
import time
from selenium.webdriver.common.action_chains import ActionChains
import re
from module.Utils import *

# JavaScript injected into the page (after ``scripts_common``) to locate and
# click a pagination control.  ``arguments[0]`` is the control type to click
# ("nextPage", "tailPage", "lastPage", ...) as labelled by
# ``clustering_turnPage()``, which together with ``getXpath``/``getJsoup`` is
# expected to come from ``scripts_common``.
#
# The script returns a 4-element array:
#   [clicked,
#    [page-count regex source, xpath of page-count node, jsoup of page-count node],
#    [xpath of clicked link, jsoup of clicked link],
#    message]
# The link xpath/jsoup selectors are only filled in when the control has a real
# href (not empty, not "#", not a javascript: pseudo link).
#
# When no clustered node of the requested type is clickable, every <a> element
# is scanned and the first one whose text matches ``pattern_nextPage`` (for
# "nextPage") or ``pattern_tailPage`` (for any other type) is clicked.
#
# A raw string is used so the regex backslashes reach the browser verbatim
# without relying on Python's handling of unrecognised escape sequences.
script = r'''

function click_bt(type_click){
    var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
    var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
    
    var pattern_tailPage = /^\s*(最[尾末后]一?页|tail|>\|).?\s*$/
    list_cluster = clustering_turnPage();
    var pageNum = null;
    var pageNum_jsoup = null;
    var _node_xpath = null;
    var _node_jsoup = null;
    var _node_click = null;
    var click_message = '';
    for(var i=0;i<list_cluster.length;i++){
        _node = list_cluster[i][0]
        _type = list_cluster[i][1]
        
        if(_node.innerText!=null){
            var _match_num = _node.innerText.match(pattern_pageNum);
            if(pageNum==null && _match_num!=null){
                /*
                for(var j=1;j<_match_num.length;j++){
                    if(_match_num[j]!=null){
                        pageNum = _match_num[j]
                    }
                }
                */
                //改为获取规则
                if(pageNum==null){
                    pageNum = getXpath(_node);
                }
                if(pageNum_jsoup==null){
                    pageNum_jsoup = getJsoup(_node);
                }
            }
        }
        
        if(_type==type_click){
            if(_node.tagName.toLowerCase() in {a:"",button:""} || _node.onclick!=null){
                //return getXpath(_node);
                _href = _node.getAttribute("href")
                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
                    if(_node_xpath==null){
                        _node_xpath = getXpath(_node);
                    }
                    if(_node_jsoup==null){
                        _node_jsoup = getJsoup(_node);
                    }
                    
                }
                if(_href==null || _href=="" || _href=="#"){
                    click_message = '翻页链接为空或#异常';
                }
                if(_href!=null && _href.indexOf('javascript')>=0){
                    click_message = '翻页链接为javascript';
                }
                if(_node_click==null){
                    _node_click = _node;
                }               
               
            }
            else if(_node.getAttribute("type")=='button'){
                _node_click = _node;
                click_message = '标签属性type为button的翻页';
            }            
            else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){
                _href = _node.parentNode.getAttribute("href")
                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
                    if(_node_xpath==null){
                        _node_xpath = getXpath(_node.parentNode);
                    }
                    if(_node_jsoup==null){
                        _node_jsoup = getJsoup(_node.parentNode);
                    }
                    
                }
                if(_node_click==null){
                    _node_click = _node.parentNode;
                }
                click_message = '父节点为翻页链接';
            }
        }
    }
    if(_node_click!=null){
        _node_click.click();
        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
    }else{
        var _pattern = null;
        if(type_click=="nextPage"){
            _pattern = pattern_nextPage;
        }else{
            _pattern = pattern_tailPage;
        }
        var list_a = document.getElementsByTagName("a");
        for(var i=0;i<list_a.length;i++){
            var _node = list_a[i];
            if(_node!=null && _node.innerText!=null && _node.innerText.match(_pattern)!=null){
                _href = _node.getAttribute("href")
                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
                    _node_xpath = getXpath(_node);
                    _node_jsoup = getJsoup(_node);
                }
                _node.click();
                click_message = '找不到翻页按钮，a标签为翻页链接';
                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
            }
        }
    }
    if(click_message==''){click_message = '最终没找到翻页按钮';}
    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
}
return click_bt(arguments[0]);
'''

# JavaScript injected into the page (after ``scripts_common``) to click a
# pagination link by its visible text.  ``arguments[0]`` is a regular
# expression source string; the first node returned by
# ``clustering_turnPage()`` whose innerText matches it AND which is an <a>
# element is clicked.  The script returns true when a link was clicked and
# false otherwise.
script_pattern = '''
function turnpage_by_pattern(pattern_s){
    list_cluster = clustering_turnPage();
    var pattern = new RegExp(pattern_s)
    for(var i=0;i<list_cluster.length;i++){
        _node = list_cluster[i][0];
        _type = list_cluster[i][1];
        if(_node!=null && _node.innerText!=null && _node.innerText.match(pattern)!=null){
            if((_node.tagName!=null && _node.tagName.toLowerCase()=="a")){
                _node.click();
                return true;
            }
        }
    }
    return false;
}
return turnpage_by_pattern(arguments[0]);
'''

def click_bt_lastPage(browser):
    """Click the control labelled "lastPage" and wait briefly for navigation.

    Runs the ``click_bt`` page script with type ``"lastPage"``.  When the
    script reports a click, focus moves to a newly opened window (if any) and
    the current URL is polled up to 4 times, one second apart, until it
    differs from the starting URL and does not end with ``#``.

    :param browser: Selenium WebDriver instance positioned on the list page.
    :return: whatever ``get_js_rs`` produced -- normally the script's
        ``[clicked, page_num_info, node_info, message]`` list.  It is returned
        unchanged whether or not the URL changed.
    """
    _url = browser.current_url
    _window_handles = len(browser.window_handles)
    _result = get_js_rs(browser, scripts_common+script, "lastPage")
    # The sibling click helpers and getTurnRule all treat a None result as
    # possible, so guard before indexing instead of raising TypeError here.
    if _result is not None and _result[0]:
        if len(browser.window_handles) > _window_handles:
            switch_window(browser)
        for _ in range(4):
            if _url != browser.current_url and browser.current_url[-1] != "#":
                break
            time.sleep(1)
    return _result

def click_bt_nextPage(browser):
    """Click the "next page" control and wait briefly for navigation.

    Runs the ``click_bt`` page script with type ``"nextPage"`` (30 s timeout
    passed to ``get_js_rs``).  If the script reports a click, focus moves to a
    newly opened window when one appeared, then the current URL is polled up
    to 4 times, 1.5 s apart, until it differs from the starting URL and does
    not end with ``#``.

    :param browser: Selenium WebDriver instance positioned on the list page.
    :return: the value produced by ``get_js_rs`` (may be None), unchanged.
    """
    url_before = browser.current_url
    handle_count_before = len(browser.window_handles)
    outcome = get_js_rs(browser, scripts_common + script, "nextPage", timeout=30)

    # Nothing was clicked (or the script produced no result): nothing to wait for.
    if outcome is None or not outcome[0]:
        return outcome

    if len(browser.window_handles) > handle_count_before:
        switch_window(browser)

    for _attempt in range(4):
        if url_before != browser.current_url and browser.current_url[-1] != "#":
            break
        time.sleep(1.5)
    return outcome

def click_bt_tailPage(browser):
    """Click the "tail page" (final page) control and wait for navigation.

    Runs the ``click_bt`` page script with type ``"tailPage"``.  After a
    reported click, a newly opened window (if any) receives focus and the URL
    is polled up to 4 times, one second apart, until it has changed and does
    not end with ``#``.

    :param browser: Selenium WebDriver instance positioned on the list page.
    :return: the value produced by ``get_js_rs`` (may be None), unchanged.
    """
    start_url = browser.current_url
    n_windows = len(browser.window_handles)
    js_result = get_js_rs(browser, scripts_common + script, "tailPage")

    clicked = js_result is not None and js_result[0]
    if clicked:
        if n_windows < len(browser.window_handles):
            switch_window(browser)
        remaining = 4
        while remaining > 0:
            if start_url != browser.current_url and browser.current_url[-1] != "#":
                return js_result
            time.sleep(1)
            remaining -= 1
    return js_result

def click_bt_pattern(browser,pattern):
    """Click the pagination link whose text matches ``pattern``.

    Runs the ``turnpage_by_pattern`` page script.  When it reports a click,
    focus moves to a newly opened window (if any) and the URL is polled up to
    4 times, one second apart, until it has changed and does not end with
    ``#``.

    :param browser: Selenium WebDriver instance positioned on the list page.
    :param pattern: regular-expression source handed to JavaScript ``RegExp``.
    :return: the value produced by ``get_js_rs`` (truthy when a link was
        clicked), unchanged.
    """
    origin_url = browser.current_url
    windows_before = len(browser.window_handles)
    clicked = get_js_rs(browser, scripts_common + script_pattern, pattern)
    if not clicked:
        return clicked

    if len(browser.window_handles) > windows_before:
        switch_window(browser)

    for _ in range(4):
        if origin_url != browser.current_url and browser.current_url[-1] != "#":
            break
        time.sleep(1)
    return clicked

def _focus_window(browser, handle):
    """Give ``handle`` focus using whichever window-switch API the driver has."""
    switch_to = getattr(browser, "switch_to", None)
    if switch_to is not None:
        # Current API; ``switch_to_window`` no longer exists in Selenium 4.
        switch_to.window(handle)
    else:
        # Very old drivers that only expose the legacy method.
        browser.switch_to_window(handle)


def switch_window(browser, max_polls=10, poll_interval=1):
    """Move focus to the most recently opened window, if it actually loads.

    Focus is given to the last handle in ``browser.window_handles``.  The new
    window is then polled while its URL is still ``about:blank``; if it is
    still blank after the last poll, focus returns to the window that was
    current on entry.

    :param browser: Selenium WebDriver instance.
    :param max_polls: maximum number of polls while the URL is blank
        (default 10, the previous hard-coded value).
    :param poll_interval: seconds to sleep between polls (default 1).
    :return: None.
    """
    _current_handle = browser.current_window_handle
    _focus_window(browser, browser.window_handles[-1])
    for _ in range(max_polls):
        if browser.current_url != "about:blank":
            break
        time.sleep(poll_interval)
    # Still blank after waiting: the popup never loaded, so go back.
    if browser.current_url == "about:blank":
        _focus_window(browser, _current_handle)

def getRuleOfUrl(first_url,second_url):
    """Derive a URL-concatenation paging rule from two consecutive page URLs.

    Both URLs are split on runs of digits.  A rule is produced only when the
    two splits have the same number of pieces and differ in exactly one piece,
    and that piece is numeric in both URLs.  For example
    ``http://x/list_1.html`` and ``http://x/list_2.html`` give
    before=``http://x/list_``, begin=1, step=1, after=``.html``.

    Special case: when the splits have different lengths and the last path
    segment of ``second_url`` is exactly ``index_2.html``, the rule
    ``<dir>/index_`` + page + ``.html`` starting at page 2 is returned.

    :param first_url: URL of the earlier list page.
    :param second_url: URL of the following list page.
    :return: dict with keys ``flag`` (False when no rule could be derived),
        ``listpage_turn_before``, ``listpage_pageBegin``,
        ``listpage_pageStep`` and ``listpage_turn_after``.  On failure an
        error message is also attached through ``add_err_msg``.
    """
    dict_rule = {"flag":True,"listpage_turn_before":None,"listpage_pageBegin":None,"listpage_pageStep":1,"listpage_turn_after":None}

    def _mismatch():
        # Shared failure path: record the error and mark the rule unusable.
        add_err_msg(dict_rule, "#翻页链接不匹配#")
        dict_rule["flag"] = False
        return dict_rule

    # The capture group keeps the digit runs in the split output, so numeric
    # and non-numeric pieces alternate.
    digit_runs = r"(\d+)"
    split_all_first = re.split(digit_runs, first_url)
    split_all_second = re.split(digit_runs, second_url)
    log("pageTurn first_url:\t"+first_url)
    log("pageTurn second_url:\t"+second_url)

    if len(split_all_first) != len(split_all_second):
        split_url = second_url.split('/')
        if split_url[-1] == 'index_2.html':
            dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
            dict_rule["listpage_turn_after"] = '.html'
            dict_rule["listpage_pageBegin"] = 2
            dict_rule["listpage_pageStep"] = 1
            return dict_rule
        return _mismatch()

    list_diff_index = [
        _index
        for _index, (_first, _second) in enumerate(zip(split_all_first, split_all_second))
        if _first != _second
    ]
    if len(list_diff_index) != 1:
        return _mismatch()

    diff_at = list_diff_index[0]
    diff_first = split_all_first[diff_at]
    diff_second = split_all_second[diff_at]
    all_digits = r"^\d+$"
    if re.search(all_digits, diff_first) is None or re.search(all_digits, diff_second) is None:
        return _mismatch()

    _begin = int(diff_first)
    _end = int(diff_second)
    dict_rule["listpage_turn_before"] = "".join(split_all_first[:diff_at])
    dict_rule["listpage_turn_after"] = "".join(split_all_first[diff_at+1:])
    dict_rule["listpage_pageBegin"] = _begin
    # The step is stored as a magnitude regardless of paging direction.
    dict_rule["listpage_pageStep"] = abs(_end - _begin)
    return dict_rule

def getTurnRule(browser,listpage_url):
    '''
    Work out the paging rule of a list page by clicking "next page" twice
    and, if the first click failed, by clicking the numeric page links.

    The returned rule combines a URL-concatenation rule (from getRuleOfUrl)
    with selectors for the page-count node and the next-page link.

    :param browser: browser (WebDriver) object, already on the list page
    :param listpage_url: list page URL; not used by the visible code (the
        page-load call is commented out)
    :return: tuple (dict_rule, list_listpage_url) where list_listpage_url is
        [url1, url2] as captured below
    '''
    # try:
    # hd.loadPage(browser,listpage_url)
    first_url = browser.current_url
    list_listpage_url = []
    click_flag = True
    # First "next page" click.
    # NOTE(review): thread_run is defined elsewhere; the None checks below
    # suggest it returns None when the call fails or times out -- confirm.
    # click_next_1 = click_bt_nextPage(browser)
    click_next_1 = thread_run(click_bt_nextPage, browser)
    url1 = ''
    # URL after the first click (equals first_url if nothing navigated).
    url2 = browser.current_url
    log("click next bt:"+str(click_next_1))
    # Second "next page" click.
    # click_next_2 = click_bt_nextPage(browser)
    click_next_2 = thread_run(click_bt_nextPage, browser)
    # Substitute a "nothing clicked" result so the indexing below is safe.
    # The placeholder has 3 elements (no message), unlike the script's 4.
    if click_next_1==None:
        click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
                        [None, None]]
    if click_next_2==None:
        click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
                        [None, None]]
    log("click next bt:"+str(click_next_2))
    # Element [1] is [pattern, pageNum xpath, pageNum jsoup];
    # element [2] is [link xpath, link jsoup].
    list_pageNum1 = click_next_1[1]
    list_node1 = click_next_1[2]
    list_pageNum2 = click_next_2[1]
    list_node2 = click_next_2[2]
    dict_rule = None
    # URL after the second click.
    url3 = browser.current_url

    # Whether "next page" was clicked; only the second click counts.
    #click_flag = click_next_1[0] or click_next_2[0]
    click_flag = click_next_2[0]


    # Numeric-link paging fallback.
    # if not click_flag:
    #     # first next-page click succeeded but the second did not
    #     log('开始数字翻页')
        # if click_next_1[0]:
        #     click_last_1 = click_bt_lastPage(browser)
        #     url2 = browser.current_url
        #     log('第一次翻页成功，最后一页作为第二页')
    if not click_next_1[0]: # or not click_last_1[0]
        log('开始数字翻页')
        # Click the link whose text is exactly "2".
        # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
        click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
        if click_pattern_2:
            url2 = browser.current_url
            log('数字翻页第二页%s'%url2)
        # Then the link whose text is exactly "3".
        # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
        click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
        if click_pattern_3:
            url3 = browser.current_url
            log('数字翻页第三页%s'%url3)
        else:
            # No page 3: go back to "1" so that pages 1 and 2 can be compared.
            # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
            click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
            if click_pattern_1:
                url1 = browser.current_url
                log('数字翻页第一页%s'%url1)
    # Pick the best available pair of consecutive page URLs:
    # (page 2, page 3), else (page 1, page 2), else (initial URL, page 2).
    if url2 != url3:
        dict_rule = getRuleOfUrl(url2, url3)
    elif url1!='' and url2 != url1:
        dict_rule = getRuleOfUrl(url1, url2)
    else:
        dict_rule = getRuleOfUrl(first_url, url2)
    # Surface the script's diagnostic message from the first click, if any.
    if click_next_1 != None and len(click_next_1)==4:
        click_message = click_next_1[3]
        if click_message!="":
            add_err_msg(dict_rule, '#%s#'%click_message)
    if not click_flag:
        add_err_msg(dict_rule, "#进行数字翻页#")
    # NOTE(review): url1 stays '' unless the numeric "1" link was clicked, so
    # the first entry of the returned list is often empty -- confirm intent.
    list_listpage_url.append(url1)
    list_listpage_url.append(url2)

    # Page-count selector: accepted only when both clicks agree; jsoup is
    # preferred over xpath.
    if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
        dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
    elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
        dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
    else:
        dict_rule["listpage_pageNum"] = None
    dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
    '''
    #若是未识别到pageNum则flag为False
    if dict_rule["listpage_pageNum"] is None:
        dict_rule["flag"] = False
    '''
    # Next-page selector: jsoup first, then xpath.
    if list_node1[1]==list_node2[1] and list_node1[1] is not None:
        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
    # Adaptation for lists with only two pages: the second click found no link.
    elif list_node1[1] is not None and list_node2[1] is None:
        log('只有两页更新适配 ')
        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
    elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
        dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
    else:
        dict_rule["listpage_nextPage"] = None

    # Either a next-page selector or a URL-concatenation rule is sufficient:
    # a selector forces flag to True even if getRuleOfUrl had failed.
    if dict_rule["listpage_nextPage"] is not None:
        dict_rule["flag"] = True
    else:
        add_err_msg(dict_rule, "#下一页规则未获取#")
    return dict_rule,list_listpage_url
    # except Exception as e:
    #     error(str(e))
    # except Exception as e:
    #     error(str(e))

if __name__ == "__main__":
    # Manual smoke test: open a sample list page, dump every node that
    # clustering_turnPage() reports (text, type, left/top offsets) and save a
    # screenshot of the page.
    driver = hd.getBrowser()

    # Other sample pages that can be swapped in:
    #   http://www.jltc.edu.cn/xwdt/ggtz.htm
    #   https://www.sdju.edu.cn/zb_3104/list.htm
    driver.get("http://www.gzsmzmuseum.cn/list-7.html")

    dump_clusters_js = '''
    list_cluster = clustering_turnPage();
    _array = new Array();
    for(var i=0;i<list_cluster.length;i++){
        _array.push([list_cluster[i][0].innerText,list_cluster[i][1],getOffsetLeft(list_cluster[i][0]),getOffsetTop(list_cluster[i][0])])
    }
    return _array
    '''

    cluster_rows = get_js_rs(driver, scripts_common + dump_clusters_js)
    driver.save_screenshot("112.png")
    for row in cluster_rows:
        print(row)