|
@@ -14,15 +14,16 @@ script = '''
|
|
|
|
|
|
function click_bt(type_click){
|
|
|
var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
|
|
|
- var pattern_nextPage = /^\s*.?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
|
|
|
+ var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
|
|
|
|
|
|
- var pattern_tailPage = /^\s*.?(最?[尾末]一?页|tail|>\|).?s\s*$/
|
|
|
+ var pattern_tailPage = /^\s*(最[尾末后]一?页|tail|>\|).?s\s*$/
|
|
|
list_cluster = clustering_turnPage();
|
|
|
var pageNum = null;
|
|
|
var pageNum_jsoup = null;
|
|
|
var _node_xpath = null;
|
|
|
var _node_jsoup = null;
|
|
|
var _node_click = null;
|
|
|
+ var click_message = '';
|
|
|
for(var i=0;i<list_cluster.length;i++){
|
|
|
_node = list_cluster[i][0]
|
|
|
_type = list_cluster[i][1]
|
|
@@ -60,17 +61,42 @@ function click_bt(type_click){
|
|
|
}
|
|
|
|
|
|
}
|
|
|
+ if(_href==null || _href=="" || _href=="#"){
|
|
|
+ click_message = '翻页链接为空或#异常';
|
|
|
+ }
|
|
|
+ if(_href!=null && _href.indexOf('javascript')>=0){
|
|
|
+ click_message = '翻页链接为javascript';
|
|
|
+ }
|
|
|
if(_node_click==null){
|
|
|
_node_click = _node;
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ else if(_node.getAttribute("type")=='button'){
|
|
|
+ _node_click = _node;
|
|
|
+ click_message = '标签属性type为button的翻页';
|
|
|
+ }
|
|
|
+ else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){
|
|
|
+ _href = _node.parentNode.getAttribute("href")
|
|
|
+ if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
|
|
|
+ if(_node_xpath==null){
|
|
|
+ _node_xpath = getXpath(_node.parentNode);
|
|
|
+ }
|
|
|
+ if(_node_jsoup==null){
|
|
|
+ _node_jsoup = getJsoup(_node.parentNode);
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ if(_node_click==null){
|
|
|
+ _node_click = _node.parentNode;
|
|
|
}
|
|
|
-
|
|
|
-
|
|
|
+ click_message = '父节点为翻页链接';
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
if(_node_click!=null){
|
|
|
_node_click.click();
|
|
|
- return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
|
|
|
+ return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
|
|
|
}else{
|
|
|
var _pattern = null;
|
|
|
if(type_click=="nextPage"){
|
|
@@ -88,11 +114,13 @@ function click_bt(type_click){
|
|
|
_node_jsoup = getJsoup(_node);
|
|
|
}
|
|
|
_node.click();
|
|
|
- return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
|
|
|
+ click_message = '找不到翻页按钮,a标签为翻页链接';
|
|
|
+ return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
|
|
|
+ if(click_message==''){click_message = '最终没找到翻页按钮';}
|
|
|
+ return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
|
|
|
}
|
|
|
return click_bt(arguments[0]);
|
|
|
'''
|
|
@@ -119,7 +147,8 @@ return turnpage_by_pattern(arguments[0]);
|
|
|
def click_bt_lastPage(browser):
|
|
|
_url = browser.current_url
|
|
|
_window_handles = len(browser.window_handles)
|
|
|
- _result = browser.execute_script(scripts_common+script,"lastPage")
|
|
|
+ # _result = browser.execute_script(scripts_common+script,"lastPage")
|
|
|
+ _result = get_js_rs(browser, scripts_common+script,"lastPage")
|
|
|
if _result[0]:
|
|
|
if len(browser.window_handles)>_window_handles:
|
|
|
switch_window(browser)
|
|
@@ -133,8 +162,9 @@ def click_bt_lastPage(browser):
|
|
|
def click_bt_nextPage(browser):
|
|
|
_url = browser.current_url
|
|
|
_window_handles = len(browser.window_handles)
|
|
|
- _result = browser.execute_script(scripts_common+script,"nextPage")
|
|
|
- if _result[0]:
|
|
|
+ # _result = browser.execute_script(scripts_common+script,"nextPage")
|
|
|
+ _result = get_js_rs(browser, scripts_common+script,"nextPage", timeout=30)
|
|
|
+ if _result!=None and _result[0]:
|
|
|
if len(browser.window_handles)>_window_handles:
|
|
|
switch_window(browser)
|
|
|
for i in range(4):
|
|
@@ -147,8 +177,9 @@ def click_bt_nextPage(browser):
|
|
|
def click_bt_tailPage(browser):
|
|
|
_url = browser.current_url
|
|
|
_window_handles = len(browser.window_handles)
|
|
|
- _result = browser.execute_script(scripts_common+script,"tailPage")
|
|
|
- if _result[0]:
|
|
|
+ # _result = browser.execute_script(scripts_common+script,"tailPage")
|
|
|
+ _result = get_js_rs(browser, scripts_common+script,"tailPage")
|
|
|
+ if _result!=None and _result[0]:
|
|
|
if len(browser.window_handles)>_window_handles:
|
|
|
switch_window(browser)
|
|
|
for i in range(4):
|
|
@@ -161,7 +192,8 @@ def click_bt_tailPage(browser):
|
|
|
def click_bt_pattern(browser,pattern):
|
|
|
_url = browser.current_url
|
|
|
_window_handles = len(browser.window_handles)
|
|
|
- _result = browser.execute_script(scripts_common+script_pattern,pattern)
|
|
|
+ # _result = browser.execute_script(scripts_common+script_pattern,pattern)
|
|
|
+ _result = get_js_rs(browser, scripts_common+script_pattern,pattern)
|
|
|
if _result:
|
|
|
if len(browser.window_handles)>_window_handles:
|
|
|
switch_window(browser)
|
|
@@ -191,6 +223,13 @@ def getRuleOfUrl(first_url,second_url):
|
|
|
log("pageTurn first_url:\t"+first_url)
|
|
|
log("pageTurn second_url:\t"+second_url)
|
|
|
if len(split_all_first)!=len(split_all_second):
|
|
|
+ split_url = second_url.split('/')
|
|
|
+ if split_url[-1]== 'index_2.html':
|
|
|
+ dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
|
|
|
+ dict_rule["listpage_turn_after"] = '.html'
|
|
|
+ dict_rule["listpage_pageBegin"] = 2
|
|
|
+ dict_rule["listpage_pageStep"] = 1
|
|
|
+ return dict_rule
|
|
|
add_err_msg(dict_rule, "#翻页链接不匹配#")
|
|
|
dict_rule["flag"] = False
|
|
|
return dict_rule
|
|
@@ -226,86 +265,119 @@ def getRuleOfUrl(first_url,second_url):
|
|
|
return dict_rule
|
|
|
|
|
|
def getTurnRule(browser,listpage_url):
|
|
|
- try:
|
|
|
- hd.loadPage(browser,listpage_url)
|
|
|
- first_url = browser.current_url
|
|
|
- list_listpage_url = []
|
|
|
- click_flag = True
|
|
|
- #点击下一页
|
|
|
- click_next_1 = click_bt_nextPage(browser)
|
|
|
-
|
|
|
- url1 = browser.current_url
|
|
|
- log("click next bt:"+str(click_next_1))
|
|
|
- #点击下一页
|
|
|
- click_next_2 = click_bt_nextPage(browser)
|
|
|
- log("click next bt:"+str(click_next_2))
|
|
|
- list_pageNum1 = click_next_1[1]
|
|
|
- list_node1 = click_next_1[2]
|
|
|
- list_pageNum2 = click_next_2[1]
|
|
|
- list_node2 = click_next_2[2]
|
|
|
- dict_rule = None
|
|
|
- url2 = browser.current_url
|
|
|
-
|
|
|
- #是否有点击到下一页
|
|
|
- #click_flag = click_next_1[0] or click_next_2[0]
|
|
|
- click_flag = click_next_2[0]
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- #点击数字翻页
|
|
|
- if not click_flag:
|
|
|
- #第一个下一页点击到而第二个未点击到
|
|
|
- if click_next_1[0]:
|
|
|
- click_last_1 = click_bt_lastPage(browser)
|
|
|
- url2 = browser.current_url
|
|
|
- if not click_next_1[0] or not click_last_1[0]:
|
|
|
- click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
|
|
|
- if click_pattern_2:
|
|
|
- url2 = browser.current_url
|
|
|
- click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
|
|
|
- if click_pattern_1:
|
|
|
- url1 = browser.current_url
|
|
|
- if url1==first_url:
|
|
|
- click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
|
|
|
- if click_pattern_3:
|
|
|
- url1 = url2
|
|
|
- url2 = browser.current_url
|
|
|
-
|
|
|
- dict_rule = getRuleOfUrl(url1, url2)
|
|
|
- list_listpage_url.append(url1)
|
|
|
- list_listpage_url.append(url2)
|
|
|
-
|
|
|
- if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
|
|
|
- dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
|
|
|
- elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
|
|
|
- dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
|
|
|
- else:
|
|
|
- dict_rule["listpage_pageNum"] = None
|
|
|
- dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
|
|
|
- '''
|
|
|
- #若是未识别到pageNum则flag为False
|
|
|
- if dict_rule["listpage_pageNum"] is None:
|
|
|
- dict_rule["flag"] = False
|
|
|
- '''
|
|
|
- #优先jsoup,后xpath
|
|
|
- if list_node1[1]==list_node2[1] and list_node1[1] is not None:
|
|
|
- dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
|
|
|
- #只有2页的适配
|
|
|
- elif list_node1[1] is not None and list_node2[1] is None:
|
|
|
- dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
|
|
|
- elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
|
|
|
- dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
|
|
|
- else:
|
|
|
- dict_rule["listpage_nextPage"] = None
|
|
|
-
|
|
|
- #翻页按钮或者是拼接规则有一个即可
|
|
|
- if dict_rule["listpage_nextPage"] is not None:
|
|
|
- dict_rule["flag"] = True
|
|
|
+ '''
|
|
|
+ 通过点击下一页或数字翻页得到下一页规则(页数,下一页路径等),list_listpage_url(前后列表页url)
|
|
|
+ :param browser: 浏览器对象
|
|
|
+ :param listpage_url: 列表页url
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
+ # try:
|
|
|
+ # hd.loadPage(browser,listpage_url)
|
|
|
+ first_url = browser.current_url
|
|
|
+ list_listpage_url = []
|
|
|
+ click_flag = True
|
|
|
+ #点击下一页
|
|
|
+ # click_next_1 = click_bt_nextPage(browser)
|
|
|
+ click_next_1 = thread_run(click_bt_nextPage, browser)
|
|
|
+ url1 = ''
|
|
|
+ url2 = browser.current_url
|
|
|
+ log("click next bt:"+str(click_next_1))
|
|
|
+ #点击下一页
|
|
|
+ # click_next_2 = click_bt_nextPage(browser)
|
|
|
+ click_next_2 = thread_run(click_bt_nextPage, browser)
|
|
|
+ if click_next_1==None:
|
|
|
+ click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
|
|
|
+ [None, None]]
|
|
|
+ if click_next_2==None:
|
|
|
+ click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
|
|
|
+ [None, None]]
|
|
|
+ log("click next bt:"+str(click_next_2))
|
|
|
+ list_pageNum1 = click_next_1[1]
|
|
|
+ list_node1 = click_next_1[2]
|
|
|
+ list_pageNum2 = click_next_2[1]
|
|
|
+ list_node2 = click_next_2[2]
|
|
|
+ dict_rule = None
|
|
|
+ url3 = browser.current_url
|
|
|
+
|
|
|
+ #是否有点击到下一页
|
|
|
+ #click_flag = click_next_1[0] or click_next_2[0]
|
|
|
+ click_flag = click_next_2[0]
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ #点击数字翻页
|
|
|
+ # if not click_flag:
|
|
|
+ # #第一个下一页点击到而第二个未点击到
|
|
|
+ # log('开始数字翻页')
|
|
|
+ # if click_next_1[0]:
|
|
|
+ # click_last_1 = click_bt_lastPage(browser)
|
|
|
+ # url2 = browser.current_url
|
|
|
+ # log('第一次翻页成功,最后一页作为第二页')
|
|
|
+ if not click_next_1[0]: # or not click_last_1[0]
|
|
|
+ log('开始数字翻页')
|
|
|
+ # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
|
|
|
+ click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
|
|
|
+ if click_pattern_2:
|
|
|
+ url2 = browser.current_url
|
|
|
+ log('数字翻页第二页%s'%url2)
|
|
|
+ # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
|
|
|
+ click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
|
|
|
+ if click_pattern_3:
|
|
|
+ url3 = browser.current_url
|
|
|
+ log('数字翻页第三页%s'%url3)
|
|
|
else:
|
|
|
- add_err_msg(dict_rule, "#下一页规则未获取#")
|
|
|
- return dict_rule,list_listpage_url
|
|
|
- except Exception as e:
|
|
|
- error(str(e))
|
|
|
+ # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
|
|
|
+ click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
|
|
|
+ if click_pattern_1:
|
|
|
+ url1 = browser.current_url
|
|
|
+ log('数字翻页第一页%s'%url1)
|
|
|
+ if url2 != url3:
|
|
|
+ dict_rule = getRuleOfUrl(url2, url3)
|
|
|
+ elif url1!='' and url2 != url1:
|
|
|
+ dict_rule = getRuleOfUrl(url1, url2)
|
|
|
+ else:
|
|
|
+ dict_rule = getRuleOfUrl(first_url, url2)
|
|
|
+ if click_next_1 != None and len(click_next_1)==4:
|
|
|
+ click_message = click_next_1[3]
|
|
|
+ if click_message!="":
|
|
|
+ add_err_msg(dict_rule, '#%s#'%click_message)
|
|
|
+ if not click_flag:
|
|
|
+ add_err_msg(dict_rule, "#进行数字翻页#")
|
|
|
+ list_listpage_url.append(url1)
|
|
|
+ list_listpage_url.append(url2)
|
|
|
+
|
|
|
+ if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
|
|
|
+ dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
|
|
|
+ elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
|
|
|
+ dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
|
|
|
+ else:
|
|
|
+ dict_rule["listpage_pageNum"] = None
|
|
|
+ dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
|
|
|
+ '''
|
|
|
+ #若是未识别到pageNum则flag为False
|
|
|
+ if dict_rule["listpage_pageNum"] is None:
|
|
|
+ dict_rule["flag"] = False
|
|
|
+ '''
|
|
|
+ #优先jsoup,后xpath
|
|
|
+ if list_node1[1]==list_node2[1] and list_node1[1] is not None:
|
|
|
+ dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
|
|
|
+ #只有2页的适配
|
|
|
+ elif list_node1[1] is not None and list_node2[1] is None:
|
|
|
+ log('只有两页更新适配 ')
|
|
|
+ dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
|
|
|
+ elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
|
|
|
+ dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
|
|
|
+ else:
|
|
|
+ dict_rule["listpage_nextPage"] = None
|
|
|
+
|
|
|
+ #翻页按钮或者是拼接规则有一个即可
|
|
|
+ if dict_rule["listpage_nextPage"] is not None:
|
|
|
+ dict_rule["flag"] = True
|
|
|
+ else:
|
|
|
+ add_err_msg(dict_rule, "#下一页规则未获取#")
|
|
|
+ return dict_rule,list_listpage_url
|
|
|
+ # except Exception as e:
|
|
|
+ # error(str(e))
|
|
|
|
|
|
if __name__=="__main__":
|
|
|
browser = hd.getBrowser()
|
|
@@ -323,7 +395,8 @@ if __name__=="__main__":
|
|
|
return _array
|
|
|
'''
|
|
|
|
|
|
- data = browser.execute_script(scripts_common+script1)
|
|
|
+ # data = browser.execute_script(scripts_common+script1)
|
|
|
+ data = get_js_rs(browser, scripts_common+script1)
|
|
|
#browser.maximize_window()
|
|
|
browser.save_screenshot("112.png")
|
|
|
for item in data:
|