# engine.py
'''
Created on 2019-08-05

@author: User
'''
  5. import module.htmlDrawing as hd
  6. import time
  7. from selenium.webdriver.common.action_chains import ActionChains
  8. import re
  9. from module.Utils import *
  10. script = '''
  11. function click_bt(type_click){
  12. var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
  13. var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
  14. var pattern_tailPage = /^\s*(最[尾末后]一?页|tail|>\|).?s\s*$/
  15. list_cluster = clustering_turnPage();
  16. var pageNum = null;
  17. var pageNum_jsoup = null;
  18. var _node_xpath = null;
  19. var _node_jsoup = null;
  20. var _node_click = null;
  21. var click_message = '';
  22. for(var i=0;i<list_cluster.length;i++){
  23. _node = list_cluster[i][0]
  24. _type = list_cluster[i][1]
  25. if(_node.innerText!=null){
  26. var _match_num = _node.innerText.match(pattern_pageNum);
  27. if(pageNum==null && _match_num!=null){
  28. /*
  29. for(var j=1;j<_match_num.length;j++){
  30. if(_match_num[j]!=null){
  31. pageNum = _match_num[j]
  32. }
  33. }
  34. */
  35. //改为获取规则
  36. if(pageNum==null){
  37. pageNum = getXpath(_node);
  38. }
  39. if(pageNum_jsoup==null){
  40. pageNum_jsoup = getJsoup(_node);
  41. }
  42. }
  43. }
  44. if(_type==type_click){
  45. if(_node.tagName.toLowerCase() in {a:"",button:""} || _node.onclick!=null){
  46. //return getXpath(_node);
  47. _href = _node.getAttribute("href")
  48. if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
  49. if(_node_xpath==null){
  50. _node_xpath = getXpath(_node);
  51. }
  52. if(_node_jsoup==null){
  53. _node_jsoup = getJsoup(_node);
  54. }
  55. }
  56. if(_href==null || _href=="" || _href=="#"){
  57. click_message = '翻页链接为空或#异常';
  58. }
  59. if(_href!=null && _href.indexOf('javascript')>=0){
  60. click_message = '翻页链接为javascript';
  61. }
  62. if(_node_click==null){
  63. _node_click = _node;
  64. }
  65. }
  66. else if(_node.getAttribute("type")=='button'){
  67. _node_click = _node;
  68. click_message = '标签属性type为button的翻页';
  69. }
  70. else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){
  71. _href = _node.parentNode.getAttribute("href")
  72. if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
  73. if(_node_xpath==null){
  74. _node_xpath = getXpath(_node.parentNode);
  75. }
  76. if(_node_jsoup==null){
  77. _node_jsoup = getJsoup(_node.parentNode);
  78. }
  79. }
  80. if(_node_click==null){
  81. _node_click = _node.parentNode;
  82. }
  83. click_message = '父节点为翻页链接';
  84. }
  85. }
  86. }
  87. if(_node_click!=null){
  88. _node_click.click();
  89. return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
  90. }else{
  91. var _pattern = null;
  92. if(type_click=="nextPage"){
  93. _pattern = pattern_nextPage;
  94. }else{
  95. _pattern = pattern_tailPage;
  96. }
  97. var list_a = document.getElementsByTagName("a");
  98. for(var i=0;i<list_a.length;i++){
  99. var _node = list_a[i];
  100. if(_node!=null && _node.innerText!=null && _node.innerText.match(_pattern)!=null){
  101. _href = _node.getAttribute("href")
  102. if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
  103. _node_xpath = getXpath(_node);
  104. _node_jsoup = getJsoup(_node);
  105. }
  106. _node.click();
  107. click_message = '找不到翻页按钮,a标签为翻页链接';
  108. return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
  109. }
  110. }
  111. }
  112. if(click_message==''){click_message = '最终没找到翻页按钮';}
  113. return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
  114. }
  115. return click_bt(arguments[0]);
  116. '''
  117. script_pattern = '''
  118. function turnpage_by_pattern(pattern_s){
  119. list_cluster = clustering_turnPage();
  120. var pattern = new RegExp(pattern_s)
  121. for(var i=0;i<list_cluster.length;i++){
  122. _node = list_cluster[i][0];
  123. _type = list_cluster[i][1];
  124. if(_node!=null && _node.innerText!=null && _node.innerText.match(pattern)!=null){
  125. if((_node.tagName!=null && _node.tagName.toLowerCase()=="a")){
  126. _node.click();
  127. return true;
  128. }
  129. }
  130. }
  131. return false;
  132. }
  133. return turnpage_by_pattern(arguments[0]);
  134. '''
  135. def click_bt_lastPage(browser):
  136. _url = browser.current_url
  137. _window_handles = len(browser.window_handles)
  138. # _result = browser.execute_script(scripts_common+script,"lastPage")
  139. _result = get_js_rs(browser, scripts_common+script,"lastPage")
  140. if _result[0]:
  141. if len(browser.window_handles)>_window_handles:
  142. switch_window(browser)
  143. for i in range(4):
  144. if _url!=browser.current_url and browser.current_url[-1]!="#":
  145. return _result
  146. else:
  147. time.sleep(1)
  148. return _result
  149. def click_bt_nextPage(browser):
  150. _url = browser.current_url
  151. _window_handles = len(browser.window_handles)
  152. # _result = browser.execute_script(scripts_common+script,"nextPage")
  153. _result = get_js_rs(browser, scripts_common+script,"nextPage", timeout=30)
  154. if _result!=None and _result[0]:
  155. if len(browser.window_handles)>_window_handles:
  156. switch_window(browser)
  157. for i in range(4):
  158. if _url!=browser.current_url and browser.current_url[-1]!="#":
  159. return _result
  160. else:
  161. time.sleep(1.5)
  162. return _result
  163. def click_bt_tailPage(browser):
  164. _url = browser.current_url
  165. _window_handles = len(browser.window_handles)
  166. # _result = browser.execute_script(scripts_common+script,"tailPage")
  167. _result = get_js_rs(browser, scripts_common+script,"tailPage")
  168. if _result!=None and _result[0]:
  169. if len(browser.window_handles)>_window_handles:
  170. switch_window(browser)
  171. for i in range(4):
  172. if _url!=browser.current_url and browser.current_url[-1]!="#":
  173. return _result
  174. else:
  175. time.sleep(1)
  176. return _result
  177. def click_bt_pattern(browser,pattern):
  178. _url = browser.current_url
  179. _window_handles = len(browser.window_handles)
  180. # _result = browser.execute_script(scripts_common+script_pattern,pattern)
  181. _result = get_js_rs(browser, scripts_common+script_pattern,pattern)
  182. if _result:
  183. if len(browser.window_handles)>_window_handles:
  184. switch_window(browser)
  185. for i in range(4):
  186. if _url!=browser.current_url and browser.current_url[-1]!="#":
  187. return _result
  188. else:
  189. time.sleep(1)
  190. return _result
  191. def switch_window(browser):
  192. _current_handle = browser.current_window_handle
  193. browser.switch_to_window(browser.window_handles[-1])
  194. for i in range(10):
  195. if browser.current_url=="about:blank":
  196. time.sleep(1)
  197. else:
  198. break
  199. if browser.current_url=="about:blank":
  200. browser.switch_to_window(_current_handle)
  201. def getRuleOfUrl(first_url,second_url):
  202. dict_rule = {"flag":True,"listpage_turn_before":None,"listpage_pageBegin":None,"listpage_pageStep":1,"listpage_turn_after":None}
  203. pattern = "(\d+)"
  204. split_all_first = re.split(pattern,first_url)
  205. split_all_second = re.split(pattern,second_url)
  206. log("pageTurn first_url:\t"+first_url)
  207. log("pageTurn second_url:\t"+second_url)
  208. if len(split_all_first)!=len(split_all_second):
  209. split_url = second_url.split('/')
  210. if split_url[-1]== 'index_2.html':
  211. dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
  212. dict_rule["listpage_turn_after"] = '.html'
  213. dict_rule["listpage_pageBegin"] = 2
  214. dict_rule["listpage_pageStep"] = 1
  215. return dict_rule
  216. add_err_msg(dict_rule, "#翻页链接不匹配#")
  217. dict_rule["flag"] = False
  218. return dict_rule
  219. list_diff_index = []
  220. _index = 0
  221. for _first,_second in zip(split_all_first,split_all_second):
  222. if _first!=_second:
  223. list_diff_index.append(_index)
  224. _index += 1
  225. if len(list_diff_index)!=1:
  226. add_err_msg(dict_rule, "#翻页链接不匹配#")
  227. dict_rule["flag"] = False
  228. return dict_rule
  229. pattern = "^\d+$"
  230. diff_first = split_all_first[list_diff_index[0]]
  231. diff_second = split_all_second[list_diff_index[0]]
  232. if re.search(pattern,diff_first) is None or re.search(pattern,diff_second) is None:
  233. add_err_msg(dict_rule, "#翻页链接不匹配#")
  234. dict_rule["flag"] = False
  235. return dict_rule
  236. _begin = int(diff_first)
  237. _end = int(diff_second)
  238. if _begin<_end:
  239. pageStep = _end-_begin
  240. else:
  241. pageStep = _begin-_end
  242. part_before = "".join(split_all_first[:list_diff_index[0]])
  243. part_after = "".join(split_all_first[list_diff_index[0]+1:])
  244. dict_rule["listpage_turn_before"] = part_before
  245. dict_rule["listpage_turn_after"] = part_after
  246. dict_rule["listpage_pageBegin"] = _begin
  247. dict_rule["listpage_pageStep"] = pageStep
  248. return dict_rule
def getTurnRule(browser,listpage_url):
    '''
    Derive the paging rule (page-count selector, next-page selector, URL
    pattern) by clicking "next page" or numeric page links, and collect the
    URLs of the visited list pages.

    :param browser: browser (driver) object, already showing the list page
    :param listpage_url: list page URL; not read by the active code (only a
        commented-out page load used it)
    :return: tuple ``(dict_rule, list_listpage_url)`` where ``dict_rule`` is
        the dict built by getRuleOfUrl extended with ``listpage_pageNum``,
        ``listpage_pageNum_pattern`` and ``listpage_nextPage``, and
        ``list_listpage_url`` is ``[url1, url2]``
    '''
    # The page load (hd.loadPage(browser, listpage_url)) and a surrounding
    # try/except are disabled; the caller is expected to have loaded the page.
    first_url = browser.current_url
    list_listpage_url = []
    click_flag = True
    # First "next page" click.  thread_run is an opaque helper from
    # module.Utils; presumably it runs the callable with a timeout and yields
    # None when no result is available -- TODO confirm.
    click_next_1 = thread_run(click_bt_nextPage, browser)
    url1 = ''
    url2 = browser.current_url
    log("click next bt:"+str(click_next_1))
    # Second "next page" click.
    click_next_2 = thread_run(click_bt_nextPage, browser)
    # Substitute a "nothing clicked" result (3 items, no message) when a click
    # produced no result; the first item of the inner list mirrors the string
    # form of pattern_pageNum in the paging script.
    if click_next_1==None:
        click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
                        [None, None]]
    if click_next_2==None:
        click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
                        [None, None]]
    log("click next bt:"+str(click_next_2))
    list_pageNum1 = click_next_1[1]
    list_node1 = click_next_1[2]
    list_pageNum2 = click_next_2[1]
    list_node2 = click_next_2[2]
    dict_rule = None
    url3 = browser.current_url
    # Whether "next page" was clicked; only the second click counts (an
    # earlier version used click_next_1[0] or click_next_2[0]).
    click_flag = click_next_2[0]
    # Numeric paging fallback, taken when the first "next page" click failed.
    # NOTE(review): the nesting of this branch was reconstructed from a paste
    # that lost its indentation -- confirm that "3" is meant to be tried even
    # when "2" was not clicked, and that "1" is the fallback for a failed "3".
    if not click_next_1[0]:
        log('开始数字翻页')
        click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
        if click_pattern_2:
            url2 = browser.current_url
            log('数字翻页第二页%s'%url2)
        click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
        if click_pattern_3:
            url3 = browser.current_url
            log('数字翻页第三页%s'%url3)
        else:
            click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
            if click_pattern_1:
                url1 = browser.current_url
                log('数字翻页第一页%s'%url1)
    # Pick the first pair of distinct URLs to infer the URL rule from:
    # (url2, url3), then (url1, url2), else (first_url, url2).
    if url2 != url3:
        dict_rule = getRuleOfUrl(url2, url3)
    elif url1!='' and url2 != url1:
        dict_rule = getRuleOfUrl(url1, url2)
    else:
        dict_rule = getRuleOfUrl(first_url, url2)
    # A 4-item result carries the script's click message as its last item;
    # the substitute result above has only 3 items.
    if click_next_1 != None and len(click_next_1)==4:
        click_message = click_next_1[3]
        if click_message!="":
            add_err_msg(dict_rule, '#%s#'%click_message)
    if not click_flag:
        add_err_msg(dict_rule, "#进行数字翻页#")
    list_listpage_url.append(url1)
    list_listpage_url.append(url2)
    # Page-count selector: accepted only when both clicks reported the same
    # one; the jsoup form (index 2) is preferred over xpath (index 1).
    if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
        dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
    elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
        dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
    else:
        dict_rule["listpage_pageNum"] = None
    dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
    # Disabled check, kept as a no-op string statement: it used to set flag to
    # False when no page-count selector was recognised.
    '''
    #若是未识别到pageNum则flag为False
    if dict_rule["listpage_pageNum"] is None:
        dict_rule["flag"] = False
    '''
    # Next-page selector: jsoup first, then xpath.
    if list_node1[1]==list_node2[1] and list_node1[1] is not None:
        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
    # Adaptation for lists with only two pages: the second click found no node.
    elif list_node1[1] is not None and list_node2[1] is None:
        log('只有两页更新适配 ')
        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
    elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
        dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
    else:
        dict_rule["listpage_nextPage"] = None
    # Either a next-page button or a URL concatenation rule is sufficient, so
    # a found button forces flag back to True even if getRuleOfUrl cleared it.
    if dict_rule["listpage_nextPage"] is not None:
        dict_rule["flag"] = True
    else:
        add_err_msg(dict_rule, "#下一页规则未获取#")
    return dict_rule,list_listpage_url
  357. if __name__=="__main__":
  358. browser = hd.getBrowser()
  359. #browser.get("http://www.jltc.edu.cn/xwdt/ggtz.htm")
  360. #browser.get("https://www.sdju.edu.cn/zb_3104/list.htm")
  361. browser.get("http://www.gzsmzmuseum.cn/list-7.html")
  362. #print(browser.page_source)
  363. script1 = '''
  364. list_cluster = clustering_turnPage();
  365. _array = new Array();
  366. for(var i=0;i<list_cluster.length;i++){
  367. _array.push([list_cluster[i][0].innerText,list_cluster[i][1],getOffsetLeft(list_cluster[i][0]),getOffsetTop(list_cluster[i][0])])
  368. }
  369. return _array
  370. '''
  371. # data = browser.execute_script(scripts_common+script1)
  372. data = get_js_rs(browser, scripts_common+script1)
  373. #browser.maximize_window()
  374. browser.save_screenshot("112.png")
  375. for item in data:
  376. print(item)
  377. #print(getTurnRule(browser))