engine.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. '''
  2. Created on 2019年8月5日
  3. @author: User
  4. '''
  5. import module.htmlDrawing as hd
  6. import time
  7. from selenium.webdriver.common.action_chains import ActionChains
  8. import re
  9. from module.Utils import *
  10. script = '''
  11. function click_bt(type_click){
  12. var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
  13. var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
  14. var pattern_tailPage = /^\s*(最[尾末后]一?页|tail|>\|).?s\s*$/
  15. list_cluster = clustering_turnPage();
  16. var pageNum = null;
  17. var pageNum_jsoup = null;
  18. var _node_xpath = null;
  19. var _node_jsoup = null;
  20. var _node_click = null;
  21. var click_message = '';
  22. for(var i=0;i<list_cluster.length;i++){
  23. _node = list_cluster[i][0]
  24. _type = list_cluster[i][1]
  25. if(_node.innerText!=null){
  26. var _match_num = _node.innerText.match(pattern_pageNum);
  27. if(pageNum==null && _match_num!=null){
  28. /*
  29. for(var j=1;j<_match_num.length;j++){
  30. if(_match_num[j]!=null){
  31. pageNum = _match_num[j]
  32. }
  33. }
  34. */
  35. //改为获取规则
  36. if(pageNum==null){
  37. pageNum = getXpath(_node);
  38. }
  39. if(pageNum_jsoup==null){
  40. pageNum_jsoup = getJsoup(_node);
  41. }
  42. }
  43. }
  44. if(_type==type_click){
  45. if(_node.tagName.toLowerCase() in {a:"",button:""} || _node.onclick!=null){
  46. //return getXpath(_node);
  47. _href = _node.getAttribute("href")
  48. if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
  49. if(_node_xpath==null){
  50. _node_xpath = getXpath(_node);
  51. }
  52. if(_node_jsoup==null){
  53. _node_jsoup = getJsoup(_node);
  54. }
  55. }
  56. if(_href==null || _href=="" || _href=="#"){
  57. click_message = '翻页链接为空或#异常';
  58. }
  59. if(_href!=null && _href.indexOf('javascript')>=0){
  60. click_message = '翻页链接为javascript';
  61. }
  62. if(_node_click==null){
  63. _node_click = _node;
  64. }
  65. }
  66. else if(_node.getAttribute("type")=='button'){
  67. _node_click = _node;
  68. click_message = '标签属性type为button的翻页';
  69. }
  70. else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){
  71. _href = _node.parentNode.getAttribute("href")
  72. if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
  73. if(_node_xpath==null){
  74. _node_xpath = getXpath(_node.parentNode);
  75. }
  76. if(_node_jsoup==null){
  77. _node_jsoup = getJsoup(_node.parentNode);
  78. }
  79. }
  80. if(_node_click==null){
  81. _node_click = _node.parentNode;
  82. }
  83. click_message = '父节点为翻页链接';
  84. }
  85. }
  86. }
  87. if(_node_click!=null){
  88. _node_click.click();
  89. return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
  90. }else{
  91. var _pattern = null;
  92. if(type_click=="nextPage"){
  93. _pattern = pattern_nextPage;
  94. }else{
  95. _pattern = pattern_tailPage;
  96. }
  97. var list_a = document.getElementsByTagName("a");
  98. for(var i=0;i<list_a.length;i++){
  99. var _node = list_a[i];
  100. if(_node!=null && _node.innerText!=null && _node.innerText.match(_pattern)!=null){
  101. _href = _node.getAttribute("href")
  102. if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
  103. _node_xpath = getXpath(_node);
  104. _node_jsoup = getJsoup(_node);
  105. }
  106. _node.click();
  107. click_message = '找不到翻页按钮,a标签为翻页链接';
  108. return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
  109. }
  110. }
  111. }
  112. if(click_message==''){click_message = '最终没找到翻页按钮';}
  113. return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
  114. }
  115. return click_bt(arguments[0]);
  116. '''
  117. script_pattern = '''
  118. function turnpage_by_pattern(pattern_s){
  119. list_cluster = clustering_turnPage();
  120. var pattern = new RegExp(pattern_s)
  121. for(var i=0;i<list_cluster.length;i++){
  122. _node = list_cluster[i][0];
  123. _type = list_cluster[i][1];
  124. if(_node!=null && _node.innerText!=null && _node.innerText.match(pattern)!=null){
  125. if((_node.tagName!=null && _node.tagName.toLowerCase()=="a")){
  126. _node.click();
  127. return true;
  128. }
  129. }
  130. }
  131. return false;
  132. }
  133. return turnpage_by_pattern(arguments[0]);
  134. '''
  135. def click_bt_lastPage(browser):
  136. _url = browser.current_url
  137. _window_handles = len(browser.window_handles)
  138. # _result = browser.execute_script(scripts_common+script,"lastPage")
  139. _result = get_js_rs(browser, scripts_common+script,"lastPage")
  140. if _result[0]:
  141. if len(browser.window_handles)>_window_handles:
  142. switch_window(browser)
  143. for i in range(4):
  144. if _url!=browser.current_url and browser.current_url[-1]!="#":
  145. return _result
  146. else:
  147. time.sleep(1)
  148. return _result
  149. def click_bt_nextPage(browser):
  150. _url = browser.current_url
  151. _window_handles = len(browser.window_handles)
  152. # _result = browser.execute_script(scripts_common+script,"nextPage")
  153. _result = get_js_rs(browser, scripts_common+script,"nextPage", timeout=30)
  154. if _result!=None and _result[0]:
  155. if len(browser.window_handles)>_window_handles:
  156. switch_window(browser)
  157. for i in range(4):
  158. if _url!=browser.current_url and browser.current_url[-1]!="#":
  159. return _result
  160. else:
  161. time.sleep(1.5)
  162. return _result
  163. def click_bt_tailPage(browser):
  164. _url = browser.current_url
  165. _window_handles = len(browser.window_handles)
  166. # _result = browser.execute_script(scripts_common+script,"tailPage")
  167. _result = get_js_rs(browser, scripts_common+script,"tailPage")
  168. if _result!=None and _result[0]:
  169. if len(browser.window_handles)>_window_handles:
  170. switch_window(browser)
  171. for i in range(4):
  172. if _url!=browser.current_url and browser.current_url[-1]!="#":
  173. return _result
  174. else:
  175. time.sleep(1)
  176. return _result
  177. def click_bt_pattern(browser,pattern):
  178. _url = browser.current_url
  179. _window_handles = len(browser.window_handles)
  180. # _result = browser.execute_script(scripts_common+script_pattern,pattern)
  181. _result = get_js_rs(browser, scripts_common+script_pattern,pattern)
  182. if _result:
  183. if len(browser.window_handles)>_window_handles:
  184. switch_window(browser)
  185. for i in range(4):
  186. if _url!=browser.current_url and browser.current_url[-1]!="#":
  187. return _result
  188. else:
  189. time.sleep(1)
  190. return _result
  191. def switch_window(browser):
  192. _current_handle = browser.current_window_handle
  193. browser.switch_to_window(browser.window_handles[-1])
  194. for i in range(10):
  195. if browser.current_url=="about:blank":
  196. time.sleep(1)
  197. else:
  198. break
  199. if browser.current_url=="about:blank":
  200. browser.switch_to_window(_current_handle)
  201. def getRuleOfUrl(first_url,second_url):
  202. dict_rule = {"flag":True,"listpage_turn_before":None,"listpage_pageBegin":None,"listpage_pageStep":1,"listpage_turn_after":None}
  203. pattern = "(\d+)"
  204. split_all_first = re.split(pattern,first_url)
  205. split_all_second = re.split(pattern,second_url)
  206. log("pageTurn first_url:\t"+first_url)
  207. log("pageTurn second_url:\t"+second_url)
  208. if len(split_all_first)!=len(split_all_second):
  209. split_url = second_url.split('/')
  210. if re.search('^index_[12].\w{3,5}$',split_url[-1]):
  211. suffix = split_url[-1].split('.')[1]
  212. page_begin = int(split_url[-1][6])
  213. dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
  214. dict_rule["listpage_turn_after"] = '.'+suffix
  215. dict_rule["listpage_pageBegin"] = page_begin
  216. dict_rule["listpage_pageStep"] = 1
  217. return dict_rule
  218. add_err_msg(dict_rule, "#翻页链接不匹配#")
  219. dict_rule["flag"] = False
  220. return dict_rule
  221. list_diff_index = []
  222. _index = 0
  223. for _first,_second in zip(split_all_first,split_all_second):
  224. if _first!=_second:
  225. list_diff_index.append(_index)
  226. _index += 1
  227. if len(list_diff_index)!=1:
  228. add_err_msg(dict_rule, "#翻页链接不匹配#")
  229. dict_rule["flag"] = False
  230. return dict_rule
  231. pattern = "^\d+$"
  232. diff_first = split_all_first[list_diff_index[0]]
  233. diff_second = split_all_second[list_diff_index[0]]
  234. if re.search(pattern,diff_first) is None or re.search(pattern,diff_second) is None:
  235. add_err_msg(dict_rule, "#翻页链接不匹配#")
  236. dict_rule["flag"] = False
  237. return dict_rule
  238. _begin = int(diff_first)
  239. _end = int(diff_second)
  240. if _begin<_end:
  241. pageStep = _end-_begin
  242. else:
  243. pageStep = _begin-_end
  244. part_before = "".join(split_all_first[:list_diff_index[0]])
  245. part_after = "".join(split_all_first[list_diff_index[0]+1:])
  246. dict_rule["listpage_turn_before"] = part_before
  247. dict_rule["listpage_turn_after"] = part_after
  248. dict_rule["listpage_pageBegin"] = _begin
  249. dict_rule["listpage_pageStep"] = pageStep
  250. return dict_rule
def getTurnRule(browser,listpage_url):
    '''
    Derive the next-page rule (page count, next-page link, etc.) by clicking
    "next page" or the numeric page links, and collect list-page URLs.

    :param browser: browser object; its current page is used as the first list
        page (the explicit page load is commented out)
    :param listpage_url: list page URL; not read by the current body
    :return: tuple ``(dict_rule, list_listpage_url)`` where
        ``list_listpage_url`` is ``[url1, url2]``
    '''
    # NOTE(review): this listing lost its indentation; the nesting below is
    # reconstructed from the statement order, in particular the numeric-paging
    # branch -- confirm against the original file.
    # try:
    # hd.loadPage(browser,listpage_url)
    first_url = browser.current_url
    list_listpage_url = []
    click_flag = True
    # First click on "next page".
    # click_next_1 = click_bt_nextPage(browser)
    # thread_run comes from module.Utils; presumably it runs the call in a
    # worker thread and may yield None -- TODO confirm.
    click_next_1 = thread_run(click_bt_nextPage, browser)
    url1 = ''
    url2 = browser.current_url
    log("click next bt:"+str(click_next_1))
    # Second click on "next page".
    # click_next_2 = click_bt_nextPage(browser)
    click_next_2 = thread_run(click_bt_nextPage, browser)
    # Replace a missing result with a "not clicked" placeholder that carries
    # the page-number regex source and empty locators (three items, no message).
    if click_next_1==None:
        click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
                        [None, None]]
    if click_next_2==None:
        click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
                        [None, None]]
    log("click next bt:"+str(click_next_2))
    # Index 1: [regex source, page-number xpath, page-number jsoup];
    # index 2: [clicked-link xpath, clicked-link jsoup].
    list_pageNum1 = click_next_1[1]
    list_node1 = click_next_1[2]
    list_pageNum2 = click_next_2[1]
    list_node2 = click_next_2[2]
    dict_rule = None
    url3 = browser.current_url
    # Whether a next-page click happened (only the second click counts now).
    #click_flag = click_next_1[0] or click_next_2[0]
    click_flag = click_next_2[0]
    # Numeric paging.
    # Disabled variant: when only the first next-page click succeeded, click
    # the last-page button and treat that URL as the second page.
    # if not click_flag:
    #     if click_next_1[0]:
    #         click_last_1 = click_bt_lastPage(browser)
    #         url2 = browser.current_url
    if not click_next_1[0]: # or not click_last_1[0]
        log('开始数字翻页')
        # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
        click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
        if click_pattern_2:
            url2 = browser.current_url
            log('数字翻页第二页%s'%url2)
            # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
            click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
            if click_pattern_3:
                url3 = browser.current_url
                log('数字翻页第三页%s'%url3)
            else:
                # No page "3" link was clicked: go to page "1" to capture its URL.
                # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
                click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
                if click_pattern_1:
                    url1 = browser.current_url
                    log('数字翻页第一页%s'%url1)
    # Pick the first URL pair that actually differs to build the splicing rule.
    if url2 != url3:
        dict_rule = getRuleOfUrl(url2, url3)
    elif url1!='' and url2 != url1:
        dict_rule = getRuleOfUrl(url1, url2)
    else:
        dict_rule = getRuleOfUrl(first_url, url2)
    # A four-item result carries the page script's diagnostic message.
    if click_next_1 != None and len(click_next_1)==4:
        click_message = click_next_1[3]
        if click_message!="":
            add_err_msg(dict_rule, '#%s#'%click_message)
    if not click_flag:
        add_err_msg(dict_rule, "#进行数字翻页#")
    list_listpage_url.append(url1)
    list_listpage_url.append(url2)
    # Page-number locator: keep it only when both clicks agree; jsoup first,
    # then xpath.
    if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
        dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
    elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
        dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
    else:
        dict_rule["listpage_pageNum"] = None
    dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
    # The string literal below is a no-op statement holding disabled logic
    # (it would set flag to False when no page-number locator was found).
    '''
    #若是未识别到pageNum则flag为False
    if dict_rule["listpage_pageNum"] is None:
        dict_rule["flag"] = False
    '''
    # Next-page locator: prefer jsoup, then xpath.
    # hd.hasDrew presumably reports that the link only exists after rendering
    # (per the log message below) -- TODO confirm in module.htmlDrawing.
    if list_node1[0]is not None and hd.hasDrew(first_url, [{"rule":list_node1[0],"type":"xpath"}])==True:
        log('翻页链接经过渲染')
        dict_rule["listpage_nextPage"] = None
    elif list_node1[1]==list_node2[1] and list_node1[1] is not None:
        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
    # Adaptation for lists that have only two pages: the second click finds
    # no link, so the first click's locator is used.
    elif list_node1[1] is not None and list_node2[1] is None:
        log('只有两页更新适配 ')
        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
    elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
        dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
    else:
        dict_rule["listpage_nextPage"] = None
    # Either a next-page button locator or a URL-splicing rule is sufficient,
    # so a found locator forces flag back to True.
    if dict_rule["listpage_nextPage"] is not None:
        dict_rule["flag"] = True
    else:
        add_err_msg(dict_rule, "#下一页规则未获取#")
    return dict_rule,list_listpage_url
    # except Exception as e:
    # error(str(e))
  362. if __name__=="__main__":
  363. browser = hd.getBrowser()
  364. #browser.get("http://www.jltc.edu.cn/xwdt/ggtz.htm")
  365. #browser.get("https://www.sdju.edu.cn/zb_3104/list.htm")
  366. browser.get("http://www.gzsmzmuseum.cn/list-7.html")
  367. #print(browser.page_source)
  368. script1 = '''
  369. list_cluster = clustering_turnPage();
  370. _array = new Array();
  371. for(var i=0;i<list_cluster.length;i++){
  372. _array.push([list_cluster[i][0].innerText,list_cluster[i][1],getOffsetLeft(list_cluster[i][0]),getOffsetTop(list_cluster[i][0])])
  373. }
  374. return _array
  375. '''
  376. # data = browser.execute_script(scripts_common+script1)
  377. data = get_js_rs(browser, scripts_common+script1)
  378. #browser.maximize_window()
  379. browser.save_screenshot("112.png")
  380. for item in data:
  381. print(item)
  382. #print(getTurnRule(browser))