# htmlDrawing.py (11 KB)
  1. import os
  2. from selenium import webdriver
  3. from selenium.webdriver import DesiredCapabilities
  4. from bs4 import BeautifulSoup
  5. import platform
  6. import requests
  7. from lxml import html
  8. import re
  9. import signal
  10. from threading import Thread, RLock
  11. from module.Utils import *
  12. from queue import Queue
  13. import time
  14. header={
  15. "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
  16. # "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
  17. # cn%2Fuser%2FsimpleSSOLogin",
  18. "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
  19. "Content-Type": "application/x-www-form-urlencoded",
  20. "Connection": "Keep-Alive",
  21. "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
  22. AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
  23. #"Accept-Encoding": "gzip, deflate",
  24. # "Origin": "http://uia.hnist.cn",
  25. "Upgrade-Insecure-Requests": "1",
  26. }
  27. TYPE = "phantomjs"
  28. # TYPE = "chrome"
  29. current_path = os.path.abspath("/".join(__file__.split("\\")[:-1]))
  30. driver_paths = {"phantomjs_linux":current_path+"/../driver/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
  31. "phantomjs_window":current_path+"/../driver/phantomjs/phantomjs-2.1.1-windows/bin/phantomjs.exe",
  32. "chrome_linux":current_path+"/../driver/chromedriver/chromedriver_linux64/chromedriver",
  33. "chrome_window":current_path+"/../driver/chromedriver/chromedriver_win32/chromedriver.exe"}
  34. print(driver_paths)
  35. def getBrowser_phantomJS(platform="linux",straight=False):
  36. if platform=="linux":
  37. executable_path = driver_paths["phantomjs_linux"]
  38. else:
  39. executable_path = driver_paths["phantomjs_window"]
  40. desired_capabilities= DesiredCapabilities.PHANTOMJS.copy()
  41. # print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
  42. for key, value in header.items():
  43. desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
  44. desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
  45. desired_capabilities["phantomjs.page.settings.loadImages"] = False
  46. desired_capabilities["phantomjs.page.settings.disk-cache"] = False
  47. browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=TLSv1'])
  48. browser_phantomjs.implicitly_wait(10)
  49. browser_phantomjs.set_script_timeout(20)
  50. browser_phantomjs.set_page_load_timeout(10)
  51. return browser_phantomjs
  52. def getBrowser_chrome(platform="linux",straight=False):
  53. if platform=="linux":
  54. executable_path = driver_paths["chrome_linux"]
  55. else:
  56. executable_path = driver_paths["chrome_window"]
  57. chrome_options = webdriver.ChromeOptions()
  58. prefs = {"profile.managed_default_content_settings.images":2}
  59. chrome_options.add_experimental_option("prefs",prefs)
  60. chrome_options.add_argument('--headless')
  61. chrome_options.add_argument('--no-sandbox')
  62. chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36')
  63. desired_capabilities= DesiredCapabilities.CHROME.copy()
  64. desired_capabilities['loggingPrefs'] = { 'performance':'ALL' }
  65. browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any']) # '--ssl-protocol=any' TLSv1
  66. browser_chrome.implicitly_wait(15)
  67. browser_chrome.set_page_load_timeout(15)
  68. # browser_chrome = webdriver.Chrome(executable_path=executable_path)
  69. return browser_chrome
  70. def getBrowser(type=TYPE,straight=False):
  71. if platform.system()=="Windows":
  72. if type=="phantomjs":
  73. _browser = getBrowser_phantomJS(platform="window",straight=straight)
  74. else:
  75. _browser = getBrowser_chrome(platform="window",straight=straight)
  76. else:
  77. if type=="phantomjs":
  78. _browser = getBrowser_phantomJS(straight=straight)
  79. else:
  80. _browser = getBrowser_chrome(straight=straight)
  81. _browser.maximize_window()
  82. return _browser
  83. def getStatus(url):
  84. try:
  85. r = requests.get(url, headers=header, allow_redirects = False,timeout=15)
  86. except Exception as e:
  87. log('requests.get error :%s'%e)
  88. return 404
  89. return r.status_code
  90. _queue = Queue(4)
  91. lock = RLock()#用于获取池中的driver
  92. _get_count = 0#记录调用次数
  93. lock_kill = RLock()#用于杀掉进程,如果已经有一个kill线程启动了,则其他线程不启动
  94. def releaseAllDriver():
  95. '''
  96. @summary: 通过线程释放所有driver,并重新初始化
  97. '''
  98. def _method():
  99. if lock_kill.acquire(blocking=True, timeout=0):
  100. try:
  101. lock.acquire()
  102. wait_count = 0
  103. t0 = time.time()
  104. while(True):
  105. if _queue.full():
  106. break
  107. elif time.time()-t0>60:
  108. log('等待放回浏览器超时,强制释放所有driver')
  109. break
  110. else:
  111. wait_count += 1
  112. log("waitting for drivers all back..."+str(wait_count)+"qsize:"+str(_queue.qsize()))
  113. time.sleep(1)
  114. initQueue()
  115. finally:
  116. lock.release()
  117. lock_kill.release()
  118. t = Thread(target=_method)
  119. t.start()
  120. t.join(100)
  121. def killAllDriver():
  122. '''
  123. @summary: 干掉所有webdriver进程
  124. '''
  125. try:
  126. log("killing all webdrivers")
  127. if platform.system()=="Windows":
  128. if TYPE=="phantomjs":
  129. os.system('taskkill /im phantomjsdriver.exe /F')
  130. os.system('taskkill /im phantomjs.exe /F')
  131. else:
  132. os.system('taskkill /im chromedriver.exe /F')
  133. os.system('taskkill /im chrome.exe /F')
  134. else:
  135. if TYPE=="phantomjs":
  136. os.system('killall phantomjsdriver')
  137. os.system('killall phantomjs')
  138. else:
  139. os.system('killall chromedriver ')
  140. os.system('killall chrome')
  141. except Exception as e:
  142. error(str(e))
  143. def adddriver(browser):
  144. _queue.put(browser)
  145. def getdriver():
  146. '''
  147. @summary: 获取driver,要先获得锁
  148. '''
  149. global _get_count
  150. _get_count += 1
  151. if _get_count>1000:
  152. log("get_driver 达到调用次数,重新进行初始化")
  153. releaseAllDriver()
  154. _get_count = 0
  155. lock.acquire()
  156. browser = _queue.get()
  157. lock.release()
  158. return browser
  159. def initQueue():
  160. '''
  161. @summary: 初始化webdriver队列,每调用一定的次数或者卡死的时候进行初始化,将所有webdriver进程干掉,然后初始化这个队列
  162. '''
  163. #释放资源
  164. while(not _queue.empty()):
  165. _driver = _queue.get()
  166. _driver = None
  167. #杀死进程
  168. killAllDriver()
  169. #初始化
  170. while(not _queue.full()):
  171. browser = getBrowser()
  172. adddriver(browser)
  173. initQueue()
  174. def executeMethod(_method,args):
  175. '''
  176. @summary: 执行所有需要browser的方法,确保driver拿到之后会放回队列中
  177. '''
  178. try:
  179. browser = getdriver(_queue)
  180. args["browser"] = browser
  181. _method(args)
  182. finally:
  183. adddriver(browser)
  184. def hasDrew(url,list_rule):
  185. '''
  186. @summary: 根据规则判断是否渲染
  187. @param: url:网页链接,list_rule: xpath规则数组
  188. '''
  189. def hasdrew(url,list_rule):
  190. try:
  191. r = requests.get(url, headers=header, allow_redirects = False, timeout=10)
  192. _encoding = r.encoding
  193. if _encoding is None:
  194. _encoding = "utf8"
  195. dom = html.fromstring(r.content.decode(_encoding))
  196. for item in list_rule:
  197. if item["type"]=="xpath":
  198. if item["rule"] is not None:
  199. list_nodes = dom.xpath(item["rule"])
  200. if len(list_nodes)==0:
  201. return True
  202. except Exception as e:
  203. error(str(e))
  204. return False
  205. rs = thread_run(hasdrew, url,list_rule)
  206. if rs != None:
  207. return rs
  208. else:
  209. return False
  210. def loadPage(browser,url,timeout=30):
  211. '''
  212. @summary: 解决selenium加载网页不返回的问题,设置线程进行加载,对线程设置超时时间
  213. '''
  214. def _thread_load(browser,url):
  215. try:
  216. debug("load "+url)
  217. browser.get(url)
  218. debug("load "+url+" done")
  219. except Exception as e:
  220. error(str(e))
  221. log('加载页面抛出异常:'+str(e))
  222. if re.search("由于目标计算机积极拒绝",str(e)) is not None:
  223. log('log page exception')
  224. releaseAllDriver()
  225. t = Thread(target=_thread_load,args=(browser,url))
  226. t.start()
  227. debug("load_thread:"+str(t.ident)+" "+str(t.name))
  228. t.join(timeout)
  229. if t.isAlive():
  230. #browser = None
  231. '''
  232. browser.service.process.send_signal(signal.SIGTERM)
  233. '''
  234. #执行释放资源的线程
  235. error("driver get方法卡住,强制释放所有资源")
  236. stop_thread(t)
  237. log('stop_loadpage thread return false')
  238. adddriver(browser)
  239. debug("release driver")
  240. releaseAllDriver()
  241. return False
  242. # raise NameError("超时加载"+str(url))
  243. return True
  244. def getSource(url):
  245. '''
  246. #import chardet
  247. sess = requests.Session()
  248. sess.headers = header
  249. data=sess.get(url)
  250. encoding = data.encoding
  251. #print(encoding)
  252. data = data.text.encode(data.encoding)
  253. content = data.decode(encoding)
  254. #print(content)
  255. browser = webdriver.PhantomJS()
  256. browser.get(url)
  257. iframe = browser.find_elements_by_tag_name("iframe")[0]
  258. browser.switch_to_frame(iframe)
  259. print(browser.page_source)
  260. content = browser.page_source
  261. print(content)
  262. #browser.close()
  263. '''
  264. #browser = getBrowser_chrome()
  265. browser = getBrowser()
  266. browser.get(url)
  267. element = browser.find_elements_by_xpath('//*')
  268. for ele in element:
  269. print(ele.rect)
  270. #print(element.value_of_css_property("width"))
  271. #print(element.get_attribute("innerHTML"))
  272. content = browser.page_source
  273. soup = BeautifulSoup(content,"lxml")
  274. iframes = soup.find_all("iframe",recursive=True)
  275. if len(iframes)>0:
  276. browser_iframes = browser.find_elements_by_tag_name("iframe")
  277. for iframe,browser_iframe in zip(iframes,browser_iframes):
  278. browser.switch_to_default_content()
  279. browser.switch_to_frame(browser_iframe)
  280. iframe.replace_with(BeautifulSoup(browser.page_source,"lxml"))
  281. return soup
  282. if __name__=="__main__":
  283. url = "http://yl.km.gov.cn/tzgg/"
  284. #source = getSource(url)
  285. hasDrew(url,[{"rule":'//*[@class="li_c"]',"type":"xpath"}])