'''
@summary: Pool of Selenium webdrivers (PhantomJS or headless Chrome) plus
          helpers for loading pages with a hard timeout.

Importing this module has side effects: it prints the resolved driver paths
and immediately fills the driver pool (see the initQueue() call below).
'''
# NOTE: import order is kept exactly as in the original file. The wildcard
# import from module.Utils sits in the middle, so reordering could change
# which definition wins if Utils exports a name that is also imported here.
import os
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from bs4 import BeautifulSoup
import platform
import requests
from lxml import html
import re
import signal
from threading import Thread, RLock
from module.Utils import *
from queue import Queue
import time

# Default HTTP headers, used for plain `requests` calls and injected into
# PhantomJS as custom page headers.
header = {
    "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
    # "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.cn%2Fuser%2FsimpleSSOLogin",
    "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "Keep-Alive",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"
    ),
    # "Accept-Encoding": "gzip, deflate",
    # "Origin": "http://uia.hnist.cn",
    "Upgrade-Insecure-Requests": "1",
}

# Which driver backend the pool uses: "phantomjs" or "chrome".
TYPE = "phantomjs"
# TYPE = "chrome"

# Directory containing this file. The previous implementation split __file__
# on "\\", which on POSIX yields an empty string and therefore resolved to the
# process working directory instead of this file's directory.
current_path = os.path.dirname(os.path.abspath(__file__))

driver_paths = {
    "phantomjs_linux": current_path + "/../driver/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
    "phantomjs_window": current_path + "/../driver/phantomjs/phantomjs-2.1.1-windows/bin/phantomjs.exe",
    "chrome_linux": current_path + "/../driver/chromedriver/chromedriver_linux64/chromedriver",
    "chrome_window": current_path + "/../driver/chromedriver/chromedriver_win32/chromedriver.exe",
}
print(driver_paths)


def getBrowser_phantomJS(platform="linux", straight=False):
    '''
    @summary: Create a PhantomJS webdriver with the module headers applied.
    @param: platform: "linux" selects the Linux binary; any other value
            selects the Windows binary.
            straight: accepted for signature compatibility; not used here.
    @return: a configured webdriver.PhantomJS instance
    '''
    if platform == "linux":
        executable_path = driver_paths["phantomjs_linux"]
    else:
        executable_path = driver_paths["phantomjs_window"]

    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    for key, value in header.items():
        desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
    # Overrides the User-Agent copied from `header` with a newer browser string.
    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
    desired_capabilities["phantomjs.page.settings.loadImages"] = False
    desired_capabilities["phantomjs.page.settings.disk-cache"] = False

    browser_phantomjs = webdriver.PhantomJS(
        executable_path=executable_path,
        desired_capabilities=desired_capabilities,
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'],
    )
    browser_phantomjs.implicitly_wait(10)
    browser_phantomjs.set_script_timeout(20)
    browser_phantomjs.set_page_load_timeout(10)
    return browser_phantomjs


def getBrowser_chrome(platform="linux", straight=False):
    '''
    @summary: Create a headless Chrome webdriver with image loading disabled
              and performance logging enabled.
    @param: platform: "linux" selects the Linux chromedriver; any other value
            selects the Windows chromedriver.
            straight: accepted for signature compatibility; not used here.
    @return: a configured webdriver.Chrome instance
    '''
    if platform == "linux":
        executable_path = driver_paths["chrome_linux"]
    else:
        executable_path = driver_paths["chrome_window"]

    chrome_options = webdriver.ChromeOptions()
    # Content setting value 2 blocks images.
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36')

    desired_capabilities = DesiredCapabilities.CHROME.copy()
    desired_capabilities['loggingPrefs'] = {'performance': 'ALL'}

    browser_chrome = webdriver.Chrome(
        desired_capabilities=desired_capabilities,
        executable_path=executable_path,
        chrome_options=chrome_options,
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'],
    )
    browser_chrome.implicitly_wait(15)
    browser_chrome.set_page_load_timeout(15)
    return browser_chrome


def getBrowser(type=TYPE, straight=False):
    '''
    @summary: Create a maximized webdriver for the current operating system.
    @param: type: "phantomjs" for PhantomJS; any other value selects Chrome.
            straight: forwarded unchanged to the backend factory.
    @return: the new webdriver instance
    '''
    if platform.system() == "Windows":
        if type == "phantomjs":
            _browser = getBrowser_phantomJS(platform="window", straight=straight)
        else:
            _browser = getBrowser_chrome(platform="window", straight=straight)
    else:
        if type == "phantomjs":
            _browser = getBrowser_phantomJS(straight=straight)
        else:
            _browser = getBrowser_chrome(straight=straight)
    _browser.maximize_window()
    return _browser


def getStatus(url):
    '''
    @summary: Fetch url without following redirects and return its status code.
    @return: the HTTP status code, or 404 when the request itself fails
    '''
    try:
        r = requests.get(url, headers=header, allow_redirects=False, timeout=15)
    except Exception as e:
        log('requests.get error :%s' % e)
        return 404
    return r.status_code


_queue = Queue(4)    # pool of idle webdrivers (capacity 4)
lock = RLock()       # guards taking a driver out of the pool
_get_count = 0       # number of getdriver() calls since the last reset
lock_kill = RLock()  # ensures only one release/re-init thread runs at a time


def releaseAllDriver():
    '''
    @summary: Release every driver and re-initialise the pool from a worker
              thread. Waits up to 60 seconds for borrowed drivers to be put
              back before forcing the re-initialisation. The caller blocks for
              at most 100 seconds.
    '''
    def _method():
        # timeout=0 makes this a non-blocking attempt: if another release
        # thread already holds lock_kill, this one does nothing.
        if not lock_kill.acquire(blocking=True, timeout=0):
            return
        try:
            with lock:
                wait_count = 0
                t0 = time.time()
                while True:
                    if _queue.full():
                        break
                    elif time.time() - t0 > 60:
                        log('等待放回浏览器超时,强制释放所有driver')
                        break
                    else:
                        wait_count += 1
                        log("waitting for drivers all back..."+str(wait_count)+"qsize:"+str(_queue.qsize()))
                        time.sleep(1)
                initQueue()
        finally:
            lock_kill.release()

    t = Thread(target=_method)
    t.start()
    t.join(100)


def killAllDriver():
    '''
    @summary: Kill every webdriver/browser process of the configured TYPE by
              process name, using taskkill on Windows and killall elsewhere.
    '''
    try:
        log("killing all webdrivers")
        if platform.system() == "Windows":
            if TYPE == "phantomjs":
                os.system('taskkill /im phantomjsdriver.exe /F')
                os.system('taskkill /im phantomjs.exe /F')
            else:
                os.system('taskkill /im chromedriver.exe /F')
                os.system('taskkill /im chrome.exe /F')
        else:
            if TYPE == "phantomjs":
                os.system('killall phantomjsdriver')
                os.system('killall phantomjs')
            else:
                os.system('killall chromedriver ')
                os.system('killall chrome')
    except Exception as e:
        error(str(e))


def adddriver(browser):
    '''
    @summary: Put a driver back into the pool. Blocks while the pool is full.
    '''
    _queue.put(browser)


def getdriver():
    '''
    @summary: Take a driver from the pool (blocking until one is available).
              After more than 1000 calls the whole pool is re-initialised
              first and the call counter is reset.
    @return: a webdriver taken from the pool
    '''
    global _get_count
    _get_count += 1
    if _get_count > 1000:
        log("get_driver 达到调用次数,重新进行初始化")
        releaseAllDriver()
        _get_count = 0
    # `with` guarantees the lock is released even if the get is interrupted.
    with lock:
        browser = _queue.get()
    return browser


def initQueue():
    '''
    @summary: (Re)initialise the driver pool: drop every pooled driver, kill
              all webdriver processes, then fill the pool with fresh drivers.
              Called at import time, after a fixed number of getdriver()
              calls, and when a page load hangs.
    '''
    # Drop references to the pooled drivers; their processes are killed below.
    while not _queue.empty():
        _queue.get()
    # Kill the driver processes.
    killAllDriver()
    # Refill the pool.
    while not _queue.full():
        adddriver(getBrowser())


initQueue()


def executeMethod(_method, args):
    '''
    @summary: Run a callable that needs a browser, guaranteeing the driver is
              returned to the pool afterwards.
    @param: _method: callable invoked as _method(args)
            args: dict; the borrowed driver is stored under args["browser"]
    '''
    # Borrow the driver before entering the try block: if borrowing fails
    # there is nothing to give back. (The previous code called
    # getdriver(_queue), which raised TypeError because getdriver() takes no
    # arguments, and the finally clause then hit an unbound `browser`.)
    browser = getdriver()
    try:
        args["browser"] = browser
        _method(args)
    finally:
        adddriver(browser)


def hasDrew(url, list_rule):
    '''
    @summary: Decide from a plain (non-browser) fetch whether any xpath rule
              matches nothing in the raw HTML.
    @param: url: page link
            list_rule: list of dicts with keys "type" and "rule"; only
            entries with type "xpath" and a non-None rule are evaluated
    @return: True when some xpath rule matches no node in the fetched HTML;
             False otherwise, including when the fetch fails or thread_run
             returns None
    '''
    def hasdrew(url, list_rule):
        try:
            r = requests.get(url, headers=header, allow_redirects=False, timeout=10)
            _encoding = r.encoding
            if _encoding is None:
                _encoding = "utf8"
            dom = html.fromstring(r.content.decode(_encoding))
            for item in list_rule:
                if item["type"] == "xpath" and item["rule"] is not None:
                    list_nodes = dom.xpath(item["rule"])
                    if len(list_nodes) == 0:
                        return True
        except Exception as e:
            error(str(e))
        return False

    rs = thread_run(hasdrew, url, list_rule)
    if rs is not None:
        return rs
    return False


def loadPage(browser, url, timeout=30):
    '''
    @summary: Work around selenium's get() never returning: load the page in a
              worker thread and give up after `timeout` seconds.
    @param: browser: webdriver used to load the page
            url: page link
            timeout: seconds to wait for the load thread
    @return: True when the load thread finished within the timeout (even if
             get() raised; the exception is only logged), False when it hung
             and the pool was force-released
    '''
    def _thread_load(browser, url):
        try:
            debug("load "+url)
            browser.get(url)
            debug("load "+url+" done")
        except Exception as e:
            error(str(e))
            log('加载页面抛出异常:'+str(e))
            # Message matched here is the Chinese-locale Windows text for
            # "the target machine actively refused" the connection.
            if re.search("由于目标计算机积极拒绝", str(e)) is not None:
                log('log page exception')
                releaseAllDriver()

    t = Thread(target=_thread_load, args=(browser, url))
    t.start()
    debug("load_thread:"+str(t.ident)+" "+str(t.name))
    t.join(timeout)
    # Thread.isAlive() was removed in Python 3.9; is_alive() is the supported name.
    if t.is_alive():
        # Alternative that was tried: browser.service.process.send_signal(signal.SIGTERM)
        error("driver get方法卡住,强制释放所有资源")
        stop_thread(t)
        log('stop_loadpage thread return false')
        # NOTE(review): the driver is put back here so releaseAllDriver() sees
        # a full pool quickly, but executeMethod() also puts it back in its
        # finally clause - confirm the double put is intended.
        adddriver(browser)
        debug("release driver")
        releaseAllDriver()
        return False
    return True


def getSource(url):
    '''
    @summary: Load url in a fresh browser and return the parsed page with
              every iframe element replaced by the parsed source of that frame.
              Prints the rect of every element on the page as a side effect.
    @param: url: page link
    @return: BeautifulSoup document of the page
    '''
    browser = getBrowser()
    try:
        browser.get(url)
        for ele in browser.find_elements_by_xpath('//*'):
            print(ele.rect)
        soup = BeautifulSoup(browser.page_source, "lxml")
        iframes = soup.find_all("iframe", recursive=True)
        if len(iframes) > 0:
            browser_iframes = browser.find_elements_by_tag_name("iframe")
            for iframe, browser_iframe in zip(iframes, browser_iframes):
                # Always start from the top document before entering a frame.
                browser.switch_to_default_content()
                browser.switch_to_frame(browser_iframe)
                iframe.replace_with(BeautifulSoup(browser.page_source, "lxml"))
        return soup
    finally:
        # This browser is private to the call (not part of the pool), so it
        # must be shut down here or its process leaks.
        browser.quit()


if __name__ == "__main__":
    url = "http://yl.km.gov.cn/tzgg/"
    # source = getSource(url)
    hasDrew(url, [{"rule": '//*[@class="li_c"]', "type": "xpath"}])