123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319 |
- import os
- from selenium import webdriver
- from selenium.webdriver import DesiredCapabilities
- from bs4 import BeautifulSoup
- import platform
- import requests
- from lxml import html
- import re
- import signal
- from threading import Thread, RLock
- from module.Utils import *
- from queue import Queue
- import time
# Default HTTP request headers shared by the plain `requests` calls and by the
# PhantomJS custom-header capabilities built in this module.
header = {
    "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
    # "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.cn%2Fuser%2FsimpleSSOLogin",
    "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "Keep-Alive",
    # Adjacent string literals are joined by the parser. A backslash
    # continuation *inside* the literal would embed the next line's leading
    # whitespace in the header value.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"
    ),
    # "Accept-Encoding": "gzip, deflate",
    # "Origin": "http://uia.hnist.cn",
    "Upgrade-Insecure-Requests": "1",
}
# Which webdriver backend the pool creates by default: "phantomjs" or "chrome".
TYPE = "phantomjs"
# TYPE = "chrome"

# Directory containing this module. The driver binaries are addressed relative
# to it. Splitting __file__ on "\\" only works with Windows separators; on
# POSIX it produced "" and abspath("") silently fell back to the current
# working directory, so use os.path to stay separator-agnostic.
current_path = os.path.dirname(os.path.abspath(__file__))

# Executable location for every supported (backend, platform) pair.
driver_paths = {
    "phantomjs_linux": current_path + "/../driver/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
    "phantomjs_window": current_path + "/../driver/phantomjs/phantomjs-2.1.1-windows/bin/phantomjs.exe",
    "chrome_linux": current_path + "/../driver/chromedriver/chromedriver_linux64/chromedriver",
    "chrome_window": current_path + "/../driver/chromedriver/chromedriver_win32/chromedriver.exe",
}
# Import-time diagnostic kept from the original: shows the resolved paths.
print(driver_paths)
def getBrowser_phantomJS(platform="linux", straight=False):
    '''
    @summary: Build a PhantomJS webdriver configured with this module's
              request headers and timeouts.
    @param platform: "linux" selects the Linux binary; any other value
                     selects the Windows binary.
    @param straight: accepted for signature parity with the other factories;
                     not used by this function.
    @return: the configured PhantomJS webdriver instance.
    '''
    path_key = "phantomjs_linux" if platform == "linux" else "phantomjs_window"
    executable_path = driver_paths[path_key]

    capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # Diagnostic output: shows whether the driver binary is where we expect it.
    print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))

    # Forward every default request header to the pages PhantomJS loads ...
    capabilities.update(
        ('phantomjs.page.customHeaders.{}'.format(name), value)
        for name, value in header.items()
    )
    # ... then override the User-Agent with a newer browser signature.
    capabilities['phantomjs.page.customHeaders.User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
    capabilities["phantomjs.page.settings.loadImages"] = False
    capabilities["phantomjs.page.settings.disk-cache"] = False

    driver = webdriver.PhantomJS(
        executable_path=executable_path,
        desired_capabilities=capabilities,
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'],
    )
    driver.implicitly_wait(10)
    driver.set_script_timeout(20)
    driver.set_page_load_timeout(10)
    return driver
def getBrowser_chrome(platform="linux", straight=False):
    '''
    @summary: Build a headless Chrome webdriver with performance logging
              enabled and 15-second implicit-wait / page-load timeouts.
    @param platform: "linux" selects the Linux chromedriver; any other value
                     selects the Windows chromedriver.
    @param straight: accepted for signature parity with the other factories;
                     not used by this function.
    @return: the configured Chrome webdriver instance.
    '''
    path_key = "chrome_linux" if platform == "linux" else "chrome_window"
    executable_path = driver_paths[path_key]

    options = webdriver.ChromeOptions()
    # Presumably disables image loading (mirrors loadImages=False on the
    # PhantomJS side) -- confirm against the Chrome preference reference.
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    for flag in (
        '--headless',
        '--no-sandbox',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36',
    ):
        options.add_argument(flag)

    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['loggingPrefs'] = {'performance': 'ALL'}

    driver = webdriver.Chrome(
        desired_capabilities=capabilities,
        executable_path=executable_path,
        chrome_options=options,
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'],
    )
    driver.implicitly_wait(15)
    driver.set_page_load_timeout(15)
    return driver
def getBrowser(type=TYPE, straight=False):
    '''
    @summary: Create a maximized webdriver for the current operating system.
    @param type: "phantomjs" builds a PhantomJS driver; any other value
                 builds a Chrome driver. Defaults to the module-level TYPE.
    @param straight: forwarded unchanged to the backend factory.
    @return: the new webdriver instance.
    '''
    factory = getBrowser_phantomJS if type == "phantomjs" else getBrowser_chrome
    if platform.system() == "Windows":
        _browser = factory(platform="window", straight=straight)
    else:
        # Non-Windows hosts rely on the factories' default platform ("linux").
        _browser = factory(straight=straight)
    _browser.maximize_window()
    return _browser
def getStatus(url):
    '''
    @summary: Fetch url once (no redirects, 15 s timeout) and report the
              HTTP status code.
    @param url: address to request.
    @return: the response status code, or 404 when the request raised.
    '''
    try:
        response = requests.get(url, headers=header, allow_redirects=False, timeout=15)
    except Exception as exc:
        # Any failure (DNS, timeout, connection error, ...) is reported as 404.
        log('requests.get error :%s'%exc)
        return 404
    else:
        return response.status_code
# Pool of ready-to-use webdrivers; bounded to 4 entries.
_queue = Queue(maxsize=4)
# Held while a driver is being taken out of the pool.
lock = RLock()
# Counts getdriver() calls since the pool was last re-initialized.
_get_count = 0
# Guards the kill/re-init routine: if one kill thread is already running,
# other threads do not start another one.
lock_kill = RLock()
def releaseAllDriver():
    '''
    @summary: Release every driver and rebuild the pool on a helper thread.
              The caller waits at most 100 seconds for that thread.
    '''
    def _reset_pool():
        # Zero-timeout acquire: if another reset already holds lock_kill,
        # this call does nothing.
        if not lock_kill.acquire(blocking=True, timeout=0):
            return
        try:
            # Stop getdriver() from handing out drivers while we reset.
            lock.acquire()
            attempts = 0
            started = time.time()
            # Give borrowers up to 60 seconds to put their drivers back.
            while not _queue.full():
                if time.time() - started > 60:
                    log('等待放回浏览器超时,强制释放所有driver')
                    break
                attempts += 1
                log("waitting for drivers all back..." + str(attempts) + "qsize:" + str(_queue.qsize()))
                time.sleep(1)
            initQueue()
        finally:
            lock.release()
            lock_kill.release()

    worker = Thread(target=_reset_pool)
    worker.start()
    worker.join(100)
-
-
def killAllDriver():
    '''
    @summary: Kill every webdriver/browser process of the configured TYPE,
              using taskkill on Windows and killall elsewhere. Errors are
              logged, never raised.
    '''
    # Shell commands keyed by (running on Windows?, backend is PhantomJS?).
    commands_by_target = {
        (True, True): (
            'taskkill /im phantomjsdriver.exe /F',
            'taskkill /im phantomjs.exe /F',
        ),
        (True, False): (
            'taskkill /im chromedriver.exe /F',
            'taskkill /im chrome.exe /F',
        ),
        (False, True): (
            'killall phantomjsdriver',
            'killall phantomjs',
        ),
        (False, False): (
            'killall chromedriver ',
            'killall chrome',
        ),
    }
    try:
        log("killing all webdrivers")
        target = (platform.system() == "Windows", TYPE == "phantomjs")
        for command in commands_by_target[target]:
            os.system(command)
    except Exception as exc:
        error(str(exc))
def adddriver(browser):
    '''
    @summary: Put a driver (back) into the shared pool. Uses the queue's
              default blocking put, so it waits while the pool is full.
    @param browser: the webdriver instance to store.
    '''
    _queue.put(browser)
-
def getdriver():
    '''
    @summary: Take a driver out of the pool, blocking until one is available.
              After more than 1000 calls the whole pool is released and
              re-initialized first, and the call counter is reset.
    @return: a webdriver instance taken from the pool.
    '''
    global _get_count
    _get_count += 1
    if _get_count > 1000:
        log("get_driver 达到调用次数,重新进行初始化")
        releaseAllDriver()
        _get_count = 0
    # `with` guarantees the lock is released even if the blocking get() is
    # interrupted; a bare acquire()/release() pair would leave it held and
    # stall every other thread.
    with lock:
        return _queue.get()
def initQueue():
    '''
    @summary: (Re)build the webdriver pool. Called after a set number of
              getdriver() calls or when a driver hangs: empties the queue,
              kills all webdriver processes, then refills the queue with
              fresh browsers.
    '''
    # Drop our references to the old drivers.
    while not _queue.empty():
        _queue.get()
    # Kill the underlying processes.
    killAllDriver()
    # Refill the pool up to its capacity.
    while not _queue.full():
        adddriver(getBrowser())


# Build the initial pool as soon as this module is imported.
initQueue()
-
def executeMethod(_method, args):
    '''
    @summary: Run a callable that needs a browser, guaranteeing the driver
              is put back into the pool afterwards.
    @param _method: callable invoked as _method(args).
    @param args: dict passed to _method; its "browser" key is set to the
                 borrowed driver before the call (the dict is mutated).
    '''
    # getdriver() takes no arguments; the previous getdriver(_queue) call
    # raised TypeError. Acquire outside the try so `finally` never sees an
    # unbound `browser` when acquisition itself fails.
    browser = getdriver()
    try:
        args["browser"] = browser
        _method(args)
    finally:
        adddriver(browser)
def hasDrew(url, list_rule):
    '''
    @summary: Decide from the rules whether the page has to be rendered.
              The raw HTML is fetched without a browser; the answer is True
              as soon as one xpath rule matches no node in it.
    @param url: page link.
    @param list_rule: list of rule dicts with keys "type" and "rule"; only
                      entries whose type is "xpath" and whose rule is not
                      None are evaluated.
    @return: the result produced through thread_run, or False when that
             result is None.
    '''
    def hasdrew(url, list_rule):
        try:
            response = requests.get(url, headers=header, allow_redirects=False, timeout=10)
            charset = response.encoding
            if charset is None:
                charset = "utf8"
            dom = html.fromstring(response.content.decode(charset))
            for rule in list_rule:
                if rule["type"] != "xpath":
                    continue
                if rule["rule"] is None:
                    continue
                matches = dom.xpath(rule["rule"])
                if len(matches) == 0:
                    return True
        except Exception as exc:
            error(str(exc))
        return False

    outcome = thread_run(hasdrew, url, list_rule)
    return False if outcome is None else outcome
def loadPage(browser, url, timeout=30):
    '''
    @summary: Work around selenium's get() sometimes never returning: the
              page is loaded on a worker thread that is given a deadline.
    @param browser: webdriver used to load the page.
    @param url: address to load.
    @param timeout: seconds to wait for the worker thread before giving up.
    @return: False when the load was still running after timeout (the
             thread is stopped, the driver is put back into the pool and
             all drivers are released); True otherwise -- including when
             get() raised, since that exception is only logged.
    '''
    def _thread_load(browser, url):
        try:
            debug("load "+url)
            browser.get(url)
            debug("load "+url+" done")
        except Exception as e:
            error(str(e))
            log('加载页面抛出异常:'+str(e))
            # This text looks like the Chinese-locale "connection actively
            # refused" message; when it appears the driver pool is rebuilt.
            if re.search("由于目标计算机积极拒绝", str(e)) is not None:
                log('log page exception')
                releaseAllDriver()

    loader = Thread(target=_thread_load, args=(browser, url))
    loader.start()
    debug("load_thread:"+str(loader.ident)+" "+str(loader.name))
    loader.join(timeout)
    # Thread.isAlive() was removed in Python 3.9; is_alive() is the
    # supported spelling on every Python 3 release.
    if loader.is_alive():
        # The load is stuck: stop the worker and free every driver.
        error("driver get方法卡住,强制释放所有资源")
        stop_thread(loader)
        log('stop_loadpage thread return false')
        adddriver(browser)
        debug("release driver")
        releaseAllDriver()
        return False
    return True
-
-
def getSource(url):
    '''
    @summary: Load url in a fresh browser and return the rendered page as a
              BeautifulSoup tree in which every <iframe> tag is replaced by
              the parsed source of that frame.
    @param url: address of the page to render.
    @return: BeautifulSoup document (parsed with "lxml").
    '''
    browser = getBrowser()
    try:
        browser.get(url)

        # Debug output kept from the original: print every element's rect.
        for element in browser.find_elements_by_xpath('//*'):
            print(element.rect)

        soup = BeautifulSoup(browser.page_source, "lxml")
        iframes = soup.find_all("iframe", recursive=True)
        # Only query the live DOM when the static parse found frames.
        if len(iframes) > 0:
            browser_iframes = browser.find_elements_by_tag_name("iframe")
            # Pair parsed tags with live frame elements by position.
            for iframe, browser_iframe in zip(iframes, browser_iframes):
                browser.switch_to_default_content()
                browser.switch_to_frame(browser_iframe)
                iframe.replace_with(BeautifulSoup(browser.page_source, "lxml"))
        return soup
    finally:
        # This browser is created here and never pooled, so it must be shut
        # down here; otherwise each call leaks a browser process.
        browser.quit()
-
-
if __name__ == "__main__":
    # Manual smoke test: check one listing page against a single xpath rule.
    url = "http://yl.km.gov.cn/tzgg/"
    rules = [{"rule": '//*[@class="li_c"]', "type": "xpath"}]
    # source = getSource(url)
    hasDrew(url, rules)
|