# luojiehua / ContentExtract
							import os
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from bs4 import BeautifulSoup
import platform
import requests
from lxml import html
import re
import signal
from threading import Thread, RLock
from module.Utils import *
from queue import Queue
import time

# Default HTTP request headers. They are passed to the plain `requests.get`
# calls in this module and are also copied into PhantomJS as custom page
# headers by getBrowser_phantomJS().
header={
    "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
    # "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
    #             cn%2Fuser%2FsimpleSSOLogin",
    "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "Keep-Alive",
    # NOTE: the backslash continuation below sits inside the string literal,
    # so the leading spaces of the following line are part of the value.
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
     AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
    #"Accept-Encoding": "gzip, deflate",
    # "Origin": "http://uia.hnist.cn",
    "Upgrade-Insecure-Requests": "1",
    }  

# Default webdriver backend used by getBrowser(): "phantomjs" or "chrome".
TYPE = "phantomjs"
# TYPE = "chrome"

# Directory that contains this module. Splitting __file__ on "\\" only worked
# for Windows-style paths; on POSIX it produced an empty string and abspath()
# then resolved to the current working directory instead.
current_path = os.path.dirname(os.path.abspath(__file__))

# Locations of the bundled driver executables, keyed by "<driver>_<platform>".
driver_paths = {
    "phantomjs_linux": current_path + "/../driver/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
    "phantomjs_window": current_path + "/../driver/phantomjs/phantomjs-2.1.1-windows/bin/phantomjs.exe",
    "chrome_linux": current_path + "/../driver/chromedriver/chromedriver_linux64/chromedriver",
    "chrome_window": current_path + "/../driver/chromedriver/chromedriver_win32/chromedriver.exe",
}

print(driver_paths)


def getBrowser_phantomJS(platform="linux",straight=False):
    '''
    @summary: Build a PhantomJS webdriver configured with this module's headers
    @param: platform: "linux" selects the Linux binary, any other value the
            Windows one; straight: accepted but not used by this function
    @return: a PhantomJS webdriver with image loading and disk cache disabled
             and its implicit-wait / script / page-load timeouts set
    '''
    binary_key = "phantomjs_linux" if platform == "linux" else "phantomjs_window"
    binary = driver_paths[binary_key]

    caps = DesiredCapabilities.PHANTOMJS.copy()
    # Mirror every default request header as a PhantomJS custom page header.
    caps.update(
        ('phantomjs.page.customHeaders.{}'.format(name), content)
        for name, content in header.items()
    )
    # The User-Agent copied from `header` is then replaced by a newer one.
    caps['phantomjs.page.customHeaders.User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
    caps["phantomjs.page.settings.loadImages"] = False
    caps["phantomjs.page.settings.disk-cache"] = False

    driver = webdriver.PhantomJS(
        executable_path=binary,
        desired_capabilities=caps,
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'],
    )
    driver.implicitly_wait(10)
    driver.set_script_timeout(20)
    driver.set_page_load_timeout(10)
    return driver

def getBrowser_chrome(platform="linux",straight=False):
    '''
    @summary: Build a headless Chrome webdriver with image loading disabled
    @param: platform: "linux" selects the Linux chromedriver, any other value
            the Windows one; straight: accepted but not used by this function
    @return: a Chrome webdriver with performance logging requested and its
             implicit-wait / page-load timeouts set
    '''
    binary_key = "chrome_linux" if platform == "linux" else "chrome_window"
    binary = driver_paths[binary_key]

    options = webdriver.ChromeOptions()
    # Content setting value 2 is used here to stop images from loading.
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    for flag in (
        '--headless',
        '--no-sandbox',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36',
    ):
        options.add_argument(flag)

    caps = DesiredCapabilities.CHROME.copy()
    caps['loggingPrefs'] = {'performance': 'ALL'}

    driver = webdriver.Chrome(
        desired_capabilities=caps,
        executable_path=binary,
        chrome_options=options,
        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'],
    )
    driver.implicitly_wait(15)
    driver.set_page_load_timeout(15)
    return driver

def getBrowser(type=TYPE,straight=False):
    '''
    @summary: Create a webdriver for the current operating system and maximise its window
    @param: type: "phantomjs" selects PhantomJS, any other value Chrome;
            straight: forwarded unchanged to the chosen factory
    @return: the newly created webdriver
    '''
    factory = getBrowser_phantomJS if type == "phantomjs" else getBrowser_chrome
    if platform.system() == "Windows":
        driver = factory(platform="window", straight=straight)
    else:
        # The factories default to the Linux binaries.
        driver = factory(straight=straight)
    driver.maximize_window()
    return driver

def getStatus(url):
    '''
    @summary: Return the HTTP status code of url, without following redirects
    @return: the response status code, or 404 when the request itself raises
    '''
    try:
        response = requests.get(url, headers=header, allow_redirects=False, timeout=15)
    except Exception as exc:
        # Any request failure is logged and reported as "not found".
        log('requests.get error :%s' % exc)
        return 404
    else:
        return response.status_code

# Pool of ready-to-use webdriver instances; the queue holds at most 4.
_queue = Queue(4)
lock = RLock()# held while taking a driver out of the pool
_get_count = 0# counts getdriver() calls
lock_kill = RLock()# guards the kill/reset; if one reset thread is already running, other threads do not start one
def releaseAllDriver():
    '''
    @summary: Release all drivers and re-initialise the pool on a helper thread;
              the caller waits at most 100 seconds for that thread
    '''
    def _reset_pool():
        # timeout=0: do not wait. If another reset holds lock_kill, skip this one.
        if not lock_kill.acquire(blocking=True, timeout=0):
            return
        try:
            # Holding `lock` keeps getdriver() from handing out drivers meanwhile.
            lock.acquire()
            polls = 0
            started = time.time()
            # Give borrowed drivers up to 60 seconds to come back to the pool.
            while not _queue.full():
                if time.time() - started > 60:
                    log('等待放回浏览器超时，强制释放所有driver')
                    break
                polls += 1
                log("waitting for drivers all back..." + str(polls) + "qsize:" + str(_queue.qsize()))
                time.sleep(1)
            initQueue()
        finally:
            lock.release()
            lock_kill.release()

    worker = Thread(target=_reset_pool)
    worker.start()
    worker.join(100)
            
    
def killAllDriver():
    '''
    @summary: Kill every webdriver process of the configured TYPE via OS commands;
              any exception is reported through error() instead of being raised
    '''
    try:
        log("killing all webdrivers")
        on_windows = platform.system() == "Windows"
        use_phantomjs = TYPE == "phantomjs"
        # (on_windows, use_phantomjs) -> shell commands, run in the listed order.
        commands_by_target = {
            (True, True): ('taskkill /im phantomjsdriver.exe /F',
                           'taskkill /im phantomjs.exe /F'),
            (True, False): ('taskkill /im chromedriver.exe /F',
                            'taskkill /im chrome.exe /F'),
            (False, True): ('killall phantomjsdriver',
                            'killall phantomjs'),
            (False, False): ('killall chromedriver ',
                             'killall chrome'),
        }
        for command in commands_by_target[(on_windows, use_phantomjs)]:
            os.system(command)
    except Exception as exc:
        error(str(exc))

def adddriver(browser):
    '''
    @summary: Put a driver into the pool queue; put() blocks while the queue is full
    @param: browser: the webdriver to add (or give back) to the pool
    '''
    _queue.put(browser)
    
def getdriver():
    '''
    @summary: Take a driver from the pool; the pool lock is held while waiting.
              Once the call counter exceeds 1000 the pool is re-initialised
              first and the counter is reset.
    @return: a webdriver taken from the queue (blocks until one is available)
    '''
    global _get_count
    _get_count += 1
    if _get_count>1000:
        log("get_driver 达到调用次数，重新进行初始化")
        releaseAllDriver()
        _get_count = 0
    # `with` guarantees the lock is released even if the blocking get() is
    # interrupted by an exception; a bare acquire()/release() pair would leak it.
    with lock:
        browser = _queue.get()
    return browser

def initQueue():
    '''
    @summary: (Re)initialise the webdriver queue. Meant to run after a certain
              number of calls or when a driver hangs: empty the queue, kill all
              webdriver processes, then fill the queue with new browsers.
    '''
    # Drop the references to whatever drivers are still queued.
    while not _queue.empty():
        _queue.get()
    # Kill the underlying driver processes.
    killAllDriver()
    # Refill until the queue reaches its capacity.
    while not _queue.full():
        adddriver(getBrowser())
        
# Build the driver pool at import time: this call runs whenever the module is loaded.
initQueue()
    

def executeMethod(_method,args):
    '''
    @summary: Run a callable that needs a browser, making sure the borrowed
              driver is put back into the queue afterwards
    @param: _method: callable invoked as _method(args);
            args: dict; its "browser" key is set to the borrowed driver
    '''
    # getdriver() takes no parameters. Borrow before entering the try block so
    # the finally clause never runs with `browser` unbound.
    browser = getdriver()
    try:
        args["browser"] = browser
        _method(args)
    finally:
        adddriver(browser)

def hasDrew(url,list_rule):
    '''
    @summary: Fetch url with a plain HTTP request (no browser) and test the
              xpath rules against the raw HTML
    @param: url: page URL; list_rule: list of dicts with "type" and "rule" keys
    @return: True as soon as one xpath rule matches no node in the raw HTML
             (presumably meaning the content needs browser rendering - confirm
             with callers); False otherwise, including when the check raises
             or thread_run yields None
    '''
    def hasdrew(url,list_rule):
        try:
            r = requests.get(url, headers=header, allow_redirects = False, timeout=10)
            _encoding = r.encoding
            if _encoding is None:
                _encoding = "utf8"
            dom = html.fromstring(r.content.decode(_encoding))
            for item in list_rule:
                # Only xpath rules that actually carry an expression are checked.
                if item["type"] != "xpath" or item["rule"] is None:
                    continue
                if len(dom.xpath(item["rule"])) == 0:
                    return True
        except Exception as e:
            error(str(e))
        return False
    # thread_run comes from module.Utils; it is assumed to return the result of
    # hasdrew, or None when no result is available - TODO confirm.
    rs = thread_run(hasdrew, url,list_rule)
    return rs if rs is not None else False

def loadPage(browser,url,timeout=30):
    '''
    @summary: Work around selenium page loads that never return: the page is
              loaded on a worker thread and the wait is bounded by a timeout
    @param: browser: webdriver used for the load; url: page to load;
            timeout: seconds to wait for the worker thread
    @return: True when the worker thread finished within the timeout (even if
             browser.get raised - that exception is only logged); False when
             the thread was still running and had to be stopped
    '''
    def _thread_load(browser,url):
        try:
            debug("load "+url)
            browser.get(url)
            debug("load "+url+" done")
        except Exception as e:
            error(str(e))
            log('加载页面抛出异常：'+str(e))
            # The pattern is Chinese for "because the target machine actively
            # refused" (a connection-refused message); reset the whole pool.
            if re.search("由于目标计算机积极拒绝",str(e)) is not None:
                log('log page exception')
                releaseAllDriver()

    t = Thread(target=_thread_load,args=(browser,url))
    t.start()
    debug("load_thread:"+str(t.ident)+" "+str(t.name))
    t.join(timeout)
    # Thread.isAlive() was removed in Python 3.9; is_alive() is the supported name.
    if t.is_alive():
        # The load is stuck: stop the worker, hand the driver back and reset the pool.
        error("driver get方法卡住，强制释放所有资源")
        stop_thread(t)
        log('stop_loadpage thread return false')
        adddriver(browser)
        debug("release driver")
        releaseAllDriver()
        return False
    return True
    
    
def getSource(url):
    '''
    @summary: Load url in a fresh browser and return the page as BeautifulSoup,
              with each iframe tag replaced by that frame's parsed page source
    @param: url: page to load
    @return: BeautifulSoup document of the page (parsed with "lxml")
    '''
    browser = getBrowser()
    try:
        browser.get(url)

        # Prints the rectangle of every element on the page.
        for ele in browser.find_elements_by_xpath('//*'):
            print(ele.rect)

        soup = BeautifulSoup(browser.page_source,"lxml")
        iframes = soup.find_all("iframe",recursive=True)
        if len(iframes)>0:
            browser_iframes = browser.find_elements_by_tag_name("iframe")
            # Pair the parsed iframe tags with the live iframe elements in order.
            for iframe,browser_iframe in zip(iframes,browser_iframes):
                browser.switch_to_default_content()
                browser.switch_to_frame(browser_iframe)
                iframe.replace_with(BeautifulSoup(browser.page_source,"lxml"))
        return soup
    finally:
        # This browser is created for this call only and never enters the pool,
        # so it must be shut down here or its driver process is leaked.
        browser.quit()
    
    
if __name__ == "__main__":
    # Manual check: run hasDrew against a sample page with a single xpath rule.
    url = "http://yl.km.gov.cn/tzgg/"
    #source = getSource(url)
    sample_rules = [{"rule": '//*[@class="li_c"]', "type": "xpath"}]
    hasDrew(url, sample_rules)