
Fix known bugs and improve extraction quality. Of 199 source sites in total, 122 (61.3%) are extracted completely; 31 (15.5%) are partially successful (extracted without page turning, or with dynamic page turning).

lsm committed 3 years ago (commit 2949be0410)

BIN
driver/chromedriver/chromedriver_win32/chromedriver.exe


+ 118 - 5
module/Utils.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 '''
 Created on 2018年12月20日
 
@@ -20,10 +21,70 @@ import logging
 import pickle
 import tensorflow as tf
 from keras import losses
+import threading
 
 __author__ = 'baniu.yao'
 
+class MyThread(threading.Thread):
+    def __init__(self, func, args=()):
+        super(MyThread, self).__init__()
+        self.func = func
+        self.args = args
 
+    def run(self):
+        self.result = self.func(*self.args)
+
+    def get_result(self):
+        try:
+            return self.result
+        except Exception as e:
+            print('执行js抛出异常:', e)
+            return None
+
+def get_js_rs(browser, script, *arg, timeout=20):
+    '''
+    Execute a script in the browser and return its result, aborting the call on timeout.
+    :param browser: browser (webdriver) object
+    :param script: script to execute
+    :param arg: script arguments
+    :param timeout: timeout in seconds
+    :return:
+    '''
+    def execute_js():
+        data = browser.execute_script(script, *arg)
+        return data
+    t = MyThread(func=execute_js, args=())
+    t.setDaemon(True)
+    t.start()
+    t.join(timeout)
+    if t.isAlive():
+        print('执行js超时')
+        stop_thread(t)
+        return None
+    data = t.get_result()
+    return data
+
+import time
+def thread_run(func, *arg, timeout=30):
+    t = MyThread(func=func, args=(*arg,))
+    t.setDaemon(True)
+    t.start()
+    t.join(timeout)
+    if t.isAlive():
+        print('thread_run time out')
+    result = t.get_result()
+    return result
+
+def xpath2css(xpath):
+    '''
+    Convert an xpath expression into a css selector path.
+    :param xpath:
+    :return:
+    '''
+    xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
+    for it in re.finditer('\[(\d)\]', xpath):
+        xpath = xpath.replace(it.group(0), ':nth-child(%s)'%it.group(1))
+    return xpath
 
 def get_class_from_frame(fr):
     args, _, _, value_dict = inspect.getargvalues(fr)
@@ -520,6 +581,56 @@ def print_metrics(history):
     plt.show()
 
 scripts_common = '''
+document.getElementsByClassName = function (Name,e,tag) {
+            var ele = [],
+                allEle,
+                length,
+                i = 0;
+ 
+            if (typeof tag === "undefined" ){
+                tag = "*"
+            }
+ 
+            if (typeof e === "undefined"){
+                e = document;
+            }
+ 
+            allEle = e.getElementsByTagName(tag);
+ 
+            for (length = allEle.length;i < length;i = i + 1){
+                if (allEle[i].className === Name) {
+                    ele.push(allEle[i]);
+                }
+            }
+ 
+            return ele;
+        }
+
+document.countElementById = function (id,e,tag) {
+            var ele = [],
+                allEle,
+                length,
+                i = 0;
+ 
+            if (typeof tag === "undefined" ){
+                tag = "*"
+            }
+ 
+            if (typeof e === "undefined"){
+                e = document;
+            }
+ 
+            allEle = e.getElementsByTagName(tag);
+ 
+            for (length = allEle.length;i < length;i = i + 1){
+                if (allEle[i].id === id) {
+                    ele.push(allEle[i]);
+                }
+            }
+ 
+            return ele;
+        }
+
 /*js集合set类的实现*/
 function Set() {
     this.dataStore = [];
@@ -664,7 +775,7 @@ function getRemoveList(node,recurse,list_remove){
 }
 
 function getListXpath(el,list_xpath,getRemove){
-    if (el==document.body){
+    if (el==document || el==document.body){
         return list_xpath;
     }
     if(getRemove){
@@ -678,7 +789,7 @@ function getListXpath(el,list_xpath,getRemove){
     return getListXpath(el.parentNode,list_xpath,getRemove);
 }
 function getXpath(el,b,notfirst){
-    if (el.id !=""){
+    if (el.id !="" && document.countElementById(el.id).length==1){
         var _jump_flag = false;
         if(b!=null){
             for(var i=0;i<b.length;i++){
@@ -691,14 +802,16 @@ function getXpath(el,b,notfirst){
             _jump_flag = true;
         }
         if(!_jump_flag){
-            return '//*[@id=\"'+el.id+'\"]';
+            //return '//*[@id=\"'+el.id+'\"]';
+            return '//'+el.tagName.toLowerCase()+'[@id=\"'+el.id+'\"]';
         }
         
     }
     
     if (el.getAttribute("class")!=null && document.getElementsByClassName(el.getAttribute("class")).length==1){
         if(!notfirst){
-            return '//*[@class=\"'+el.getAttribute("class")+'\"]';
+            //return '//*[@class=\"'+el.getAttribute("class")+'\"]';
+            return '//'+el.tagName.toLowerCase()+'[@class=\"'+el.getAttribute("class")+'\"]';
         }
         
     }
@@ -823,7 +936,7 @@ function clustering(list_hitTag){
 
 function clustering_turnPage(){
     //var pattern_page = /((?<nextPage>下一?页|>>|>)|(?<lastPage>上一?页|<<|<)|(?<firstPage>首页|第一页)|(?<tailPage>尾页)|(?<other>\.{1,2}|共\d[条页]|\d+\/\d+))/ //phantomjs不支持命名分组
-    var pattern_page = /^\s*.?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
+    var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
     var pattern_nextPage = /[Nn]ext/
     var list_hitTag = new Array();
     

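A minimal usage sketch of the new helpers added to module/Utils.py above (not part of the commit). slow_parse, the sleep durations and the timeout values are invented for illustration; the import path assumes the project layout shown in this diff.

import time
from module.Utils import thread_run, xpath2css

def slow_parse(n):
    time.sleep(n)
    return n * 2

# finishes well inside the timeout
print(thread_run(slow_parse, 1, timeout=5))    # -> 2
# join() times out, 'thread_run time out' is logged and None comes back;
# note these helpers still call Thread.isAlive(), which was removed in Python 3.9
print(thread_run(slow_parse, 10, timeout=2))   # -> None

# xpath2css only rewrites single-digit indices; an index like [12] is left untouched
print(xpath2css('//div[2]/ul/li[3]/a'))        # -> div:nth-child(2)>ul>li:nth-child(3)>a
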
+ 17 - 14
module/detail/content/featureEngine.py

@@ -54,12 +54,12 @@ function statistic(node,deepth){
                 node.counts_communicateTags += 1;
             }
         }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);    
     }
     var innertext = node.innerText;
     if(innertext){
@@ -133,7 +133,7 @@ function stastic_time(node,_array){
         }
     }
 
-    if (!_find_flag){
+    if (!_find_flag && node!=document){
         _array_fontSize = new Array();
         getListFontSize(node,_array_fontSize);
         _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min(_array_fontSize)]);
@@ -334,7 +334,8 @@ def encodeInput_byJS(url,targethtml):
             browser.maximize_window()
             start = time.time()
             
-            data = browser.execute_script(scripts_common+scripts)
+            # data = browser.execute_script(scripts_common+scripts)
+            data = get_js_rs(browser, scripts_common+scripts)
             input_x,list_inner = dealWithScriptOut(data)
             list_label = []
             for item in list_inner:
@@ -352,7 +353,7 @@ def encodeInput_byJS(url,targethtml):
     args = {"url":url,"targethtml":targethtml}
     hd.executeMethod(_method, args)
     
-def getInput_byJS(url):
+def getInput_byJS(browser, url):
     def label(innerhtml,target_source):
         target_source =re.sub("[\r\n\s]","",str(target_source))
         pattern = ">(.*)<"
@@ -365,12 +366,14 @@ def getInput_byJS(url):
             return 1
         return 0
     try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser, url)
-        browser.maximize_window()
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser, url)
+        # browser.maximize_window()
         
-        data,data_time = browser.execute_script(scripts_common+scripts)
+        # data,data_time = browser.execute_script(scripts_common+scripts)
+        data,data_time = get_js_rs(browser, scripts_common+scripts)
+        log('获取正文、时间脚本执行完毕')
         input_x,list_inner,list_xpath = dealWithScriptOut(data)
         if input_x is not None:
             #return [np.expand_dims(np.transpose(pad_sequences(np.transpose(input_x,(1,0)), 155,padding="post", truncating="post", value=0,dtype="float32"),(1,0)),0)],list_inner
@@ -383,9 +386,9 @@ def getInput_byJS(url):
         if re.search("frame",str(e)) is not None:
             err_msg = "#iframe#"
         return None,err_msg
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # finally:
+    #     hd.adddriver(browser)
+    #     debug("release driver")
 
 
 

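getInput_byJS now expects the caller to own the driver instead of acquiring and releasing one itself. A rough sketch of the new calling convention (not part of the commit): the extract_one wrapper and the import paths are assumptions, while hd.getdriver/hd.loadPage/hd.adddriver and featureEngine_content.getInput_byJS follow module/detail/extractor.py below.

import module.htmlDrawing as hd
from module.detail.content import featureEngine as featureEngine_content

def extract_one(url):
    browser = hd.getdriver()                      # the caller owns the driver now
    try:
        if not hd.loadPage(browser, url):         # loadPage returns False on timeout instead of raising
            return None, "#page load timed out#"
        return featureEngine_content.getInput_byJS(browser, url)
    finally:
        hd.adddriver(browser)                     # and the caller puts it back into the pool
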
+ 54 - 27
module/detail/extractor.py

@@ -87,11 +87,31 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
             continue
         list_legal_time = []
         _flag = -2
-        flag,data = featureEngine_content.getInput_byJS(_url)
+        browser = hd.getdriver()
+        debug("get driver")
+        loadsuccess = hd.loadPage(browser, _url)
+        if not loadsuccess:
+            browser = hd.getdriver()
+        # browser.maximize_window()
+        flag,data = featureEngine_content.getInput_byJS(browser,_url)
         hasGotten = True
         if flag:
-            x,_,list_xpath,data_time = data
+            x,inner_html,list_xpath,data_time = data
             _index = detailContentPredictor.predict(x)
+
+            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
+            total_annex = len(re.findall(pt, browser.page_source))
+            extract_annex = len(re.findall(pt, inner_html[_index]))
+            if total_annex > extract_annex and _index>5 and len(list_xpath[_index])>0:
+                extract_xpath = list_xpath[_index][0][0]
+                for i in range(_index-1, _index-5, -1):
+                    if len(re.findall(pt, inner_html[i]))== total_annex:
+                        log('规格调整模型正文提取附件不完整')
+                        _index = i
+                        break
+                    elif len(list_xpath[i])>0 and list_xpath[i][0][0] not in extract_xpath:
+                        break
+
             _xpath = list_xpath[_index]
             _xpath.reverse()
             list_xpath_remove_content.append(_xpath)
@@ -102,10 +122,12 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
         else:
             hasGotten = False
             add_err_msg(dict_rule_detail, data)
-             
-        flag,data_title = featureEngine_title.getInput_byJS(_url)
+        flag,data_title = featureEngine_title.getInput_byJS(browser,_url)
+        hd.adddriver(browser)
+        debug("release driver")
         if flag:
             x,_,list_xpath,list_top = data_title
+            log('详情标题获取成功')
             _index = detailTitlePredictor.predict(x)
             _xpath = list_xpath[_index]
             _xpath.reverse()
@@ -130,7 +152,7 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
             _xpath.append(_xpath_remove[0])
         list_xpaths_content.append(_xpath)
     dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
-    
+
     set_remove_list = None
     for item in list_xpath_remove_content:
         for _xpath_remove in item:
@@ -139,31 +161,36 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
                     set_remove_list = set(_xpath_remove[1])
                 else:
                     set_remove_list = set(_xpath_remove[1])&set_remove_list
-    dict_rule_detail["detail_removeList"] = list(set_remove_list)
-    
+    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list!=None else []
     dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
     dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)
     
-    try:
-        browser = hd.getdriver()
-        debug("get driver")
-        if len(list_hrefs)>0:
-            hd.loadPage(browser, list_hrefs[-1],)
-            dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
-                                                                                                    {"type":"xpath","rule":dict_rule_detail["detail_date"]},
-                                                                                                    {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
-        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
-            log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
-            dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
-            log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
-        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
-            dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
-        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
-            dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")  
-        
+    # try:
+    browser = hd.getdriver()
+    debug("get driver")
+    if len(list_hrefs)>0:
+        loadsuccess = hd.loadPage(browser, list_hrefs[-1],)
+        log('loadPage: ')
+        if loadsuccess==False:
+            browser = hd.getdriver()
+        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
+                                                                                            {"type":"xpath","rule":dict_rule_detail["detail_date"]},
+                                                                                            {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
+    if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
+        log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
+        # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
+        dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_content"])
+        log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
+    if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
+        # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
+        dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_date"])
+    if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
+        # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
+        dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_title"])
+    # finally:
+    hd.adddriver(browser)
+    debug("release driver")
+
     if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
         dict_rule_detail["flag"] = True
     else:

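A standalone illustration of the attachment-completeness check added to getRule_detail above. The regex pt is copied from the diff; the HTML snippets are invented.

import re

pt = r'<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'

page_source = '<a href="a.pdf">附件1</a><div id="c"><a href="b.docx">附件2</a></div>'
candidate   = '<div id="c"><a href="b.docx">附件2</a></div>'   # block picked by the model, one attachment short

total_annex   = len(re.findall(pt, page_source))   # 2
extract_annex = len(re.findall(pt, candidate))     # 1
if total_annex > extract_annex:
    # getRule_detail walks back through nearby candidate blocks looking for one
    # that still contains every attachment before accepting the xpath
    print('adjust _index to a block that contains all attachments')
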
+ 14 - 12
module/detail/title/featureEngine.py

@@ -48,12 +48,12 @@ function statistic(node,deepth){
                 node.counts_communicateTags += 1;
             }
         }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);                
     }
     var innertext = node.innerText;
     if(innertext){
@@ -223,13 +223,14 @@ def dealWithScriptOut(data,sort_index=3):
     else:
         return None
 
-def getInput_byJS(url):
+def getInput_byJS(browser,url):
     try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser, url)
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser, url)
     
-        data = browser.execute_script(scripts_common+scripts_title)
+        # data = browser.execute_script(scripts_common+scripts_title)
+        data = get_js_rs(browser, scripts_common+scripts_title)
         deal_data = dealWithScriptOut(data)
         if deal_data is None:
             return False,""
@@ -242,9 +243,9 @@ def getInput_byJS(url):
         if re.search("frame",str(e)) is not None:
             err_msg = "#iframe#"
         return None,err_msg
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # finally:
+        # hd.adddriver(browser)
+        # debug("release driver")
 
 def encodeInput_byJS(url,targethtml):
     def label(innerhtml,target_source):
@@ -267,7 +268,8 @@ def encodeInput_byJS(url,targethtml):
         browser.maximize_window()
         start = time.time()
         
-        data = browser.execute_script(scripts_common+scripts_title)
+        # data = browser.execute_script(scripts_common+scripts_title)
+        data = get_js_rs(browser, scripts_common+scripts_title)
         input_x,list_inner,_,_ = dealWithScriptOut(data)
         list_label = []
         for item in list_inner:

+ 5 - 0
module/extractFlow.py

@@ -24,7 +24,10 @@ def ruleExtract(listpage_url):
             result["status_code"] = "404"
             add_err_msg(result, "#网页打不开#")
             return result
+        print('准备取列表页 ')
         data_listpage = ext_listpage.getRule_listpage(listpage_url)
+        print('完成列表页处理')
+        # print('data_listpage:', data_listpage)
         if data_listpage is None:
             log("data_listpage is None")
             rule_listpage = None
@@ -34,7 +37,9 @@ def ruleExtract(listpage_url):
             result["status_code"] = "201"
         else:
             rule_listpage,list_hrefs = data_listpage
+            print('准备处理详情页')
             rule_detail = ext_detail.getRule_detail(list_hrefs)
+            print('详情页处理完毕')
             result = mergeDict([rule_listpage,rule_detail])
             result["status_code"] = "201"
     except Exception as e:

+ 54 - 30
module/htmlDrawing.py

@@ -14,25 +14,28 @@ import time
 
 header={
     "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
-    "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
-                cn%2Fuser%2FsimpleSSOLogin",    
+    # "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
+    #             cn%2Fuser%2FsimpleSSOLogin",
     "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
     "Content-Type": "application/x-www-form-urlencoded",
     "Connection": "Keep-Alive",
     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
      AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
     #"Accept-Encoding": "gzip, deflate",
-    "Origin": "http://uia.hnist.cn",
+    # "Origin": "http://uia.hnist.cn",
     "Upgrade-Insecure-Requests": "1",
     }  
 
 TYPE = "phantomjs"
+# TYPE = "chrome"
 current_path = os.path.abspath("/".join(__file__.split("\\")[:-1]))
 driver_paths = {"phantomjs_linux":current_path+"/../driver/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
                 "phantomjs_window":current_path+"/../driver/phantomjs/phantomjs-2.1.1-windows/bin/phantomjs.exe",
                 "chrome_linux":current_path+"/../driver/chromedriver/chromedriver_linux64/chromedriver",
                 "chrome_window":current_path+"/../driver/chromedriver/chromedriver_win32/chromedriver.exe"}
 
+print(driver_paths)
+
 
 def getBrowser_phantomJS(platform="linux",straight=False):
     
@@ -41,12 +44,13 @@ def getBrowser_phantomJS(platform="linux",straight=False):
     else:
         executable_path = driver_paths["phantomjs_window"]
     desired_capabilities= DesiredCapabilities.PHANTOMJS.copy()
+    print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
     for key, value in header.items():
         desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
-    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
+    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
     desired_capabilities["phantomjs.page.settings.loadImages"] = False
     desired_capabilities["phantomjs.page.settings.disk-cache"] = False
-    browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])
+    browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=TLSv1'])
     browser_phantomjs.implicitly_wait(10)
     browser_phantomjs.set_script_timeout(20)
     browser_phantomjs.set_page_load_timeout(10)
@@ -61,14 +65,15 @@ def getBrowser_chrome(platform="linux",straight=False):
     chrome_options = webdriver.ChromeOptions()
     prefs = {"profile.managed_default_content_settings.images":2}
     chrome_options.add_experimental_option("prefs",prefs)
-    chrome_options.add_argument('--headless') 
+    chrome_options.add_argument('--headless')
     chrome_options.add_argument('--no-sandbox')
-    chrome_options.add_argument('--user-agent=iphoneMozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
+    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36')
     desired_capabilities= DesiredCapabilities.CHROME.copy()
     desired_capabilities['loggingPrefs'] = { 'performance':'ALL' }
-    browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])
-    browser_chrome.implicitly_wait(10)
-    browser_chrome.set_page_load_timeout(10)
+    browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])  # '--ssl-protocol=any'  TLSv1
+    browser_chrome.implicitly_wait(15)
+    browser_chrome.set_page_load_timeout(15)
+    # browser_chrome = webdriver.Chrome(executable_path=executable_path)
     
     return browser_chrome
 
@@ -88,8 +93,9 @@ def getBrowser(type=TYPE,straight=False):
 
 def getStatus(url):
     try:
-        r = requests.get(url, headers=header, allow_redirects = False,timeout=10)
+        r = requests.get(url, headers=header, allow_redirects = False,timeout=15)
     except Exception as e:
+        log('requests.get error :%s'%e)
         return 404    
     return r.status_code
 
@@ -106,9 +112,13 @@ def releaseAllDriver():
             try:
                 lock.acquire()
                 wait_count = 0
+                t0 = time.time()
                 while(True):
                     if _queue.full():
                         break
+                    elif time.time()-t0>60:
+                        log('等待放回浏览器超时,强制释放所有driver')
+                        break
                     else:
                         wait_count += 1
                         log("waitting for drivers all back..."+str(wait_count)+"qsize:"+str(_queue.qsize()))
@@ -119,6 +129,7 @@ def releaseAllDriver():
                 lock_kill.release()
     t = Thread(target=_method)
     t.start()
+    t.join(100)
             
     
 
@@ -155,7 +166,7 @@ def getdriver():
     global _get_count
     _get_count += 1
     if _get_count>1000:
-        log("get driver 达到调用次数,重新进行初始化")
+        log("get_driver 达到调用次数,重新进行初始化")
         releaseAllDriver()
         _get_count = 0
     lock.acquire()
@@ -198,23 +209,29 @@ def hasDrew(url,list_rule):
     @summary: 根据规则判断是否渲染
     @param: url:网页链接,list_rule: xpath规则数组 
     '''
-    try:
-        r = requests.get(url, headers=header, allow_redirects = False)
-        _encoding = r.encoding
-        if _encoding is None:
-            _encoding = "utf8"
-        dom = html.fromstring(r.content.decode(_encoding))
-        for item in list_rule:
-            if item["type"]=="xpath":
-                if item["rule"] is not None:
-                    list_nodes = dom.xpath(item["rule"])
-                    if len(list_nodes)==0:
-                        return True
-    except Exception as e:
-        error(str(e))
-    return False
+    def hasdrew(url,list_rule):
+        try:
+            r = requests.get(url, headers=header, allow_redirects = False, timeout=10)
+            _encoding = r.encoding
+            if _encoding is None:
+                _encoding = "utf8"
+            dom = html.fromstring(r.content.decode(_encoding))
+            for item in list_rule:
+                if item["type"]=="xpath":
+                    if item["rule"] is not None:
+                        list_nodes = dom.xpath(item["rule"])
+                        if len(list_nodes)==0:
+                            return True
+        except Exception as e:
+            error(str(e))
+        return False
+    rs = thread_run(hasdrew, url,list_rule)
+    if rs != None:
+        return rs
+    else:
+        return False
 
-def loadPage(browser,url,timeout=20):
+def loadPage(browser,url,timeout=30):
     '''
     @summary: 解决selenium加载网页不返回的问题,设置线程进行加载,对线程设置超时时间
     '''
@@ -225,7 +242,9 @@ def loadPage(browser,url,timeout=20):
             debug("load "+url+" done")
         except Exception as e:
             error(str(e))
+            log('加载页面抛出异常:'+str(e))
             if re.search("由于目标计算机积极拒绝",str(e)) is not None:
+                log('log page exception')
                 releaseAllDriver()
         
     t = Thread(target=_thread_load,args=(browser,url))
@@ -239,9 +258,14 @@ def loadPage(browser,url,timeout=20):
         '''
         #执行释放资源的线程
         error("driver get方法卡住,强制释放所有资源")
-        releaseAllDriver()
         stop_thread(t)
-        raise NameError("超时加载"+str(url))
+        log('stop_loadpage thread return false')
+        adddriver(browser)
+        debug("release driver")
+        releaseAllDriver()
+        return False
+        # raise NameError("超时加载"+str(url))
+    return True
     
     
 def getSource(url):

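The change to releaseAllDriver above caps the wait for drivers to be returned at 60 seconds instead of spinning forever. Its generic shape as a self-contained sketch (the queue, deadline and poll interval are hypothetical):

import queue
import time

def wait_until_full(q, deadline_s=60, poll_s=1):
    t0 = time.time()
    while not q.full():
        if time.time() - t0 > deadline_s:
            return False              # give up and let the caller force-release everything
        time.sleep(poll_s)
    return True

q = queue.Queue(maxsize=2)
q.put(1); q.put(2)
print(wait_until_full(q, deadline_s=5))   # -> True, the pool is already full
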
+ 141 - 130
module/listpage/content/featureEngine.py

@@ -86,13 +86,13 @@ function statistic(node,deepth){
             if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                 node.counts_communicateTags += 1;
             }
-        }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        }        
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);    
     }
     node.counts_tagType = set_tag.size();
     var sum_width = 0;
@@ -379,23 +379,29 @@ function clustering_xpath(array_xpath){
 
 
 function search(content_xpath){
-    content_node = getNode_listContent(content_xpath)
-    if(content_node!=null){
-        var array_a_href = statistic_A(content_node);
-        var array_a = array_a_href[0];
-        var array_href = new Array();
-        var array_date = new Array();
-        statistic_time(content_node,array_date);
-        var _clustered_a = clustering_xpath(array_a);
-        var _clustered_date = clustering_xpath(array_date);
-        for(var i=0;i<array_a.length;i++){
-            if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
-                array_href.push(array_a_href[1][i]);
+    try{
+        content_node = getNode_listContent(content_xpath) //获取列表页标签节点
+        if(content_node!=null){
+            var array_a_href = statistic_A(content_node);
+            var array_a = array_a_href[0];
+            var array_href = new Array();
+            var array_date = new Array();
+            statistic_time(content_node,array_date);
+            var _clustered_a = clustering_xpath(array_a);
+            var _clustered_date = clustering_xpath(array_date);
+            for(var i=0;i<array_a.length;i++){
+                if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
+                    array_href.push(array_a_href[1][i]);
+                }
             }
+            return [_clustered_a,_clustered_date,array_href]
         }
-        return [_clustered_a,_clustered_date,array_href]
+        return null;
     }
-    return null;
+    catch(e){
+        return null
+    }
+
 }
 return search(arguments[0]);
 '''
@@ -433,7 +439,8 @@ def encodeInput_byJS(url,str_href):
         browser = hd.getdriver()
         debug("get driver")
         hd.loadPage(browser, url)
-        data = browser.execute_script(scripts_common+script_content,str_href)
+        # data = browser.execute_script(scripts_common+script_content,str_href)
+        data = get_js_rs(browser, scripts_common+script_content,str_href)
         deal_data = dealWithScriptOut(data)
         
         if deal_data is None:
@@ -453,8 +460,10 @@ def encodeInput_byJS(url,str_href):
 
 def getInput_byJS(browser,url,str_href):
     try:
-        hd.loadPage(browser,url)
-        data = browser.execute_script(scripts_common+script_content,str_href)
+        # hd.loadPage(browser,url)
+        # data = browser.execute_script(scripts_common+script_content,str_href)
+        data = get_js_rs(browser, scripts_common+script_content,str_href)
+
         deal_data = dealWithScriptOut(data)
         if deal_data is None:
             return None
@@ -465,8 +474,7 @@ def getInput_byJS(browser,url,str_href):
         error(str(e))
     return None
         
-def getRule_A_Date(url,content_xpath):
-    
+def getRule_A_Date(browser, url,content_xpath):
     def appendXpath(list_xpath,_xpath):
         if len(list_xpath)==0:
             list_xpath.append(_xpath)
@@ -477,119 +485,122 @@ def getRule_A_Date(url,content_xpath):
                         "listpage_Date":None,
                         "flag":True,
                         "hasDrew":False}
-    try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser,url)
-        
-        list_a = None
-        for _content_xpath in [content_xpath,"/html"]:
+    # try:
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser,url)
         
-            
-            data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
-            if data is None:
-                log("A_Date not found with xpath:"+_content_xpath)
-                continue
-            if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
-                list_a = data[0]
-                list_date = data[1]
-                list_hrefs = data[2]
-            if list_a is not None and len(list_a[1])==len(list_date[1]):
-                break
-            else:
-                log("different length of A and Date:with xpath:"+_content_xpath)
-            
-        if list_a is None:
-            log("A_Date not found with all xpath")
-            return None;
-        log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
-        log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
-
-        log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
-        if len(list_a[1])!=len(list_date[1]):
-            dict_Rule_A_Date["flag"] = False
-            add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
-            return dict_Rule_A_Date,list_hrefs
+    list_a = None
+    for _content_xpath in [content_xpath,"/html"]:
+        # data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
+        data = get_js_rs(browser, scripts_common+script_get_A_Date,_content_xpath)
+        if data is None:
+            log("A_Date not found with xpath:"+_content_xpath)
+            continue
+        if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
+            list_a = data[0]
+            list_date = data[1]
+            list_hrefs = data[2]
+        if list_a is not None and len(list_a[1])==len(list_date[1]):
+            log('list_a is not None and len(list_a[1])==len(list_date[1])')
+            break
         else:
-            list_diffindex = list_a[0]
-            _xpath = list_a[1][0]
-            listpage_a = []
-            begin = 0
-            list_diffindex.sort(key=lambda x:x)
-            _jump_flag = False
-            
-            dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
-            _xpath_split = re.split("(\d+)",_xpath)
-            for i in range(len(list_diffindex)):
-                _index = list_diffindex[i]
-                if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
-                    add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
-                    dict_Rule_A_Date["flag"] = False
-                    return dict_Rule_A_Date,list_hrefs
+            log("different length of A and Date:with xpath:"+_content_xpath)
+
+    if list_a is None:
+        log("A_Date not found with all xpath")
+        return None;
+    log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
+    log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
+
+    log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
+    if len(list_a[1])!=len(list_date[1]):
+        dict_Rule_A_Date["flag"] = False
+        add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
+        return dict_Rule_A_Date,list_hrefs
+    else:
+        list_diffindex = list_a[0]
+        _xpath = list_a[1][0]
+        listpage_a = []
+        begin = 0
+        list_diffindex.sort(key=lambda x:x)
+        _jump_flag = False
+
+        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
+        _xpath_split = re.split("(\d+)",_xpath)
+        for i in range(len(list_diffindex)):
+            _index = list_diffindex[i]
+            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
+                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
+                dict_Rule_A_Date["flag"] = False
+                return dict_Rule_A_Date,list_hrefs
+            else:
+                if i==0:
+                    appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
+                    begin = _index+1
+                elif i<len(list_diffindex):
+                    appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
+                    begin = _index+1
                 else:
-                    if i==0:
-                        appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
-                        begin = _index+1
-                    elif i<len(list_diffindex):
-                        appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
-                        begin = _index+1
-                    else:
-                        appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
-                        
-                    
-                    if i==len(list_diffindex)-1:
-                        _group = re.search("/(.*)","".join(_xpath_split[begin:]))
-                        if _group is not None:
-                            appendXpath(listpage_a,_group.group(1))
-                
-            for i in range(len(listpage_a)):
-                if len(listpage_a[i].split("/"))>6:
-                    listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
-            dict_Rule_A_Date["listpage_A"] = listpage_a
-            list_diffindex = list_date[0]
-            _xpath = list_date[1][0]
-            listpage_date = []
-            begin = 0
-            list_diffindex.sort(key=lambda x:x)
-            _jump_flag = False
-            
-            dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
-            _xpath_split = re.split("(\d+)",_xpath)
-            for i in range(len(list_diffindex)):
-                _index = list_diffindex[i]
-                if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
-                    add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
-                    dict_Rule_A_Date["flag"] = False
-                    return dict_Rule_A_Date,list_hrefs
+                    appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
+
+
+                if i==len(list_diffindex)-1:
+                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
+                    if _group is not None:
+                        appendXpath(listpage_a,_group.group(1))
+
+        for i in range(len(listpage_a)):
+            if len(listpage_a[i].split("/"))>6:
+                # listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
+                listpage_a[i] = get_js_rs(browser, scripts_replaceXpath,listpage_a[i])
+        dict_Rule_A_Date["listpage_A"] = listpage_a
+        list_diffindex = list_date[0]
+        _xpath = list_date[1][0]
+        listpage_date = []
+        begin = 0
+        list_diffindex.sort(key=lambda x:x)
+        _jump_flag = False
+
+        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
+        _xpath_split = re.split("(\d+)",_xpath)
+        for i in range(len(list_diffindex)):
+            _index = list_diffindex[i]
+            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
+                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
+                dict_Rule_A_Date["flag"] = False
+                return dict_Rule_A_Date,list_hrefs
+            else:
+                if i==0:
+                    appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
+                    begin = _index+1
+                elif i<len(list_diffindex):
+                    appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
+                    begin = _index+1
                 else:
-                    if i==0:
-                        appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
-                        begin = _index+1
-                    elif i<len(list_diffindex):
-                        appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
-                        begin = _index+1
-                    else:
-                        appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
-                    
-                    if i==len(list_diffindex)-1:
-                        _group = re.search("/(.*)","".join(_xpath_split[begin:]))
-                        if _group is not None:
-                            appendXpath(listpage_date,_group.group(1))
-            
-            
-            for i in range(len(listpage_date)):
-                if len(listpage_date[i].split("/"))>6:
-                    listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])        
-            dict_Rule_A_Date["listpage_Date"] = listpage_date
-            
-        return dict_Rule_A_Date,list_hrefs
+                    appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
+
+                if i==len(list_diffindex)-1:
+                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
+                    if _group is not None:
+                        appendXpath(listpage_date,_group.group(1))
+
+        for i in range(len(listpage_date)):
+            if len(listpage_date[i].split("/"))>6:
+                # listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])
+                listpage_date[i] = get_js_rs(browser, scripts_replaceXpath,listpage_date[i])
+
+        dict_Rule_A_Date["listpage_Date"] = listpage_date
+
+    return dict_Rule_A_Date,list_hrefs
                 
                 
-    except Exception as e:
-        error(str(e))
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # except Exception as e:
+    #     error(str(e))
+    # finally:
+    #     # hd.adddriver(browser)
+    #     # debug("release driver")
+    #     log('getRule_A_Date done')
     return None
         
 def dumpLinkContent():

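For reference, the xpath-splitting step that getRule_A_Date keeps using after the restructuring above, shown on an invented xpath:

import re

_xpath = '/html/body/div[2]/ul/li[3]/a'
_xpath_split = re.split(r"(\d+)", _xpath)
print(_xpath_split)   # ['/html/body/div[', '2', ']/ul/li[', '3', ']/a']
# list_diffindex holds the positions of the numeric parts that differ between rows
# (here position 3, the "3" of li[3]); each neighbour must end with "[" and start with "]",
# otherwise the rule is rejected with "#列表页链接xpath无法分割#"
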
+ 26 - 7
module/listpage/extractor.py

@@ -83,28 +83,47 @@ def getRule_listpage(listpage_url,try_times=3):
     for i in range(try_times):
         browser = hd.getdriver()
         debug("get driver")
+        loadsuccess = hd.loadPage(browser, listpage_url)
+        if not loadsuccess:
+            log('加载列表主页失败, 重新请求网页。')
+            continue
+        log('准备执行获取列表页内容标签脚本')
+        # with open('d:/html/home_page.html', 'w', encoding='utf-8') as f:
+        #     f.write(browser.page_source)
         data_listpage = featureEngine.getInput_byJS(browser,listpage_url,"")
+        log('获取列表页内容标签成功')
         #print(browser.page_source)
-        hd.adddriver(browser)
-        debug("release driver")
+        # hd.adddriver(browser)
+        # debug("release driver")
         if data_listpage is not None:
             x,_,list_xpath = data_listpage
             _index = listpageContentPredictor.predict(x)
+            log('模型预测列表页标签完毕')
             if len(list_xpath[_index])>0:
                 content_xpath = list_xpath[_index][0]
                 #content_xpath = "/html"
                 log("the content_xpath of listpage is "+str(content_xpath))
-                data_rule = featureEngine.getRule_A_Date(listpage_url,content_xpath)
+                data_rule = featureEngine.getRule_A_Date(browser,listpage_url,content_xpath)
+                log('执行脚本获取列表页链接及日期完毕')
                 if data_rule is not None:
                     dict_rule_A_Date,list_hrefs = data_rule
-                    browser = hd.getdriver()
-                    debug("get driver")
+                    # if dict_rule_A_Date.get('flag', '') == False:
+                    #     return None
+                    # browser = hd.getdriver()
+                    # debug("get driver")
+                    log('begin getTurnRule')
                     turn_data = engine.getTurnRule(browser,listpage_url)
-                    hd.adddriver(browser)
-                    debug("release driver")
+                    log('获取翻页内容完毕')
+                    # hd.adddriver(browser)
+                    # debug("release driver")
                     dict_rule_pageTurn,list_listpage_url = turn_data
                     dict_rule_recog = getRecognize_detail_listpage(list_listpage_url, list_hrefs)
+                    log('解析列表页规则完毕')
+                    hd.adddriver(browser)
+                    debug("release driver")
                     return mergeDict([dict_rule_A_Date,dict_rule_pageTurn,dict_rule_recog]),list_hrefs
+        hd.adddriver(browser)
+        debug("release driver")
     return None
     
     

+ 166 - 93
module/listpage/pageTurn/engine.py

@@ -14,15 +14,16 @@ script = '''
 
 function click_bt(type_click){
     var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
-    var pattern_nextPage = /^\s*.?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
+    var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
     
-    var pattern_tailPage = /^\s*.?(最?[尾末]一?页|tail|>\|).?s\s*$/
+    var pattern_tailPage = /^\s*(最[尾末]一?页|tail|>\|).?s\s*$/
     list_cluster = clustering_turnPage();
     var pageNum = null;
     var pageNum_jsoup = null;
     var _node_xpath = null;
     var _node_jsoup = null;
     var _node_click = null;
+    var click_message = '';
     for(var i=0;i<list_cluster.length;i++){
         _node = list_cluster[i][0]
         _type = list_cluster[i][1]
@@ -60,17 +61,42 @@ function click_bt(type_click){
                     }
                     
                 }
+                if(_href==null || _href=="" || _href=="#"){
+                    click_message = '翻页链接为空或#异常';
+                }
+                if(_href!=null && _href.indexOf('javascript')>=0){
+                    click_message = '翻页链接为javascript';
+                }
                 if(_node_click==null){
                     _node_click = _node;
+                }               
+               
+            }
+            else if(_node.getAttribute("type")=='button'){
+                _node_click = _node;
+                click_message = '标签属性type为button的翻页';
+            }            
+            else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){
+                _href = _node.parentNode.getAttribute("href")
+                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
+                    if(_node_xpath==null){
+                        _node_xpath = getXpath(_node.parentNode);
+                    }
+                    if(_node_jsoup==null){
+                        _node_jsoup = getJsoup(_node.parentNode);
+                    }
+                    
+                }
+                if(_node_click==null){
+                    _node_click = _node.parentNode;
                 }
-                
-                
+                click_message = '父节点为翻页链接';				
             }
         }
     }
     if(_node_click!=null){
         _node_click.click();
-        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
     }else{
         var _pattern = null;
         if(type_click=="nextPage"){
@@ -88,11 +114,13 @@ function click_bt(type_click){
                     _node_jsoup = getJsoup(_node);
                 }
                 _node.click();
-                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+                click_message = '找不到翻页按钮,a标签为翻页链接';
+                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
             }
         }
     }
-    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+    if(click_message==''){click_message = '最终没找到翻页按钮';}
+    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
 }
 return click_bt(arguments[0]);
 '''
@@ -119,7 +147,8 @@ return turnpage_by_pattern(arguments[0]);
 def click_bt_lastPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"lastPage")
+    # _result = browser.execute_script(scripts_common+script,"lastPage")
+    _result = get_js_rs(browser, scripts_common+script,"lastPage")
     if _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
@@ -133,8 +162,9 @@ def click_bt_lastPage(browser):
 def click_bt_nextPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"nextPage")
-    if _result[0]:
+    # _result = browser.execute_script(scripts_common+script,"nextPage")
+    _result = get_js_rs(browser, scripts_common+script,"nextPage", timeout=30)
+    if _result!=None and _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
         for i in range(4):
@@ -147,8 +177,9 @@ def click_bt_nextPage(browser):
 def click_bt_tailPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"tailPage")
-    if _result[0]:
+    # _result = browser.execute_script(scripts_common+script,"tailPage")
+    _result = get_js_rs(browser, scripts_common+script,"tailPage")
+    if _result!=None and  _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
         for i in range(4):
@@ -161,7 +192,8 @@ def click_bt_tailPage(browser):
 def click_bt_pattern(browser,pattern):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script_pattern,pattern)
+    # _result = browser.execute_script(scripts_common+script_pattern,pattern)
+    _result = get_js_rs(browser, scripts_common+script_pattern,pattern)
     if _result:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
@@ -191,6 +223,13 @@ def getRuleOfUrl(first_url,second_url):
     log("pageTurn first_url:\t"+first_url)
     log("pageTurn second_url:\t"+second_url)
     if len(split_all_first)!=len(split_all_second):
+        split_url = second_url.split('/')
+        if split_url[-1]== 'index_2.html':
+            dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
+            dict_rule["listpage_turn_after"] = '.html'
+            dict_rule["listpage_pageBegin"] = 2
+            dict_rule["listpage_pageStep"] = 1
+            return dict_rule
         add_err_msg(dict_rule, "#翻页链接不匹配#")
         dict_rule["flag"] = False
         return dict_rule
@@ -226,86 +265,119 @@ def getRuleOfUrl(first_url,second_url):
     return dict_rule
 
 def getTurnRule(browser,listpage_url):
-    try:
-        hd.loadPage(browser,listpage_url)
-        first_url = browser.current_url
-        list_listpage_url = []
-        click_flag = True
-        #点击下一页
-        click_next_1 = click_bt_nextPage(browser)
-        
-        url1 = browser.current_url
-        log("click next bt:"+str(click_next_1))
-        #点击下一页
-        click_next_2 = click_bt_nextPage(browser)
-        log("click next bt:"+str(click_next_2))
-        list_pageNum1 = click_next_1[1]
-        list_node1 = click_next_1[2]
-        list_pageNum2 = click_next_2[1]
-        list_node2 = click_next_2[2]
-        dict_rule = None
-        url2 = browser.current_url
-        
-        #是否有点击到下一页
-        #click_flag = click_next_1[0] or click_next_2[0]
-        click_flag = click_next_2[0]
-        
-        
-        
-        #点击数字翻页
-        if not click_flag:
-            #第一个下一页点击到而第二个未点击到
-            if click_next_1[0]:
-                click_last_1 = click_bt_lastPage(browser)
-                url2 = browser.current_url
-            if not click_next_1[0] or not click_last_1[0]:
-                click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
-                if click_pattern_2:
-                    url2 = browser.current_url
-                click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
-                if click_pattern_1:
-                    url1 = browser.current_url
-                    if url1==first_url:
-                        click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
-                        if click_pattern_3:
-                            url1 = url2
-                            url2 = browser.current_url
-        
-        dict_rule = getRuleOfUrl(url1, url2)
-        list_listpage_url.append(url1)
-        list_listpage_url.append(url2)
-    
-        if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
-            dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
-        elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
-            dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
-        else:
-            dict_rule["listpage_pageNum"] = None
-        dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
-        '''
-        #若是未识别到pageNum则flag为False
-        if dict_rule["listpage_pageNum"] is None:
-            dict_rule["flag"] = False
-        '''
-        #优先jsoup,后xpath
-        if list_node1[1]==list_node2[1] and list_node1[1] is not None:
-            dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
-        #只有2页的适配
-        elif list_node1[1] is not None and list_node2[1] is None:
-            dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
-        elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
-            dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
-        else:
-            dict_rule["listpage_nextPage"] = None
-        
-        #翻页按钮或者是拼接规则有一个即可
-        if dict_rule["listpage_nextPage"] is not None:
-            dict_rule["flag"] = True
+    '''
+    Derive the page-turn rule (page count, next-page path, etc.) by clicking "next page" or a
+    numbered page link, and return it together with list_listpage_url (the list-page urls before/after turning).
+    :param browser: browser (webdriver) object
+    :param listpage_url: list-page url
+    :return:
+    '''
+    # try:
+    # hd.loadPage(browser,listpage_url)
+    first_url = browser.current_url
+    list_listpage_url = []
+    click_flag = True
+    #点击下一页
+    # click_next_1 = click_bt_nextPage(browser)
+    click_next_1 = thread_run(click_bt_nextPage, browser)
+    url1 = ''
+    url2 = browser.current_url
+    log("click next bt:"+str(click_next_1))
+    #点击下一页
+    # click_next_2 = click_bt_nextPage(browser)
+    click_next_2 = thread_run(click_bt_nextPage, browser)
+    if click_next_1==None:
+        click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
+                        [None, None]]
+    if click_next_2==None:
+        click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
+                        [None, None]]
+    log("click next bt:"+str(click_next_2))
+    list_pageNum1 = click_next_1[1]
+    list_node1 = click_next_1[2]
+    list_pageNum2 = click_next_2[1]
+    list_node2 = click_next_2[2]
+    dict_rule = None
+    url3 = browser.current_url
+
+    #是否有点击到下一页
+    #click_flag = click_next_1[0] or click_next_2[0]
+    click_flag = click_next_2[0]
+
+
+
+    #点击数字翻页
+    # if not click_flag:
+    #     #第一个下一页点击到而第二个未点击到
+    #     log('开始数字翻页')
+        # if click_next_1[0]:
+        #     click_last_1 = click_bt_lastPage(browser)
+        #     url2 = browser.current_url
+        #     log('第一次翻页成功,最后一页作为第二页')
+    if not click_next_1[0]: # or not click_last_1[0]
+        log('开始数字翻页')
+        # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
+        click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
+        if click_pattern_2:
+            url2 = browser.current_url
+            log('数字翻页第二页%s'%url2)
+        # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
+        click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
+        if click_pattern_3:
+            url3 = browser.current_url
+            log('数字翻页第三页%s'%url3)
         else:
-            add_err_msg(dict_rule, "#下一页规则未获取#")
-        return dict_rule,list_listpage_url
-    except Exception as e:
-        error(str(e))
+            # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
+            click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
+            if click_pattern_1:
+                url1 = browser.current_url
+                log('数字翻页第一页%s'%url1)
+    if url2 != url3:
+        dict_rule = getRuleOfUrl(url2, url3)
+    elif url1!='' and url2 != url1:
+        dict_rule = getRuleOfUrl(url1, url2)
+    else:
+        dict_rule = getRuleOfUrl(first_url, url2)
+    if click_next_1 is not None and len(click_next_1)==4:
+        click_message = click_next_1[3]
+        if click_message!="":
+            add_err_msg(dict_rule, '#%s#'%click_message)
+    if not click_flag:
+        add_err_msg(dict_rule, "#进行数字翻页#")
+    list_listpage_url.append(url1)
+    list_listpage_url.append(url2)
+
+    if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
+        dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
+    elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
+        dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
+    else:
+        dict_rule["listpage_pageNum"] = None
+    dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
+    '''
+    #若是未识别到pageNum则flag为False
+    if dict_rule["listpage_pageNum"] is None:
+        dict_rule["flag"] = False
+    '''
+    #优先jsoup,后xpath
+    if list_node1[1]==list_node2[1] and list_node1[1] is not None:
+        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
+    #只有2页的适配
+    elif list_node1[1] is not None and list_node2[1] is None:
+        log('只有两页更新适配 ')
+        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
+    elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
+        dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
+    else:
+        dict_rule["listpage_nextPage"] = None
+
+    #翻页按钮或者是拼接规则有一个即可
+    if dict_rule["listpage_nextPage"] is not None:
+        dict_rule["flag"] = True
+    else:
+        add_err_msg(dict_rule, "#下一页规则未获取#")
+    return dict_rule,list_listpage_url
+    # except Exception as e:
+    #     error(str(e))
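
A minimal consumer sketch (not from this commit) of the rule returned above: the keys "listpage_nextPage" and "flag" and the "jsoup"/"xpath" rule types come from the function itself, while the Selenium calls and the treatment of "jsoup" rules as CSS-style selectors are editorial assumptions.

from selenium.webdriver.common.by import By

def apply_next_page_rule(browser, dict_rule):
    """Click the next-page element described by dict_rule; return True on success."""
    rule = dict_rule.get("listpage_nextPage") if dict_rule else None
    if not rule:
        return False
    path, rule_type = rule
    if rule_type == "xpath":
        candidates = browser.find_elements(By.XPATH, path)
    else:
        # "jsoup" rules are assumed to be usable as CSS selectors here
        candidates = browser.find_elements(By.CSS_SELECTOR, path)
    if not candidates:
        return False
    candidates[0].click()
    return True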
 
 if __name__=="__main__":
     browser = hd.getBrowser()
@@ -323,7 +395,8 @@ if __name__=="__main__":
     return _array
     '''
     
-    data = browser.execute_script(scripts_common+script1)
+    # data = browser.execute_script(scripts_common+script1)
+    data = get_js_rs(browser, scripts_common+script1)
     #browser.maximize_window()
     browser.save_screenshot("112.png")
     for item in data:

+ 34 - 18
module/run_single_server.py

@@ -1,21 +1,25 @@
+# -*- coding: utf-8 -*-
+import sys
+import json
+import re
+import os
+sys.path.append(os.path.abspath("../.."))
 
-from module.Utils import log# -*- coding: utf-8 -*-
+os.environ['KERAS_BACKEND']='tensorflow'
+from module.Utils import log
 """
 Created on Fri Jun  1 18:03:03 2018
 
 @author: DONG
 """
-import sys
-import os
-import json
-import re
-sys.path.append(os.path.abspath("../.."))
+
 from module import extractFlow
 from flask import Flask, jsonify
 from flask import abort
 from flask import request
 import time
 import uuid
+from module.Utils import xpath2css
 
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
@@ -30,9 +34,12 @@ def transformInterface(_dict):
     if listpage_a  and listpage_date:
         if listpage_a[0]==listpage_date[0]:
             ruleValue = listpage_a[0]
-            trans_dict["listPageNode"] = {"ruleType":"xpath",
-                                          "ruleValue":ruleValue,
-                                          "ruleKey":""}
+            # trans_dict["listPageNode"] = {"ruleType":"xpath",
+            #                               "ruleValue":ruleValue,
+            #                               "ruleKey":""}
+            trans_dict["listPageNode"] = {"ruleType": "css",
+                                          "ruleValue": xpath2css(ruleValue),
+                                          "ruleKey": ""}
         else:
             flag = False
     else:
@@ -67,8 +74,11 @@ def transformInterface(_dict):
     detail_date = _dict.get("detail_date")
     trans_dict["needDetailTime"] = False
     if detail_date:
-        trans_dict["detailDateNode"] = {"ruleType": "xpath",
-                                        "ruleValue": detail_date
+        # trans_dict["detailDateNode"] = {"ruleType": "xpath",
+        #                                 "ruleValue": detail_date
+        #                                 }
+        trans_dict["detailDateNode"] = {"ruleType": "css",
+                                        "ruleValue": xpath2css(detail_date)
                                         }
         trans_dict["needDetailTime"] = True
     else:
@@ -76,16 +86,22 @@ def transformInterface(_dict):
     detail_title = _dict.get("detail_title")
     trans_dict["needDetailTitle"] = False
     if detail_title:
-        trans_dict["detailTitleNode"] = {"ruleType": "xpath",
-                                         "ruleValue": detail_title
+        # trans_dict["detailTitleNode"] = {"ruleType": "xpath",
+        #                                  "ruleValue": detail_title
+        #                                  }
+        trans_dict["detailTitleNode"] = {"ruleType": "css",
+                                         "ruleValue": xpath2css(detail_title)
                                          }
         trans_dict["needDetailTitle"] = True
     else:
         flag = False
     detail_content = _dict.get("detail_content")
     if detail_content:
-        trans_dict["detailContentNode"] = {"ruleType": "xpath",
-                                           "ruleValue": detail_content
+        # trans_dict["detailContentNode"] = {"ruleType": "xpath",
+        #                                    "ruleValue": detail_content
+        #                                    }
+        trans_dict["detailContentNode"] = {"ruleType": "css",
+                                           "ruleValue": xpath2css(detail_content)
                                            }
     else:
         flag = False
@@ -117,7 +133,7 @@ def text_predict():
                 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",listpage_url) is None:
                     data["status_code"] = 400
                     abort(400)
-                else:   
+                else:
                     data = extractFlow.ruleExtract(listpage_url)
                 log("done for setting result of listpage:"+str(listpage_url))
                 data["listpage_url"] = listpage_url
@@ -130,7 +146,7 @@ def text_predict():
         log(" time from receive to send: "+str(time.time()-start_time))
 
         data = transformInterface(data)
-        log(str(data))
+        # log(str(data))
 
         _resp = jsonify(data)
         #log(str(data["flag"])+str(data))
@@ -138,5 +154,5 @@ def text_predict():
 
 
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=15015, threaded=True, debug=False)
+    app.run(host='192.168.2.65', port=15015, threaded=True, debug=False) #15015  2.65
     log("ContentExtractor running")

+ 157 - 19
module/testInterface.py

@@ -345,23 +345,161 @@ list_url = ["http://www.csssyxx.com/xwgk/tzgg",
 _sum = 0
 _count = 0
 ''' '''
-with codecs.open("errorLink.txt","r",encoding="utf8") as f:
-    while(True):
-        line = f.readline().strip()
-        if not line:
-            break
-       
-        a = time.time()
-        # user = {"listpage_url":list_url[0]}
-        user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
-        #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
-        _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
-        resp_json = _resp.content.decode("utf-8")
-        _resp = json.loads(resp_json)
-        print(resp_json)
-        _sum += 1
-        if "flag" in _resp and _resp["flag"]:
-            _count += 1
-            print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
-        print(_count,_sum)
+# with codecs.open("errorLink.txt","r",encoding="utf8") as f:
+#     while(True):
+#         line = f.readline().strip()
+#         if not line:
+#             break
+#
+#         a = time.time()
+#         # user = {"listpage_url":list_url[0]}
+#         user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
+#         #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
+#         _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
+#         resp_json = _resp.content.decode("utf-8")
+#         _resp = json.loads(resp_json)
+#         print(resp_json)
+#         _sum += 1
+#         if "flag" in _resp and _resp["flag"]:
+#             _count += 1
+#             print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
+#         print(_count,_sum)
 
+
+def get_rs(url):
+    user = {"listpage_url": url}
+    _resp = requests.post("http://192.168.2.177:15015/content_extract", json=user, verify=True) #127.0.0.1  177
+    resp_json = _resp.content.decode("utf-8")
+    return resp_json
+
+    # _resp = json.loads(resp_json)
+    # print(resp_json)
+    # print(_resp)
+
+# url = 'http://www.clrmyy.com/Newslist/NewsList.aspx?code=ZPXX'
+# url = 'http://ec.chongchi.com.cn:8080/Ec468Web/ysxjcggg.jsp' # 列表页太长 js 溢出  #已设置超时
+# url = 'https://tyj.huangshan.gov.cn/content/column/6794951?pageIndex=1'
+# url = 'http://www.yangdong.gov.cn/xwzx/gggs/index.html'  # 获取详情页报错
+# url = 'https://www.guit.edu.cn/xwzx/tzgg.htm ' # 日志报错
+
+# rs = get_rs(url)
+# print(rs)
+
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # 翻页提取失败
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
+url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
+# url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
+# url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # 所有要素提取失败, bug 已修复
+# url = 'http://hsxzwgk.mas.gov.cn/opennessTarget/?branch_id=57a3df762c262ea9a00aadae&column_code=280200' #主页提取失败  #网页打不开# 404
+# url = 'http://www.crra.org.cn/news/tongzhi/o1' # 执行js完毕  getRule_A_Date done 后卡住 已修复
+
+# url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # 翻页超时错误 已修复 提取正常
+
+# # url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常  重新提取 #翻页链接不匹配##下一页规则未获取#
+# # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢,有时正常
+# # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #列表页规则未获取#  网页打开报错 504
+#
+# # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复  列表页xpath预测错误
+# # url = 'http://sz.nxeduyun.com/index.php?r=space/school/portal/content/index&sid=6ce9765e85694be7838c7f7272199346&cid=50160' #列表页获取失败 已修复
+# # url = 'https://www.nbzjy.cn/list-gsgg.html' # #列表页规则未获取# 已解决
+# # url = 'http://www.gdhealth.net.cn/index.php?m=content&c=index&a=lists&catid=38' # # #列表页规则未获取# chome浏览器打开异常 换另一个浏览器正常
+# # url = 'http://www.kbs.gov.cn/ywdt/tzgg/index.html' #列表页规则未获取# iframe报错 已处理
+# # url = 'http://www.xs9z.com/News.asp?PageNo=1&classid=17' #包含iframe 报错  已处理
+# # url = 'http://www.tdxbmj.cn/html/qyxw1/index.html' #列表页规则未获取# 已优化处理,详情页时间没日期报错,标签id重复导致只提取到一个链接
+# # url = 'http://www.sxsltlyy.com/newslist.php?cid=29'  # 列表页获取失败,详情页xpath错误  浏览器打开界面与selenium 的不一样  ua问题已修复
+# # url = 'http://view.landtz.com:8092/jj/index' # #列表页规则未获取# 拍卖多个图标纵向列表   content_xpath of listpage is //*[@class="wp"]/div[2]/div[1]/a[1]/div[2] 预测错误
+# # url = 'http://www.hbbidcloud.cn/suizhou/jyxx/004003/004003006/about.html' # #翻页链接不匹配##下一页规则未获取#  网页本身无翻页机制
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=3'  #翻页链接不匹配##下一页规则未获取##详情页列表页区分长度未识别#
+# # url = 'https://www.sxeec.com/gpgg/p4.html' ##翻页链接不匹配##下一页规则未获取#  下一页在标签<i>,链接在父节点<a>标签
+# # url = 'http://sthjj.liaoyuan.gov.cn/xxgk/tzgg/' #翻页链接不匹配  第二页开始规律  翻页超时导致拿不到翻页规则 无头模式打开网页超时, 正常模式不超时
+# # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/index_3.html'  #翻页链接不匹配
+# # url = 'http://bj.sxggzyjy.cn/jydt/001001/001001004/001001004001/subPage.html'  #翻页链接不匹配##下一页规则未获取#
+# # url = 'http://www.tlgljs.com/cpzs.html'
+# # url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
+# # url = 'http://www.zqcyl.cn/zlzx/ggl/' #抛出异常导致返回结果失败,
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3'
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=1'
+# # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
+# # url = 'http://www.sxeec.com/gpgg.html'
+# url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
+# url = 'http://bbkx.bb.ah.cn/kxxw/tzgg/index.html'
+# url = 'http://www.lzwhg.com/tongzhigonggao/'
+# url = 'http://www.slwr.gov.cn/zfxxgk/gkml/216/240/257/list_640.htm'  # 列表页脚本异常
+# url = 'http://view.landtz.com:8091/xh/index?resourceStatus=0&useType=&orderBy=0&title='
+# url = 'http://ggzy.yueqing.gov.cn/yqwebnew/jyxx/001009/001009010/'
+# url = 'http://ggzy.xjbt.gov.cn/TPFront/bt5/083003/083003002/083003002006/'
+# url = 'http://www.longmen.gov.cn/xzfbm/xcl/zwgk/bmwj/tzgg/index.html'
+# url = 'http://nyncj.yq.gov.cn/tzgg/'
+url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
+url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
+url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
+url = 'http://www.lzwhg.com/tongzhigonggao/'  #翻页失败
+rs = get_rs(url)
+print(rs)
+
+
+
+import pandas as pd
+import time
+l = []
+def get_url_root(text):
+    url = re.search(r'https?:[a-z0-9-./]+\.(cn|com|org|net|gov|edu|biz|cc|mil|top|pub|info)', text)
+    if url:
+        return url.group(0)
+    else:
+        return ''
+def get_url(text):
+    try:
+        url = json.loads(text).get('ruleLink', '')
+        return url
+    except:
+        print('CRAWLER_LINK json loads 出错:', text)
+        return ''
+
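A quick sanity check for the two helpers above (the sample link is invented):

sample = '{"ruleLink": "http://www.example.gov.cn/list/index.html"}'
print(get_url_root(sample))  # expected: http://www.example.gov.cn
print(get_url(sample))       # expected: http://www.example.gov.cn/list/index.html
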
+# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8.csv')[:]
+# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
+
+# df = pd.read_excel('E:/crawl_data/新建 XLS 工作表.xls')
+df = pd.read_excel('E:/crawl_data/20220526new_url_2test.xlsx')
+# df.drop_duplicates(subset=['首页网址'], inplace=True)
+
+#
+# df['url_root'] = df['CRAWLER_LINK '].apply(lambda x:get_url_root(x))
+# df['url'] = df['CRAWLER_LINK '].apply(lambda x:get_url(x))
+# df = df[df['url']!=""]
+# print(len(df))
+# df.drop_duplicates(subset=['url_root'], inplace=True)
+# print(len(df))
+# df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
+# # df = df[100:200]
+df.reset_index(drop=True, inplace=True)
+print(len(df), df.columns)
+t0 = time.time()
+for i in df.index:
+    # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
+    #     continue
+    t1 = time.time()
+    # url = df.loc[i, 'url']
+    url = df.loc[i, '列表页链接']
+    if not re.match('http', url):
+        l.append('')
+        print(url)
+        continue
+    print(url)
+    rs = get_rs(url)
+    # try:
+    #     url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
+    #     print(url)
+    #     rs = get_rs(url)
+    # except:
+    #     rs = json.dumps({'err_msg': 'json loads link error'})
+    print('耗时:', time.time()-t1)
+    print(rs)
+    l.append(rs)
+df['rs3'] = pd.Series(l)
+print('完成,总耗时:', time.time()-t0)
+# # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
+# df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
+print('写入完成,总耗时:', time.time()-t0)
+# #
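
A possible follow-up (not in the commit) that tallies how many collected responses report flag == True, mirroring the success counting in the commented-out block at the top of this file's diff:

import json
ok, total = 0, 0
for rs in df['rs3'].fillna(''):
    if not rs:
        continue
    total += 1
    try:
        if json.loads(rs).get('flag'):
            ok += 1
    except json.JSONDecodeError:
        pass
print('flag==True: %d / %d' % (ok, total))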