Browse files

Fix the bugs found so far and improve extraction quality. Of 199 source websites in total, 122 (61.3%) are extracted completely; 31 (15.5%) are partially successful, with no pagination or only dynamic pagination.

lsm 3 years ago
parent
commit
2949be0410

BIN
driver/chromedriver/chromedriver_win32/chromedriver.exe


+ 118 - 5
module/Utils.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 '''
 Created on 2018-12-20

@@ -20,10 +21,70 @@ import logging
 import pickle
 import tensorflow as tf
 from keras import losses
+import threading

 __author__ = 'baniu.yao'

+class MyThread(threading.Thread):
+    def __init__(self, func, args=()):
+        super(MyThread, self).__init__()
+        self.func = func
+        self.args = args
+    def run(self):
+        self.result = self.func(*self.args)
+
+    def get_result(self):
+        try:
+            return self.result
+        except Exception as e:
+            print('JS execution raised an exception:', e)
+            return None
+
+def get_js_rs(browser, script, *arg, timeout=20):
+    '''
+    Execute a script in the browser and return the result, aborting on timeout.
+    :param browser: browser (WebDriver) object
+    :param script: the script to execute
+    :param arg: script arguments
+    :param timeout: timeout in seconds
+    :return: the script result, or None on timeout
+    '''
+    def execute_js():
+        data = browser.execute_script(script, *arg)
+        return data
+    t = MyThread(func=execute_js, args=())
+    t.setDaemon(True)
+    t.start()
+    t.join(timeout)
+    if t.isAlive():
+        print('JS execution timed out')
+        stop_thread(t)
+        return None
+    data = t.get_result()
+    return data
+
+import time
+def thread_run(func, *arg, timeout=30):
+    t = MyThread(func=func, args=(*arg,))
+    t.setDaemon(True)
+    t.start()
+    t.join(timeout)
+    if t.isAlive():
+        print('thread_run timed out')
+    result = t.get_result()
+    return result
+
+def xpath2css(xpath):
+    '''
+    Convert an xpath expression to a css selector.
+    :param xpath: xpath string
+    :return: css selector string
+    '''
+    xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
+    for it in re.finditer('\[(\d+)\]', xpath):
+        xpath = xpath.replace(it.group(0), ':nth-child(%s)'%it.group(1))
+    return xpath

 def get_class_from_frame(fr):
     args, _, _, value_dict = inspect.getargvalues(fr)
@@ -520,6 +581,56 @@ def print_metrics(history):
     plt.show()

 scripts_common = '''
+document.getElementsByClassName = function (Name,e,tag) {
+            var ele = [],
+                allEle,
+                length,
+                i = 0;
+ 
+            if (typeof tag === "undefined" ){
+                tag = "*"
+            }
+ 
+            if (typeof e === "undefined"){
+                e = document;
+            }
+ 
+            allEle = e.getElementsByTagName(tag);
+ 
+            for (length = allEle.length;i < length;i = i + 1){
+                if (allEle[i].className === Name) {
+                    ele.push(allEle[i]);
+                }
+            }
+ 
+            return ele;
+        }
+
+document.countElementById = function (id,e,tag) {
+            var ele = [],
+                allEle,
+                length,
+                i = 0;
+ 
+            if (typeof tag === "undefined" ){
+                tag = "*"
+            }
+ 
+            if (typeof e === "undefined"){
+                e = document;
+            }
+ 
+            allEle = e.getElementsByTagName(tag);
+ 
+            for (length = allEle.length;i < length;i = i + 1){
+                if (allEle[i].id === id) {
+                    ele.push(allEle[i]);
+                }
+            }
+ 
+            return ele;
+        }
+
 /* JS implementation of a Set class */
 function Set() {
     this.dataStore = [];
@@ -664,7 +775,7 @@ function getRemoveList(node,recurse,list_remove){
 }

 function getListXpath(el,list_xpath,getRemove){
-    if (el==document.body){
+    if (el==document || el==document.body){
         return list_xpath;
     }
     if(getRemove){
@@ -678,7 +789,7 @@ function getListXpath(el,list_xpath,getRemove){
     return getListXpath(el.parentNode,list_xpath,getRemove);
 }
 function getXpath(el,b,notfirst){
-    if (el.id !=""){
+    if (el.id !="" && document.countElementById(el.id).length==1){
         var _jump_flag = false;
         if(b!=null){
             for(var i=0;i<b.length;i++){
@@ -691,14 +802,16 @@ function getXpath(el,b,notfirst){
             _jump_flag = true;
         }
         if(!_jump_flag){
-            return '//*[@id=\"'+el.id+'\"]';
+            //return '//*[@id=\"'+el.id+'\"]';
+            return '//'+el.tagName.toLowerCase()+'[@id=\"'+el.id+'\"]';
         }

     }

     if (el.getAttribute("class")!=null && document.getElementsByClassName(el.getAttribute("class")).length==1){
         if(!notfirst){
-            return '//*[@class=\"'+el.getAttribute("class")+'\"]';
+            //return '//*[@class=\"'+el.getAttribute("class")+'\"]';
+            return '//'+el.tagName.toLowerCase()+'[@class=\"'+el.getAttribute("class")+'\"]';
         }

     }
@@ -823,7 +936,7 @@ function clustering(list_hitTag){

 function clustering_turnPage(){
     //var pattern_page = /((?<nextPage>下一?页|>>|>)|(?<lastPage>上一?页|<<|<)|(?<firstPage>首页|第一页)|(?<tailPage>尾页)|(?<other>\.{1,2}|共\d[条页]|\d+\/\d+))/ //phantomjs does not support named groups
-    var pattern_page = /^\s*.?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
+    var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
     var pattern_nextPage = /[Nn]ext/
     var list_hitTag = new Array();

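Reviewer note: the new `get_js_rs`/`thread_run` helpers run blocking calls in a daemon thread so a hung PhantomJS `execute_script` cannot stall the pipeline. A minimal usage sketch, assuming a Selenium driver is already open (`scripts_common` is the JS preamble above; `script` stands in for any JS payload):

    from module.Utils import get_js_rs, xpath2css
    title = get_js_rs(browser, 'return document.title;')           # None if it exceeds 20 s
    data = get_js_rs(browser, scripts_common + script, timeout=30)  # script: hypothetical payload
    xpath2css('//div[2]/ul/li[3]')  # -> 'div:nth-child(2)>ul>li:nth-child(3)'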

+ 17 - 14
module/detail/content/featureEngine.py

@@ -54,12 +54,12 @@ function statistic(node,deepth){
                 node.counts_communicateTags += 1;
             }
         }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);    
     }
     var innertext = node.innerText;
     if(innertext){
@@ -133,7 +133,7 @@ function stastic_time(node,_array){
         }
     }

-    if (!_find_flag){
+    if (!_find_flag && node!=document){
         _array_fontSize = new Array();
         getListFontSize(node,_array_fontSize);
         _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min(_array_fontSize)]);
@@ -334,7 +334,8 @@ def encodeInput_byJS(url,targethtml):
             browser.maximize_window()
             start = time.time()

-            data = browser.execute_script(scripts_common+scripts)
+            # data = browser.execute_script(scripts_common+scripts)
+            data = get_js_rs(browser, scripts_common+scripts)
             input_x,list_inner = dealWithScriptOut(data)
             list_label = []
             for item in list_inner:
@@ -352,7 +353,7 @@ def encodeInput_byJS(url,targethtml):
     args = {"url":url,"targethtml":targethtml}
     hd.executeMethod(_method, args)

-def getInput_byJS(url):
+def getInput_byJS(browser, url):
     def label(innerhtml,target_source):
         target_source =re.sub("[\r\n\s]","",str(target_source))
         pattern = ">(.*)<"
@@ -365,12 +366,14 @@ def getInput_byJS(url):
             return 1
         return 0
     try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser, url)
-        browser.maximize_window()
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser, url)
+        # browser.maximize_window()
         
         
-        data,data_time = browser.execute_script(scripts_common+scripts)
+        # data,data_time = browser.execute_script(scripts_common+scripts)
+        data,data_time = get_js_rs(browser, scripts_common+scripts)
+        log('content/time extraction script finished')
         input_x,list_inner,list_xpath = dealWithScriptOut(data)
         if input_x is not None:
             #return [np.expand_dims(np.transpose(pad_sequences(np.transpose(input_x,(1,0)), 155,padding="post", truncating="post", value=0,dtype="float32"),(1,0)),0)],list_inner
@@ -383,9 +386,9 @@ def getInput_byJS(url):
         if re.search("frame",str(e)) is not None:
             err_msg = "#iframe#"
         return None,err_msg
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # finally:
+    #     hd.adddriver(browser)
+    #     debug("release driver")


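Reviewer note: getInput_byJS no longer acquires or releases a driver itself; the caller now owns the driver lifecycle. A hedged sketch of the new calling convention (mirroring extractor.py in this commit):

    browser = hd.getdriver()
    if not hd.loadPage(browser, url):   # loadPage now returns False on timeout
        browser = hd.getdriver()        # fetch a fresh driver and carry on
    flag, data = featureEngine_content.getInput_byJS(browser, url)
    hd.adddriver(browser)               # return the driver to the pool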

+ 54 - 27
module/detail/extractor.py

@@ -87,11 +87,31 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
             continue
         list_legal_time = []
         _flag = -2
-        flag,data = featureEngine_content.getInput_byJS(_url)
+        browser = hd.getdriver()
+        debug("get driver")
+        loadsucess = hd.loadPage(browser, _url)
+        if not loadsucess:
+            browser = hd.getdriver()
+        # browser.maximize_window()
+        flag,data = featureEngine_content.getInput_byJS(browser,_url)
         hasGotten = True
         if flag:
-            x,_,list_xpath,data_time = data
+            x,inner_html,list_xpath,data_time = data
             _index = detailContentPredictor.predict(x)
+
+            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
+            total_annex = len(re.findall(pt, browser.page_source))
+            extract_annex = len(re.findall(pt, inner_html[_index]))
+            if total_annex > extract_annex and _index>5 and len(list_xpath[_index])>0:
+                extract_xpath = list_xpath[_index][0][0]
+                for i in range(_index-1, _index-5, -1):
+                    if len(re.findall(pt, inner_html[i]))== total_annex:
+                        log('rule adjustment: attachments incomplete in model-extracted content')
+                        _index = i
+                        break
+                    elif len(list_xpath[i])>0 and list_xpath[i][0][0] not in extract_xpath:
+                        break
+
             _xpath = list_xpath[_index]
             _xpath.reverse()
             list_xpath_remove_content.append(_xpath)
@@ -102,10 +122,12 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
         else:
             hasGotten = False
             add_err_msg(dict_rule_detail, data)
-             
-        flag,data_title = featureEngine_title.getInput_byJS(_url)
+        flag,data_title = featureEngine_title.getInput_byJS(browser,_url)
+        hd.adddriver(browser)
+        debug("release driver")
         if flag:
             x,_,list_xpath,list_top = data_title
+            log('detail title extracted successfully')
             _index = detailTitlePredictor.predict(x)
             _xpath = list_xpath[_index]
             _xpath.reverse()
@@ -130,7 +152,7 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
             _xpath.append(_xpath_remove[0])
         list_xpaths_content.append(_xpath)
     dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
-    
+
     set_remove_list = None
     for item in list_xpath_remove_content:
         for _xpath_remove in item:
@@ -139,31 +161,36 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
                     set_remove_list = set(_xpath_remove[1])
                 else:
                     set_remove_list = set(_xpath_remove[1])&set_remove_list
-    dict_rule_detail["detail_removeList"] = list(set_remove_list)
-    
+    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list!=None else []
     dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
     dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)

-    try:
-        browser = hd.getdriver()
-        debug("get driver")
-        if len(list_hrefs)>0:
-            hd.loadPage(browser, list_hrefs[-1],)
-            dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
-                                                                                                    {"type":"xpath","rule":dict_rule_detail["detail_date"]},
-                                                                                                    {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
-        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
-            log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
-            dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
-            log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
-        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
-            dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
-        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
-            dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")  
-        
+    # try:
+    browser = hd.getdriver()
+    debug("get driver")
+    if len(list_hrefs)>0:
+        loadsucess = hd.loadPage(browser, list_hrefs[-1],)
+        log('loadPage done')
+        if loadsucess==False:
+            browser = hd.getdriver()
+        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
+                                                                                            {"type":"xpath","rule":dict_rule_detail["detail_date"]},
+                                                                                            {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
+    if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
+        log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
+        # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
+        dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_content"])
+        log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
+    if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
+        # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
+        dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_date"])
+    if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
+        # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
+        dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_title"])
+    # finally:
+    hd.adddriver(browser)
+    debug("release driver")
+
     if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
         dict_rule_detail["flag"] = True
     else:
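Reviewer note: the new attachment check above guards against the model picking a content node that drops downloadable-file links. A condensed, hypothetical restatement of the heuristic (the real code additionally requires `_index>5` and compatible xpaths before walking back):

    import re
    PT = r'<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'

    def adjust_for_attachments(page_source, inner_html, index):
        total = len(re.findall(PT, page_source))
        if total <= len(re.findall(PT, inner_html[index])):
            return index                       # candidate already holds every attachment link
        for i in range(index - 1, max(index - 5, -1), -1):
            if len(re.findall(PT, inner_html[i])) == total:
                return i                       # wider candidate holds all attachments
        return index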

+ 14 - 12
module/detail/title/featureEngine.py

@@ -48,12 +48,12 @@ function statistic(node,deepth){
                 node.counts_communicateTags += 1;
             }
         }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);                
     }
     var innertext = node.innerText;
     if(innertext){
@@ -223,13 +223,14 @@ def dealWithScriptOut(data,sort_index=3):
     else:
         return None

-def getInput_byJS(url):
+def getInput_byJS(browser,url):
     try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser, url)
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser, url)
-        data = browser.execute_script(scripts_common+scripts_title)
+        # data = browser.execute_script(scripts_common+scripts_title)
+        data = get_js_rs(browser, scripts_common+scripts_title)
         deal_data = dealWithScriptOut(data)
         if deal_data is None:
             return False,""
@@ -242,9 +243,9 @@ def getInput_byJS(url):
         if re.search("frame",str(e)) is not None:
             err_msg = "#iframe#"
         return None,err_msg
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # finally:
+        # hd.adddriver(browser)
+        # debug("release driver")

 def encodeInput_byJS(url,targethtml):
     def label(innerhtml,target_source):
@@ -267,7 +268,8 @@ def encodeInput_byJS(url,targethtml):
         browser.maximize_window()
         start = time.time()

-        data = browser.execute_script(scripts_common+scripts_title)
+        # data = browser.execute_script(scripts_common+scripts_title)
+        data = get_js_rs(browser, scripts_common+scripts_title)
         input_x,list_inner,_,_ = dealWithScriptOut(data)
         list_label = []
         for item in list_inner:

+ 5 - 0
module/extractFlow.py

@@ -24,7 +24,10 @@ def ruleExtract(listpage_url):
             result["status_code"] = "404"
             add_err_msg(result, "#网页打不开#")
             return result
+        print('about to extract list-page rules')
         data_listpage = ext_listpage.getRule_listpage(listpage_url)
+        print('list-page processing finished')
+        # print('data_listpage:', data_listpage)
         if data_listpage is None:
             log("data_listpage is None")
             rule_listpage = None
@@ -34,7 +37,9 @@ def ruleExtract(listpage_url):
             result["status_code"] = "201"
         else:
             rule_listpage,list_hrefs = data_listpage
+            print('about to process detail pages')
             rule_detail = ext_detail.getRule_detail(list_hrefs)
+            print('detail-page processing finished')
             result = mergeDict([rule_listpage,rule_detail])
             result["status_code"] = "201"
     except Exception as e:
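Reviewer note: ruleExtract composes the final rule dict from the list-page and detail-page extractors. A hedged sketch of the result shape (keys taken from this diff; the URL is hypothetical):

    result = ruleExtract('http://example.com/news/list')
    # result["status_code"]: "404" if the page cannot be opened, "201" otherwise
    # on success, result merges rule_listpage and rule_detail, with keys such as
    # "listpage_A", "listpage_Date", "detail_content", "detail_date", "detail_title", "flag"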

+ 54 - 30
module/htmlDrawing.py

@@ -14,25 +14,28 @@ import time

 header={
     "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
-    "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
-                cn%2Fuser%2FsimpleSSOLogin",    
+    # "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
+    #             cn%2Fuser%2FsimpleSSOLogin",
     "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
     "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
     "Content-Type": "application/x-www-form-urlencoded",
     "Content-Type": "application/x-www-form-urlencoded",
     "Connection": "Keep-Alive",
     "Connection": "Keep-Alive",
     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
      AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
      AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
     #"Accept-Encoding": "gzip, deflate",
     #"Accept-Encoding": "gzip, deflate",
-    "Origin": "http://uia.hnist.cn",
+    # "Origin": "http://uia.hnist.cn",
     "Upgrade-Insecure-Requests": "1",
     "Upgrade-Insecure-Requests": "1",
     }  
     }  
 
 
 TYPE = "phantomjs"
 TYPE = "phantomjs"
+# TYPE = "chrome"
 current_path = os.path.abspath("/".join(__file__.split("\\")[:-1]))
 driver_paths = {"phantomjs_linux":current_path+"/../driver/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
                 "phantomjs_window":current_path+"/../driver/phantomjs/phantomjs-2.1.1-windows/bin/phantomjs.exe",
                 "chrome_linux":current_path+"/../driver/chromedriver/chromedriver_linux64/chromedriver",
                 "chrome_window":current_path+"/../driver/chromedriver/chromedriver_win32/chromedriver.exe"}

+print(driver_paths)
+


 def getBrowser_phantomJS(platform="linux",straight=False):

@@ -41,12 +44,13 @@ def getBrowser_phantomJS(platform="linux",straight=False):
     else:
         executable_path = driver_paths["phantomjs_window"]
     desired_capabilities= DesiredCapabilities.PHANTOMJS.copy()
+    print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
     for key, value in header.items():
         desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
-    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
+    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
     desired_capabilities["phantomjs.page.settings.loadImages"] = False
     desired_capabilities["phantomjs.page.settings.loadImages"] = False
     desired_capabilities["phantomjs.page.settings.disk-cache"] = False
     desired_capabilities["phantomjs.page.settings.disk-cache"] = False
-    browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])
+    browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=TLSv1'])
     browser_phantomjs.implicitly_wait(10)
     browser_phantomjs.set_script_timeout(20)
     browser_phantomjs.set_page_load_timeout(10)
@@ -61,14 +65,15 @@ def getBrowser_chrome(platform="linux",straight=False):
     chrome_options = webdriver.ChromeOptions()
     prefs = {"profile.managed_default_content_settings.images":2}
     chrome_options.add_experimental_option("prefs",prefs)
-    chrome_options.add_argument('--headless') 
+    chrome_options.add_argument('--headless')
     chrome_options.add_argument('--no-sandbox')
-    chrome_options.add_argument('--user-agent=iphoneMozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
+    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36')
     desired_capabilities= DesiredCapabilities.CHROME.copy()
     desired_capabilities['loggingPrefs'] = { 'performance':'ALL' }
-    browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])
-    browser_chrome.implicitly_wait(10)
-    browser_chrome.set_page_load_timeout(10)
+    browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])  # '--ssl-protocol=any'  TLSv1
+    browser_chrome.implicitly_wait(15)
+    browser_chrome.set_page_load_timeout(15)
+    # browser_chrome = webdriver.Chrome(executable_path=executable_path)

     return browser_chrome

@@ -88,8 +93,9 @@ def getBrowser(type=TYPE,straight=False):

 def getStatus(url):
     try:
-        r = requests.get(url, headers=header, allow_redirects = False,timeout=10)
+        r = requests.get(url, headers=header, allow_redirects = False,timeout=15)
     except Exception as e:
+        log('requests.get error :%s'%e)
         return 404    
     return r.status_code

@@ -106,9 +112,13 @@ def releaseAllDriver():
             try:
                 lock.acquire()
                 wait_count = 0
+                t0 = time.time()
                 while(True):
                     if _queue.full():
                         break
+                    elif time.time()-t0>60:
+                        log('timed out waiting for drivers to be returned; force-releasing all drivers')
+                        break
                     else:
                         wait_count += 1
                         log("waitting for drivers all back..."+str(wait_count)+"qsize:"+str(_queue.qsize()))
@@ -119,6 +129,7 @@ def releaseAllDriver():
                 lock_kill.release()
     t = Thread(target=_method)
     t.start()
+    t.join(100)


@@ -155,7 +166,7 @@ def getdriver():
     global _get_count
     _get_count += 1
     if _get_count>1000:
-        log("get driver 达到调用次数,重新进行初始化")
+        log("get_driver 达到调用次数,重新进行初始化")
         releaseAllDriver()
         _get_count = 0
     lock.acquire()
@@ -198,23 +209,29 @@ def hasDrew(url,list_rule):
     @summary: decide from the xpath rules whether the page needs JS rendering
     @param: url: page URL, list_rule: array of xpath rules
     '''
-    try:
-        r = requests.get(url, headers=header, allow_redirects = False)
-        _encoding = r.encoding
-        if _encoding is None:
-            _encoding = "utf8"
-        dom = html.fromstring(r.content.decode(_encoding))
-        for item in list_rule:
-            if item["type"]=="xpath":
-                if item["rule"] is not None:
-                    list_nodes = dom.xpath(item["rule"])
-                    if len(list_nodes)==0:
-                        return True
-    except Exception as e:
-        error(str(e))
-    return False
+    def hasdrew(url,list_rule):
+        try:
+            r = requests.get(url, headers=header, allow_redirects = False, timeout=10)
+            _encoding = r.encoding
+            if _encoding is None:
+                _encoding = "utf8"
+            dom = html.fromstring(r.content.decode(_encoding))
+            for item in list_rule:
+                if item["type"]=="xpath":
+                    if item["rule"] is not None:
+                        list_nodes = dom.xpath(item["rule"])
+                        if len(list_nodes)==0:
+                            return True
+        except Exception as e:
+            error(str(e))
+        return False
+    rs = thread_run(hasdrew, url,list_rule)
+    if rs != None:
+        return rs
+    else:
+        return False
-def loadPage(browser,url,timeout=20):
+def loadPage(browser,url,timeout=30):
     '''
     @summary: work around selenium page loads that never return: run the load in a thread with a timeout
     '''
@@ -225,7 +242,9 @@ def loadPage(browser,url,timeout=20):
             debug("load "+url+" done")
             debug("load "+url+" done")
         except Exception as e:
         except Exception as e:
             error(str(e))
             error(str(e))
+            log('page load raised an exception: '+str(e))
             if re.search("由于目标计算机积极拒绝",str(e)) is not None:
             if re.search("由于目标计算机积极拒绝",str(e)) is not None:
+                log('log page exception')
                 releaseAllDriver()

     t = Thread(target=_thread_load,args=(browser,url))
@@ -239,9 +258,14 @@ def loadPage(browser,url,timeout=20):
         '''
         #run the thread that releases all resources
         error("driver get方法卡住,强制释放所有资源")
         error("driver get方法卡住,强制释放所有资源")
-        releaseAllDriver()
         stop_thread(t)
-        raise NameError("超时加载"+str(url))
+        log('stop_loadpage thread return false')
+        adddriver(browser)
+        debug("release driver")
+        releaseAllDriver()
+        return False
+        # raise NameError("超时加载"+str(url))
+    return True


 def getSource(url):
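Reviewer note: releaseAllDriver now gives up after 60 s instead of spinning forever waiting for drivers to come back, and the caller joins the worker thread with a 100 s cap; loadPage likewise returns False rather than raising on a stuck get(). A hedged sketch of the timeout pattern used here (the sleep interval is an assumption):

    import time
    t0 = time.time()
    while not _queue.full():        # wait for every driver to be returned
        if time.time() - t0 > 60:   # bail out and force-release after 60 s
            break
        time.sleep(1)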

+ 141 - 130
module/listpage/content/featureEngine.py

@@ -86,13 +86,13 @@ function statistic(node,deepth){
             if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                 node.counts_communicateTags += 1;
             }
-        }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        }        
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);    
     }
     node.counts_tagType = set_tag.size();
     var sum_width = 0;
@@ -379,23 +379,29 @@ function clustering_xpath(array_xpath){


 function search(content_xpath){
-    content_node = getNode_listContent(content_xpath)
-    if(content_node!=null){
-        var array_a_href = statistic_A(content_node);
-        var array_a = array_a_href[0];
-        var array_href = new Array();
-        var array_date = new Array();
-        statistic_time(content_node,array_date);
-        var _clustered_a = clustering_xpath(array_a);
-        var _clustered_date = clustering_xpath(array_date);
-        for(var i=0;i<array_a.length;i++){
-            if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
-                array_href.push(array_a_href[1][i]);
+    try{
+        content_node = getNode_listContent(content_xpath) //get the list-page content node
+        if(content_node!=null){
+            var array_a_href = statistic_A(content_node);
+            var array_a = array_a_href[0];
+            var array_href = new Array();
+            var array_date = new Array();
+            statistic_time(content_node,array_date);
+            var _clustered_a = clustering_xpath(array_a);
+            var _clustered_date = clustering_xpath(array_date);
+            for(var i=0;i<array_a.length;i++){
+                if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
+                    array_href.push(array_a_href[1][i]);
+                }
             }
             }
         }
         }
+        return null;
     }
     }
+    catch(e){
+        return null
+    }
+
 }
 }
 return search(arguments[0]);
 '''
         browser = hd.getdriver()
         browser = hd.getdriver()
         debug("get driver")
         hd.loadPage(browser, url)
+        # data = browser.execute_script(scripts_common+script_content,str_href)
+        data = get_js_rs(browser, scripts_common+script_content,str_href)
         deal_data = dealWithScriptOut(data)
         deal_data = dealWithScriptOut(data)

         if deal_data is None:
@@ -453,8 +460,10 @@ def encodeInput_byJS(url,str_href):

 def getInput_byJS(browser,url,str_href):
     try:
-        hd.loadPage(browser,url)
-        data = browser.execute_script(scripts_common+script_content,str_href)
+        # hd.loadPage(browser,url)
+        # data = browser.execute_script(scripts_common+script_content,str_href)
+        data = get_js_rs(browser, scripts_common+script_content,str_href)
+
         deal_data = dealWithScriptOut(data)
         if deal_data is None:
             return None
@@ -465,8 +474,7 @@ def getInput_byJS(browser,url,str_href):
         error(str(e))
     return None

-def getRule_A_Date(url,content_xpath):
-    
+def getRule_A_Date(browser, url,content_xpath):
     def appendXpath(list_xpath,_xpath):
         if len(list_xpath)==0:
             list_xpath.append(_xpath)
@@ -477,119 +485,122 @@ def getRule_A_Date(url,content_xpath):
                         "listpage_Date":None,
                         "listpage_Date":None,
                         "flag":True,
                         "flag":True,
                         "hasDrew":False}
                         "hasDrew":False}
-    try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser,url)
-        
-        list_a = None
-        for _content_xpath in [content_xpath,"/html"]:
+    # try:
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser,url)
-            
-            data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
-            if data is None:
-                log("A_Date not found with xpath:"+_content_xpath)
-                continue
-            if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
-                list_a = data[0]
-                list_date = data[1]
-                list_hrefs = data[2]
-            if list_a is not None and len(list_a[1])==len(list_date[1]):
-                break
-            else:
-                log("different length of A and Date:with xpath:"+_content_xpath)
-            
-        if list_a is None:
-            log("A_Date not found with all xpath")
-            return None;
-        log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
-        log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
-
-        log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
-        if len(list_a[1])!=len(list_date[1]):
-            dict_Rule_A_Date["flag"] = False
-            add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
-            return dict_Rule_A_Date,list_hrefs
+    list_a = None
+    for _content_xpath in [content_xpath,"/html"]:
+        # data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
+        data = get_js_rs(browser, scripts_common+script_get_A_Date,_content_xpath)
+        if data is None:
+            log("A_Date not found with xpath:"+_content_xpath)
+            continue
+        if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
+            list_a = data[0]
+            list_date = data[1]
+            list_hrefs = data[2]
+        if list_a is not None and len(list_a[1])==len(list_date[1]):
+            log('list_a is not None and len(list_a[1])==len(list_date[1])')
+            break
         else:
-            list_diffindex = list_a[0]
-            _xpath = list_a[1][0]
-            listpage_a = []
-            begin = 0
-            list_diffindex.sort(key=lambda x:x)
-            _jump_flag = False
-            
-            dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
-            _xpath_split = re.split("(\d+)",_xpath)
-            for i in range(len(list_diffindex)):
-                _index = list_diffindex[i]
-                if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
-                    add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
-                    dict_Rule_A_Date["flag"] = False
-                    return dict_Rule_A_Date,list_hrefs
+            log("different length of A and Date:with xpath:"+_content_xpath)
+
+    if list_a is None:
+        log("A_Date not found with all xpath")
+        return None;
+    log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
+    log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
+
+    log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
+    if len(list_a[1])!=len(list_date[1]):
+        dict_Rule_A_Date["flag"] = False
+        add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
+        return dict_Rule_A_Date,list_hrefs
+    else:
+        list_diffindex = list_a[0]
+        _xpath = list_a[1][0]
+        listpage_a = []
+        begin = 0
+        list_diffindex.sort(key=lambda x:x)
+        _jump_flag = False
+
+        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
+        _xpath_split = re.split("(\d+)",_xpath)
+        for i in range(len(list_diffindex)):
+            _index = list_diffindex[i]
+            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
+                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
+                dict_Rule_A_Date["flag"] = False
+                return dict_Rule_A_Date,list_hrefs
+            else:
+                if i==0:
+                    appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
+                    begin = _index+1
+                elif i<len(list_diffindex):
+                    appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
+                    begin = _index+1
                 else:
-                    if i==0:
-                        appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
-                        begin = _index+1
-                    elif i<len(list_diffindex):
-                        appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
-                        begin = _index+1
-                    else:
-                        appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
-                        
-                    
-                    if i==len(list_diffindex)-1:
-                        _group = re.search("/(.*)","".join(_xpath_split[begin:]))
-                        if _group is not None:
-                            appendXpath(listpage_a,_group.group(1))
-                
-            for i in range(len(listpage_a)):
-                if len(listpage_a[i].split("/"))>6:
-                    listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
-            dict_Rule_A_Date["listpage_A"] = listpage_a
-            list_diffindex = list_date[0]
-            _xpath = list_date[1][0]
-            listpage_date = []
-            begin = 0
-            list_diffindex.sort(key=lambda x:x)
-            _jump_flag = False
-            
-            dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
-            _xpath_split = re.split("(\d+)",_xpath)
-            for i in range(len(list_diffindex)):
-                _index = list_diffindex[i]
-                if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
-                    add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
-                    dict_Rule_A_Date["flag"] = False
-                    return dict_Rule_A_Date,list_hrefs
+                    appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
+
+
+                if i==len(list_diffindex)-1:
+                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
+                    if _group is not None:
+                        appendXpath(listpage_a,_group.group(1))
+
+        for i in range(len(listpage_a)):
+            if len(listpage_a[i].split("/"))>6:
+                # listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
+                listpage_a[i] = get_js_rs(browser, scripts_replaceXpath,listpage_a[i])
+        dict_Rule_A_Date["listpage_A"] = listpage_a
+        list_diffindex = list_date[0]
+        _xpath = list_date[1][0]
+        listpage_date = []
+        begin = 0
+        list_diffindex.sort(key=lambda x:x)
+        _jump_flag = False
+
+        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
+        _xpath_split = re.split("(\d+)",_xpath)
+        for i in range(len(list_diffindex)):
+            _index = list_diffindex[i]
+            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
+                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
+                dict_Rule_A_Date["flag"] = False
+                return dict_Rule_A_Date,list_hrefs
+            else:
+                if i==0:
+                    appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
+                    begin = _index+1
+                elif i<len(list_diffindex):
+                    appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
+                    begin = _index+1
                 else:
-                    if i==0:
-                        appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
-                        begin = _index+1
-                    elif i<len(list_diffindex):
-                        appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
-                        begin = _index+1
-                    else:
-                        appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
-                    
-                    if i==len(list_diffindex)-1:
-                        _group = re.search("/(.*)","".join(_xpath_split[begin:]))
-                        if _group is not None:
-                            appendXpath(listpage_date,_group.group(1))
-            
-            
-            for i in range(len(listpage_date)):
-                if len(listpage_date[i].split("/"))>6:
-                    listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])        
-            dict_Rule_A_Date["listpage_Date"] = listpage_date
-            
-        return dict_Rule_A_Date,list_hrefs
+                    appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
+
+                if i==len(list_diffindex)-1:
+                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
+                    if _group is not None:
+                        appendXpath(listpage_date,_group.group(1))
+
+        for i in range(len(listpage_date)):
+            if len(listpage_date[i].split("/"))>6:
+                # listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])
+                listpage_date[i] = get_js_rs(browser, scripts_replaceXpath,listpage_date[i])
+
+        dict_Rule_A_Date["listpage_Date"] = listpage_date
+
+    return dict_Rule_A_Date,list_hrefs


-    except Exception as e:
-        error(str(e))
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # except Exception as e:
+    #     error(str(e))
+    # finally:
+    #     # hd.adddriver(browser)
+    #     # debug("release driver")
+    #     log('getRule_A_Date done')
     return None
     return None

 def dumpLinkContent():
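Reviewer note: the xpath generalization in getRule_A_Date relies on re.split keeping captured digit groups, so the child indices that vary across rows can be located and cut out. A small illustration with a hypothetical xpath:

    import re
    parts = re.split(r'(\d+)', '/html/body/div[2]/ul/li[3]/a')
    # ['/html/body/div[', '2', ']/ul/li[', '3', ']/a']
    # list_diffindex holds the positions of the indices that differ between rows (here, the '3' of li)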

+ 26 - 7
module/listpage/extractor.py

@@ -83,28 +83,47 @@ def getRule_listpage(listpage_url,try_times=3):
     for i in range(try_times):
         browser = hd.getdriver()
         debug("get driver")
+        loadsuccess = hd.loadPage(browser, listpage_url)
+        if not loadsuccess:
+            log('failed to load the list page; retrying')
+            continue
+        log('about to run the list-page content-node script')
+        # with open('d:/html/home_page.html', 'w', encoding='utf-8') as f:
+        #     f.write(browser.page_source)
         data_listpage = featureEngine.getInput_byJS(browser,listpage_url,"")
+        log('list-page content node fetched')
         #print(browser.page_source)
-        hd.adddriver(browser)
-        debug("release driver")
+        # hd.adddriver(browser)
+        # debug("release driver")
         if data_listpage is not None:
             x,_,list_xpath = data_listpage
             _index = listpageContentPredictor.predict(x)
+            log('model prediction of list-page content node done')
             if len(list_xpath[_index])>0:
                 content_xpath = list_xpath[_index][0]
                 #content_xpath = "/html"
                 log("the content_xpath of listpage is "+str(content_xpath))
-                data_rule = featureEngine.getRule_A_Date(listpage_url,content_xpath)
+                data_rule = featureEngine.getRule_A_Date(browser,listpage_url,content_xpath)
+                log('script for list-page links and dates done')
                 if data_rule is not None:
                     dict_rule_A_Date,list_hrefs = data_rule
-                    browser = hd.getdriver()
-                    debug("get driver")
+                    # if dict_rule_A_Date.get('flag', '') == False:
+                    #     return None
+                    # browser = hd.getdriver()
+                    # debug("get driver")
+                    log('begin getTurnRule')
                     turn_data = engine.getTurnRule(browser,listpage_url)
-                    hd.adddriver(browser)
-                    debug("release driver")
+                    log('pagination rule extraction done')
+                    # hd.adddriver(browser)
+                    # debug("release driver")
                     dict_rule_pageTurn,list_listpage_url = turn_data
                     dict_rule_recog = getRecognize_detail_listpage(list_listpage_url, list_hrefs)
+                    log('list-page rule parsing done')
+                    hd.adddriver(browser)
+                    debug("release driver")
                     return mergeDict([dict_rule_A_Date,dict_rule_pageTurn,dict_rule_recog]),list_hrefs
+        hd.adddriver(browser)
+        debug("release driver")
     return None


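Reviewer note: getRule_listpage now threads a single driver through content detection, link/date extraction, and pagination discovery, releasing it once at the end. A condensed sketch of the new flow (names from this diff):

    browser = hd.getdriver()
    if hd.loadPage(browser, listpage_url):
        data_listpage = featureEngine.getInput_byJS(browser, listpage_url, "")
        data_rule = featureEngine.getRule_A_Date(browser, listpage_url, content_xpath)
        turn_data = engine.getTurnRule(browser, listpage_url)
    hd.adddriver(browser)   # single release, instead of acquire/release per step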

+ 166 - 93
module/listpage/pageTurn/engine.py

@@ -14,15 +14,16 @@ script = '''

 function click_bt(type_click){
     var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
-    var pattern_nextPage = /^\s*.?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
+    var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
-    var pattern_tailPage = /^\s*.?(最?[尾末]一?页|tail|>\|).?s\s*$/
+    var pattern_tailPage = /^\s*(最[尾末]一?页|tail|>\|).?s\s*$/
     list_cluster = clustering_turnPage();
     var pageNum = null;
     var pageNum_jsoup = null;
     var _node_xpath = null;
     var _node_jsoup = null;
     var _node_click = null;
+    var click_message = '';
     for(var i=0;i<list_cluster.length;i++){
     for(var i=0;i<list_cluster.length;i++){
         _node = list_cluster[i][0]
         _node = list_cluster[i][0]
         _type = list_cluster[i][1]
         _type = list_cluster[i][1]
@@ -60,17 +61,42 @@ function click_bt(type_click){
                     }
                     
                 }
+                if(_href==null || _href=="" || _href=="#"){
+                    click_message = 'page-turn link is empty or "#"';
+                }
+                if(_href!=null && _href.indexOf('javascript')>=0){
+                    click_message = 'page-turn link is javascript';
+                }
                 if(_node_click==null){
                     _node_click = _node;
+                }
+
+            }
+            else if(_node.getAttribute("type")=='button'){
+                _node_click = _node;
+                click_message = 'page-turn via element with type="button"';
+            }
+            else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){
+                _href = _node.parentNode.getAttribute("href")
+                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
+                    if(_node_xpath==null){
+                        _node_xpath = getXpath(_node.parentNode);
+                    }
+                    if(_node_jsoup==null){
+                        _node_jsoup = getJsoup(_node.parentNode);
+                    }
+
+                }
+                if(_node_click==null){
+                    _node_click = _node.parentNode;
                 }
-                
-                
+                click_message = 'parent node is the page-turn link';
             }
         }
     }
     if(_node_click!=null){
         _node_click.click();
-        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
     }else{
         var _pattern = null;
         if(type_click=="nextPage"){
@@ -88,11 +114,13 @@ function click_bt(type_click){
                     _node_jsoup = getJsoup(_node);
                 }
                 _node.click();
-                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+                click_message = 'no page-turn button found; an <a> tag is the page-turn link';
+                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
             }
         }
     }
-    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+    if(click_message==''){click_message = 'no page-turn button found at all';}
+    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
 }
 return click_bt(arguments[0]);
 '''
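The tightened `pattern_nextPage` above fixes a mis-click: with the old leading `.?`, the optional character could swallow 最 in 最后一页 ("last page") and the remainder would match 后一页, so the last-page button was clicked as if it were "next page". A quick Python sketch of the revised pattern's intent (the production pattern is the JS regex above):

```python
import re

# Python rendering of the revised JS pattern: the optional leading char must not be 最
pattern_next = re.compile('^\\s*[^最]?([下后]一?页|[下后]一?页\\s*»|»|>|[Nn]ext).?\\s*$')

for label in ['下一页', '后一页', 'Next', '>', '最后一页']:
    print(label, bool(pattern_next.match(label)))
# 下一页/后一页/Next/> match; 最后一页 no longer matches (the old .? version matched it)
```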
@@ -119,7 +147,8 @@ return turnpage_by_pattern(arguments[0]);
 def click_bt_lastPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"lastPage")
+    # _result = browser.execute_script(scripts_common+script,"lastPage")
+    _result = get_js_rs(browser, scripts_common+script,"lastPage")
     if _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
@@ -133,8 +162,9 @@ def click_bt_lastPage(browser):
 def click_bt_nextPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"nextPage")
-    if _result[0]:
+    # _result = browser.execute_script(scripts_common+script,"nextPage")
+    _result = get_js_rs(browser, scripts_common+script,"nextPage", timeout=30)
+    if _result!=None and _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
         for i in range(4):
@@ -147,8 +177,9 @@ def click_bt_nextPage(browser):
 def click_bt_tailPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"tailPage")
-    if _result[0]:
+    # _result = browser.execute_script(scripts_common+script,"tailPage")
+    _result = get_js_rs(browser, scripts_common+script,"tailPage")
+    if _result!=None and  _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
         for i in range(4):
@@ -161,7 +192,8 @@ def click_bt_tailPage(browser):
 def click_bt_pattern(browser,pattern):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script_pattern,pattern)
+    # _result = browser.execute_script(scripts_common+script_pattern,pattern)
+    _result = get_js_rs(browser, scripts_common+script_pattern,pattern)
     if _result:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
@@ -191,6 +223,13 @@ def getRuleOfUrl(first_url,second_url):
     log("pageTurn first_url:\t"+first_url)
     log("pageTurn first_url:\t"+first_url)
     log("pageTurn second_url:\t"+second_url)
     log("pageTurn second_url:\t"+second_url)
     if len(split_all_first)!=len(split_all_second):
     if len(split_all_first)!=len(split_all_second):
+        split_url = second_url.split('/')
+        if split_url[-1]== 'index_2.html':
+            dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
+            dict_rule["listpage_turn_after"] = '.html'
+            dict_rule["listpage_pageBegin"] = 2
+            dict_rule["listpage_pageStep"] = 1
+            return dict_rule
         add_err_msg(dict_rule, "#翻页链接不匹配#")
         add_err_msg(dict_rule, "#翻页链接不匹配#")
         dict_rule["flag"] = False
         dict_rule["flag"] = False
         return dict_rule
         return dict_rule
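The new special case recognizes sites whose second list page is `.../index_2.html` and emits a concatenation rule instead of failing with a mismatch. A sketch of how such a rule would plausibly be consumed, assuming the crawler simply joins `listpage_turn_before + page + listpage_turn_after`, starting at `listpage_pageBegin` and stepping by `listpage_pageStep` (key names come from this hunk; the consuming code is not part of this diff):

```python
def build_listpage_urls(rule, n_pages):
    # expand a concatenation-style page-turn rule into concrete list-page urls
    page = rule["listpage_pageBegin"]
    urls = []
    for _ in range(n_pages):
        urls.append("%s%d%s" % (rule["listpage_turn_before"], page, rule["listpage_turn_after"]))
        page += rule["listpage_pageStep"]
    return urls

rule = {"listpage_turn_before": "http://example.gov.cn/tzgg/index_",  # hypothetical site
        "listpage_turn_after": ".html",
        "listpage_pageBegin": 2,
        "listpage_pageStep": 1}
print(build_listpage_urls(rule, 3))
# ['http://example.gov.cn/tzgg/index_2.html', '...index_3.html', '...index_4.html']
```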
@@ -226,86 +265,119 @@ def getRuleOfUrl(first_url,second_url):
     return dict_rule
 
 def getTurnRule(browser,listpage_url):
-    try:
-        hd.loadPage(browser,listpage_url)
-        first_url = browser.current_url
-        list_listpage_url = []
-        click_flag = True
-        # click next page
-        click_next_1 = click_bt_nextPage(browser)
-        
-        url1 = browser.current_url
-        log("click next bt:"+str(click_next_1))
-        # click next page
-        click_next_2 = click_bt_nextPage(browser)
-        log("click next bt:"+str(click_next_2))
-        list_pageNum1 = click_next_1[1]
-        list_node1 = click_next_1[2]
-        list_pageNum2 = click_next_2[1]
-        list_node2 = click_next_2[2]
-        dict_rule = None
-        url2 = browser.current_url
-        
-        # did a next-page click succeed
-        #click_flag = click_next_1[0] or click_next_2[0]
-        click_flag = click_next_2[0]
-        
-        
-        
-        # click numeric page buttons
-        if not click_flag:
-            # the first next-page click succeeded but the second did not
-            if click_next_1[0]:
-                click_last_1 = click_bt_lastPage(browser)
-                url2 = browser.current_url
-            if not click_next_1[0] or not click_last_1[0]:
-                click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
-                if click_pattern_2:
-                    url2 = browser.current_url
-                click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
-                if click_pattern_1:
-                    url1 = browser.current_url
-                    if url1==first_url:
-                        click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
-                        if click_pattern_3:
-                            url1 = url2
-                            url2 = browser.current_url
-        
-        dict_rule = getRuleOfUrl(url1, url2)
-        list_listpage_url.append(url1)
-        list_listpage_url.append(url2)
-    
-        if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
-            dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
-        elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
-            dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
-        else:
-            dict_rule["listpage_pageNum"] = None
-        dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
-        '''
-        # if pageNum was not recognized, set flag to False
-        if dict_rule["listpage_pageNum"] is None:
-            dict_rule["flag"] = False
-        '''
-        # prefer jsoup, then xpath
-        if list_node1[1]==list_node2[1] and list_node1[1] is not None:
-            dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
-        # adaptation for sites with only 2 pages
-        elif list_node1[1] is not None and list_node2[1] is None:
-            dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
-        elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
-            dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
-        else:
-            dict_rule["listpage_nextPage"] = None
-        
-        # either a next-page button or a concatenation rule is enough
-        if dict_rule["listpage_nextPage"] is not None:
-            dict_rule["flag"] = True
+    '''
+    Derive the page-turn rule (page count, next-page path, etc.) by clicking "next page"
+    or numeric page buttons, plus list_listpage_url (urls of the visited list pages)
+    :param browser: browser object
+    :param listpage_url: list-page url
+    :return:
+    '''
+    # try:
+    # hd.loadPage(browser,listpage_url)
+    first_url = browser.current_url
+    list_listpage_url = []
+    click_flag = True
+    # click next page
+    # click_next_1 = click_bt_nextPage(browser)
+    click_next_1 = thread_run(click_bt_nextPage, browser)
+    url1 = ''
+    url2 = browser.current_url
+    log("click next bt:"+str(click_next_1))
+    # click next page
+    # click_next_2 = click_bt_nextPage(browser)
+    click_next_2 = thread_run(click_bt_nextPage, browser)
+    if click_next_1==None:
+        click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
+                        [None, None]]
+    if click_next_2==None:
+        click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
+                        [None, None]]
+    log("click next bt:"+str(click_next_2))
+    list_pageNum1 = click_next_1[1]
+    list_node1 = click_next_1[2]
+    list_pageNum2 = click_next_2[1]
+    list_node2 = click_next_2[2]
+    dict_rule = None
+    url3 = browser.current_url
+
+    # did a next-page click succeed
+    #click_flag = click_next_1[0] or click_next_2[0]
+    click_flag = click_next_2[0]
+
+
+
+    # click numeric page buttons
+    # if not click_flag:
+    #     # the first next-page click succeeded but the second did not
+    #     log('start numeric paging')
+        # if click_next_1[0]:
+        #     click_last_1 = click_bt_lastPage(browser)
+        #     url2 = browser.current_url
+        #     log('first page-turn succeeded, take the last page as the second page')
+    if not click_next_1[0]: # or not click_last_1[0]
+        log('start numeric paging')
+        # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
+        click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
+        if click_pattern_2:
+            url2 = browser.current_url
+            log('numeric paging, page 2: %s'%url2)
+        # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
+        click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
+        if click_pattern_3:
+            url3 = browser.current_url
+            log('numeric paging, page 3: %s'%url3)
         else:
-            add_err_msg(dict_rule, "#next-page rule not obtained#")
-        return dict_rule,list_listpage_url
-    except Exception as e:
-        error(str(e))
+            # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
+            click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
+            if click_pattern_1:
+                url1 = browser.current_url
+                log('numeric paging, page 1: %s'%url1)
+    if url2 != url3:
+        dict_rule = getRuleOfUrl(url2, url3)
+    elif url1!='' and url2 != url1:
+        dict_rule = getRuleOfUrl(url1, url2)
+    else:
+        dict_rule = getRuleOfUrl(first_url, url2)
+    if click_next_1 != None and len(click_next_1)==4:
+        click_message = click_next_1[3]
+        if click_message!="":
+            add_err_msg(dict_rule, '#%s#'%click_message)
+    if not click_flag:
+        add_err_msg(dict_rule, "#fell back to numeric paging#")
+    list_listpage_url.append(url1)
+    list_listpage_url.append(url2)
+
+    if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
+        dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
+    elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
+        dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
+    else:
+        dict_rule["listpage_pageNum"] = None
+    dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
+    '''
+    # if pageNum was not recognized, set flag to False
+    if dict_rule["listpage_pageNum"] is None:
+        dict_rule["flag"] = False
+    '''
+    # prefer jsoup, then xpath
+    if list_node1[1]==list_node2[1] and list_node1[1] is not None:
+        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
+    # adaptation for sites with only 2 pages
+    elif list_node1[1] is not None and list_node2[1] is None:
+        log('only two pages, applying the two-page adaptation')
+        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
+    elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
+        dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
+    else:
+        dict_rule["listpage_nextPage"] = None
+
+    # either a next-page button or a concatenation rule is enough
+    if dict_rule["listpage_nextPage"] is not None:
+        dict_rule["flag"] = True
+    else:
+        add_err_msg(dict_rule, "#next-page rule not obtained#")
+    return dict_rule,list_listpage_url
+    # except Exception as e:
+    #     error(str(e))
 
 if __name__=="__main__":
     browser = hd.getBrowser()
@@ -323,7 +395,8 @@ if __name__=="__main__":
     return _array
     '''
     
-    data = browser.execute_script(scripts_common+script1)
+    # data = browser.execute_script(scripts_common+script1)
+    data = get_js_rs(browser, scripts_common+script1)
     #browser.maximize_window()
     browser.save_screenshot("112.png")
     for item in data:

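For orientation, a minimal caller sketch for the reworked `getTurnRule`, using only names visible in this diff (`hd.getBrowser`, `hd.loadPage`); note that page loading was moved out of the function, so the caller must load the list page first. Treating `err_msg` as the key written by `add_err_msg` is an assumption:

```python
browser = hd.getBrowser()
url = 'http://www.lzwhg.com/tongzhigonggao/'  # sample url from testInterface.py below
hd.loadPage(browser, url)  # getTurnRule no longer calls hd.loadPage itself
dict_rule, pages = getTurnRule(browser, url)
if dict_rule.get("flag"):
    print("next-page rule:", dict_rule["listpage_nextPage"])
    print("observed list pages:", pages)
else:
    print("no page-turn rule:", dict_rule.get("err_msg"))  # assumed error-message key
```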
+ 34 - 18
module/run_single_server.py

@@ -1,21 +1,25 @@
+# -*- coding: utf-8 -*-
+import sys
+import json
+import re
+import os
+sys.path.append(os.path.abspath("../.."))
 
-from module.Utils import log# -*- coding: utf-8 -*-
+os.environ['KERAS_BACKEND']='tensorflow'
+from module.Utils import log
 """
 """
 Created on Fri Jun  1 18:03:03 2018
 Created on Fri Jun  1 18:03:03 2018
 
 
 @author: DONG
 @author: DONG
 """
 """
-import sys
-import os
-import json
-import re
-sys.path.append(os.path.abspath("../.."))
+
 from module import extractFlow
 from flask import Flask, jsonify
 from flask import abort
 from flask import request
 import time
 import uuid
+from module.Utils import xpath2css
 
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
@@ -30,9 +34,12 @@ def transformInterface(_dict):
     if listpage_a  and listpage_date:
         if listpage_a[0]==listpage_date[0]:
             ruleValue = listpage_a[0]
-            trans_dict["listPageNode"] = {"ruleType":"xpath",
-                                          "ruleValue":ruleValue,
-                                          "ruleKey":""}
+            # trans_dict["listPageNode"] = {"ruleType":"xpath",
+            #                               "ruleValue":ruleValue,
+            #                               "ruleKey":""}
+            trans_dict["listPageNode"] = {"ruleType": "css",
+                                          "ruleValue": xpath2css(ruleValue),
+                                          "ruleKey": ""}
         else:
             flag = False
     else:
@@ -67,8 +74,11 @@ def transformInterface(_dict):
     detail_date = _dict.get("detail_date")
     detail_date = _dict.get("detail_date")
     trans_dict["needDetailTime"] = False
     trans_dict["needDetailTime"] = False
     if detail_date:
     if detail_date:
-        trans_dict["detailDateNode"] = {"ruleType": "xpath",
-                                        "ruleValue": detail_date
+        # trans_dict["detailDateNode"] = {"ruleType": "xpath",
+        #                                 "ruleValue": detail_date
+        #                                 }
+        trans_dict["detailDateNode"] = {"ruleType": "css",
+                                        "ruleValue": xpath2css(detail_date)
                                         }
         trans_dict["needDetailTime"] = True
     else:
@@ -76,16 +86,22 @@ def transformInterface(_dict):
     detail_title = _dict.get("detail_title")
     detail_title = _dict.get("detail_title")
     trans_dict["needDetailTitle"] = False
     trans_dict["needDetailTitle"] = False
     if detail_title:
     if detail_title:
-        trans_dict["detailTitleNode"] = {"ruleType": "xpath",
-                                         "ruleValue": detail_title
+        # trans_dict["detailTitleNode"] = {"ruleType": "xpath",
+        #                                  "ruleValue": detail_title
+        #                                  }
+        trans_dict["detailTitleNode"] = {"ruleType": "css",
+                                         "ruleValue": xpath2css(detail_title)
                                          }
         trans_dict["needDetailTitle"] = True
     else:
         flag = False
     detail_content = _dict.get("detail_content")
     if detail_content:
-        trans_dict["detailContentNode"] = {"ruleType": "xpath",
-                                           "ruleValue": detail_content
+        # trans_dict["detailContentNode"] = {"ruleType": "xpath",
+        #                                    "ruleValue": detail_content
+        #                                    }
+        trans_dict["detailContentNode"] = {"ruleType": "css",
+                                           "ruleValue": xpath2css(detail_content)
                                            }
     else:
         flag = False
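`transformInterface` now hands out css selectors instead of raw xpath, via the `xpath2css` helper added to `module/Utils.py` in this commit. A usage sketch; the expected output assumes the helper's simple substitution behavior (strip `//` and `@`, turn `/` into `>`, and rewrite single-digit `[n]` indexes as `:nth-child(n)`, while attribute predicates pass through as css attribute selectors):

```python
from module.Utils import xpath2css

print(xpath2css('//*[@class="wp"]/div[2]/div[1]/a[1]'))
# *[class="wp"]>div:nth-child(2)>div:nth-child(1)>a:nth-child(1)
```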
@@ -117,7 +133,7 @@ def text_predict():
                 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",listpage_url) is None:
                 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",listpage_url) is None:
                     data["status_code"] = 400
                     data["status_code"] = 400
                     abort(400)
                     abort(400)
-                else:   
+                else:
                     data = extractFlow.ruleExtract(listpage_url)
                 log("done for setting result of listpage:"+str(listpage_url))
                 data["listpage_url"] = listpage_url
@@ -130,7 +146,7 @@ def text_predict():
         log(" time from receive to send: "+str(time.time()-start_time))
         log(" time from receive to send: "+str(time.time()-start_time))
 
 
         data = transformInterface(data)
         data = transformInterface(data)
-        log(str(data))
+        # log(str(data))
 
         _resp = jsonify(data)
         #log(str(data["flag"])+str(data))
@@ -138,5 +154,5 @@ def text_predict():
 
 
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=15015, threaded=True, debug=False)
+    app.run(host='192.168.2.65', port=15015, threaded=True, debug=False) #15015  2.65
     log("ContentExtractor running")
     log("ContentExtractor running")

+ 157 - 19
module/testInterface.py

@@ -345,23 +345,161 @@ list_url = ["http://www.csssyxx.com/xwgk/tzgg",
 _sum = 0
 _count = 0
 ''' '''
-with codecs.open("errorLink.txt","r",encoding="utf8") as f:
-    while(True):
-        line = f.readline().strip()
-        if not line:
-            break
-       
-        a = time.time()
-        # user = {"listpage_url":list_url[0]}
-        user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
-        #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
-        _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
-        resp_json = _resp.content.decode("utf-8")
-        _resp = json.loads(resp_json)
-        print(resp_json)
-        _sum += 1
-        if "flag" in _resp and _resp["flag"]:
-            _count += 1
-            print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
-        print(_count,_sum)
+# with codecs.open("errorLink.txt","r",encoding="utf8") as f:
+#     while(True):
+#         line = f.readline().strip()
+#         if not line:
+#             break
+#
+#         a = time.time()
+#         # user = {"listpage_url":list_url[0]}
+#         user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
+#         #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
+#         _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
+#         resp_json = _resp.content.decode("utf-8")
+#         _resp = json.loads(resp_json)
+#         print(resp_json)
+#         _sum += 1
+#         if "flag" in _resp and _resp["flag"]:
+#             _count += 1
+#             print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
+#         print(_count,_sum)
 
+
+def get_rs(url):
+    user = {"listpage_url": url}
+    _resp = requests.post("http://192.168.2.177:15015/content_extract", json=user, verify=True) #127.0.0.1  177
+    resp_json = _resp.content.decode("utf-8")
+    return resp_json
+
+    # _resp = json.loads(resp_json)
+    # print(resp_json)
+    # print(_resp)
+
+# url = 'http://www.clrmyy.com/Newslist/NewsList.aspx?code=ZPXX'
+# url = 'http://ec.chongchi.com.cn:8080/Ec468Web/ysxjcggg.jsp' # list page too long, js overflowed  # timeout now set
+# url = 'https://tyj.huangshan.gov.cn/content/column/6794951?pageIndex=1'
+# url = 'http://www.yangdong.gov.cn/xwzx/gggs/index.html'  # error when fetching the detail page
+# url = 'https://www.guit.edu.cn/xwzx/tzgg.htm ' # error in the log
+
+# rs = get_rs(url)
+# print(rs)
+
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # page-turn extraction failed
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # all elements failed to extract, normal on rerun
+url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # all elements failed to extract, normal on rerun
+# url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # all elements failed to extract, bug fixed
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # all elements failed to extract, normal on rerun
+# url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # all elements failed to extract, bug fixed
+# url = 'http://hsxzwgk.mas.gov.cn/opennessTarget/?branch_id=57a3df762c262ea9a00aadae&column_code=280200' # homepage extraction failed  # page will not open # 404
+# url = 'http://www.crra.org.cn/news/tongzhi/o1' # hung after "js done / getRule_A_Date done", fixed
+
+# url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # page-turn timeout error, fixed, extraction normal
+
+# # url = 'https://www.neias.cn/news_list.jsp?id=10775' # returns 201, opens fine in a browser; re-extract: #page-turn links do not match##next-page rule not obtained#
+# # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # returns 201, browser opens it very slowly, sometimes normal
+# # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #list-page rule not obtained#  page errors with 504
+#
+# # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # all elements failed to extract, bug fixed; list-page xpath predicted wrong
+# # url = 'http://sz.nxeduyun.com/index.php?r=space/school/portal/content/index&sid=6ce9765e85694be7838c7f7272199346&cid=50160' # list page fetch failed, fixed
+# # url = 'https://www.nbzjy.cn/list-gsgg.html' # #list-page rule not obtained# resolved
+# # url = 'http://www.gdhealth.net.cn/index.php?m=content&c=index&a=lists&catid=38' # # #list-page rule not obtained# Chrome opens it abnormally, another browser is fine
+# # url = 'http://www.kbs.gov.cn/ywdt/tzgg/index.html' #list-page rule not obtained# iframe error, handled
+# # url = 'http://www.xs9z.com/News.asp?PageNo=1&classid=17' # contains an iframe, raised an error, handled
+# # url = 'http://www.tdxbmj.cn/html/qyxw1/index.html' #list-page rule not obtained# optimized; detail-page time without a date raised an error; duplicate tag ids meant only one link was extracted
+# # url = 'http://www.sxsltlyy.com/newslist.php?cid=29'  # list page fetch failed, detail-page xpath wrong; page rendered for selenium differs from the browser; UA issue fixed
+# # url = 'http://view.landtz.com:8092/jj/index' # #list-page rule not obtained# auction site with several vertical icon lists; content_xpath of listpage is //*[@class="wp"]/div[2]/div[1]/a[1]/div[2], predicted wrong
+# # url = 'http://www.hbbidcloud.cn/suizhou/jyxx/004003/004003006/about.html' # #page-turn links do not match##next-page rule not obtained#  the page has no paging mechanism at all
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=3'  #page-turn links do not match##next-page rule not obtained##detail/list page length distinction not recognized#
+# # url = 'https://www.sxeec.com/gpgg/p4.html' ##page-turn links do not match##next-page rule not obtained#  "next page" sits in an <i> tag, the link is on the parent <a>
+# # url = 'http://sthjj.liaoyuan.gov.cn/xxgk/tzgg/' #page-turn links do not match  pattern starts from page 2; page-turn timeout prevented getting the rule; headless mode times out opening the page, normal mode does not
+# # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/index_3.html'  #page-turn links do not match
+# # url = 'http://bj.sxggzyjy.cn/jydt/001001/001001004/001001004001/subPage.html'  #page-turn links do not match##next-page rule not obtained#
+# # url = 'http://www.tlgljs.com/cpzs.html'
+# # url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
+# # url = 'http://www.zqcyl.cn/zlzx/ggl/' # an exception caused the result return to fail
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3'
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=1'
+# # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
+# # url = 'http://www.sxeec.com/gpgg.html'
+# url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
+# url = 'http://bbkx.bb.ah.cn/kxxw/tzgg/index.html'
+# url = 'http://www.lzwhg.com/tongzhigonggao/'
+# url = 'http://www.slwr.gov.cn/zfxxgk/gkml/216/240/257/list_640.htm'  # list-page script exception
+# url = 'http://view.landtz.com:8091/xh/index?resourceStatus=0&useType=&orderBy=0&title='
+# url = 'http://ggzy.yueqing.gov.cn/yqwebnew/jyxx/001009/001009010/'
+# url = 'http://ggzy.xjbt.gov.cn/TPFront/bt5/083003/083003002/083003002006/'
+# url = 'http://www.longmen.gov.cn/xzfbm/xcl/zwgk/bmwj/tzgg/index.html'
+# url = 'http://nyncj.yq.gov.cn/tzgg/'
+url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
+url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
+url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
+url = 'http://www.lzwhg.com/tongzhigonggao/'  # page-turn failed
+rs = get_rs(url)
+print(rs)
+
+
+
+import pandas as pd
+import time
+l = []
+def get_url_root(text):
+    url = re.search('https?:[a-z0-9-./]+\.(cn|com|org|net|gov|edu|biz|cc|mil|top|pub|info)', text)
+    if url:
+        return url.group(0)
+    else:
+        return ''
+def get_url(text):
+    try:
+        url = json.loads(text).get('ruleLink', '')
+        return url
+    except:
+        print('CRAWLER_LINK json loads error:', text)
+        return ''
+
+# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8.csv')[:]
+# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
+
+# df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
+df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
+# df.drop_duplicates(subset=['首页网址'], inplace=True)
+
+#
+# df['url_root'] = df['CRAWLER_LINK '].apply(lambda x:get_url_root(x))
+# df['url'] = df['CRAWLER_LINK '].apply(lambda x:get_url(x))
+# df = df[df['url']!=""]
+# print(len(df))
+# df.drop_duplicates(subset=['url_root'], inplace=True)
+# print(len(df))
+# df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
+# # df = df[100:200]
+df.reset_index(drop=True, inplace=True)
+print(len(df), df.columns)
+t0 = time.time()
+for i in df.index:
+    # if '#list-page rule not obtained#' not in df.loc[i, 'rs']:
+    #     continue
+    t1 = time.time()
+    # url = df.loc[i, 'url']
+    url = df.loc[i, '列表页链接']
+    if not re.match('http', url):
+        l.append('')
+        print(url)
+        continue
+    print(url)
+    rs = get_rs(url)
+    # try:
+    #     url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
+    #     print(url)
+    #     rs = get_rs(url)
+    # except:
+    #     rs = json.dumps({'err_msg': 'json loads link error'})
+    print('elapsed:', time.time()-t1)
+    print(rs)
+    l.append(rs)
+df['rs3'] = pd.Series(l)
+print('done, total elapsed:', time.time()-t0)
+# # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
+# df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
+print('write done, total elapsed:', time.time()-t0)
+# #
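The commit message reports success rates over the 199 test sites (122 fully extracted, 61.3%). A sketch of how such a tally could be computed from the `rs3` column above, assuming the (currently commented-out) `to_excel` step has written the file and each stored response is the JSON returned by `/content_extract` with a boolean `flag` field:

```python
import json
import pandas as pd

df = pd.read_excel('E:/crawl_data/20220526new_url_0531.xlsx')  # output path from the script above
total = len(df)
ok = 0
for rs in df['rs3'].fillna(''):
    try:
        if json.loads(rs).get('flag'):
            ok += 1
    except ValueError:
        pass  # empty or non-JSON rows count as failures
print('%d/%d complete (%.1f%%)' % (ok, total, 100.0 * ok / total))
```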