Browse files

Fix the bugs found so far and improve extraction quality. Of 199 source websites in total, 122 (61.3%) are extracted completely; 31 (15.5%) are partially successful, with no pagination or only dynamic pagination.

lsm 3 years ago
parent
commit
2949be0410

BIN
driver/chromedriver/chromedriver_win32/chromedriver.exe


+ 118 - 5
module/Utils.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 '''
 Created on 2018-12-20

@@ -20,10 +21,70 @@ import logging
 import pickle
 import tensorflow as tf
 from keras import losses
+import threading

 __author__ = 'baniu.yao'

+class MyThread(threading.Thread):
+    def __init__(self, func, args=()):
+        super(MyThread, self).__init__()
+        self.func = func
+        self.args = args
+    def run(self):
+        self.result = self.func(*self.args)
+
+    def get_result(self):
+        try:
+            return self.result
+        except Exception as e:
+            print('JS execution raised an exception:', e)
+            return None
+
+def get_js_rs(browser, script, *arg, timeout=20):
+    '''
+    Execute a script in the browser and return the result, aborting on timeout.
+    :param browser: browser (WebDriver) object
+    :param script: the script to execute
+    :param arg: script arguments
+    :param timeout: timeout in seconds
+    :return: the script result, or None on timeout
+    '''
+    def execute_js():
+        data = browser.execute_script(script, *arg)
+        return data
+    t = MyThread(func=execute_js, args=())
+    t.setDaemon(True)
+    t.start()
+    t.join(timeout)
+    if t.isAlive():
+        print('JS execution timed out')
+        stop_thread(t)
+        return None
+    data = t.get_result()
+    return data
+
+import time
+def thread_run(func, *arg, timeout=30):
+    t = MyThread(func=func, args=(*arg,))
+    t.setDaemon(True)
+    t.start()
+    t.join(timeout)
+    if t.isAlive():
+        print('thread_run timed out')
+    result = t.get_result()
+    return result
+
+def xpath2css(xpath):
+    '''
+    Convert an xpath expression to a css selector.
+    :param xpath: xpath string
+    :return: css selector string
+    '''
+    xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
+    for it in re.finditer('\[(\d+)\]', xpath):
+        xpath = xpath.replace(it.group(0), ':nth-child(%s)'%it.group(1))
+    return xpath

 def get_class_from_frame(fr):
     args, _, _, value_dict = inspect.getargvalues(fr)
@@ -520,6 +581,56 @@ def print_metrics(history):
     plt.show()

 scripts_common = '''
+document.getElementsByClassName = function (Name,e,tag) {
+            var ele = [],
+                allEle,
+                length,
+                i = 0;
+ 
+            if (typeof tag === "undefined" ){
+                tag = "*"
+            }
+ 
+            if (typeof e === "undefined"){
+                e = document;
+            }
+ 
+            allEle = e.getElementsByTagName(tag);
+ 
+            for (length = allEle.length;i < length;i = i + 1){
+                if (allEle[i].className === Name) {
+                    ele.push(allEle[i]);
+                }
+            }
+ 
+            return ele;
+        }
+
+document.countElementById = function (id,e,tag) {
+            var ele = [],
+                allEle,
+                length,
+                i = 0;
+ 
+            if (typeof tag === "undefined" ){
+                tag = "*"
+            }
+ 
+            if (typeof e === "undefined"){
+                e = document;
+            }
+ 
+            allEle = e.getElementsByTagName(tag);
+ 
+            for (length = allEle.length;i < length;i = i + 1){
+                if (allEle[i].id === id) {
+                    ele.push(allEle[i]);
+                }
+            }
+ 
+            return ele;
+        }
+
 /* JS implementation of a Set class */
 function Set() {
     this.dataStore = [];
@@ -664,7 +775,7 @@ function getRemoveList(node,recurse,list_remove){
 }

 function getListXpath(el,list_xpath,getRemove){
-    if (el==document.body){
+    if (el==document || el==document.body){
         return list_xpath;
     }
     if(getRemove){
@@ -678,7 +789,7 @@ function getListXpath(el,list_xpath,getRemove){
     return getListXpath(el.parentNode,list_xpath,getRemove);
 }
 function getXpath(el,b,notfirst){
-    if (el.id !=""){
+    if (el.id !="" && document.countElementById(el.id).length==1){
         var _jump_flag = false;
         if(b!=null){
             for(var i=0;i<b.length;i++){
@@ -691,14 +802,16 @@ function getXpath(el,b,notfirst){
             _jump_flag = true;
         }
         if(!_jump_flag){
-            return '//*[@id=\"'+el.id+'\"]';
+            //return '//*[@id=\"'+el.id+'\"]';
+            return '//'+el.tagName.toLowerCase()+'[@id=\"'+el.id+'\"]';
         }

     }

     if (el.getAttribute("class")!=null && document.getElementsByClassName(el.getAttribute("class")).length==1){
         if(!notfirst){
-            return '//*[@class=\"'+el.getAttribute("class")+'\"]';
+            //return '//*[@class=\"'+el.getAttribute("class")+'\"]';
+            return '//'+el.tagName.toLowerCase()+'[@class=\"'+el.getAttribute("class")+'\"]';
         }

     }
@@ -823,7 +936,7 @@ function clustering(list_hitTag){

 function clustering_turnPage(){
     //var pattern_page = /((?<nextPage>下一?页|>>|>)|(?<lastPage>上一?页|<<|<)|(?<firstPage>首页|第一页)|(?<tailPage>尾页)|(?<other>\.{1,2}|共\d[条页]|\d+\/\d+))/ //phantomjs does not support named groups
-    var pattern_page = /^\s*.?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
+    var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
     var pattern_nextPage = /[Nn]ext/
     var list_hitTag = new Array();

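Reviewer note: the new `get_js_rs`/`thread_run` helpers run blocking calls in a daemon thread so a hung PhantomJS `execute_script` cannot stall the pipeline. A minimal usage sketch, assuming a Selenium driver is already open (`scripts_common` is the JS preamble above; `script` stands in for any JS payload):

    from module.Utils import get_js_rs, xpath2css
    title = get_js_rs(browser, 'return document.title;')           # None if it exceeds 20 s
    data = get_js_rs(browser, scripts_common + script, timeout=30)  # script: hypothetical payload
    xpath2css('//div[2]/ul/li[3]')  # -> 'div:nth-child(2)>ul>li:nth-child(3)'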

+ 17 - 14
module/detail/content/featureEngine.py

@@ -54,12 +54,12 @@ function statistic(node,deepth){
                 node.counts_communicateTags += 1;
             }
         }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);    
     }
     var innertext = node.innerText;
     if(innertext){
@@ -133,7 +133,7 @@ function stastic_time(node,_array){
         }
     }

-    if (!_find_flag){
+    if (!_find_flag && node!=document){
         _array_fontSize = new Array();
         getListFontSize(node,_array_fontSize);
         _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min(_array_fontSize)]);
@@ -334,7 +334,8 @@ def encodeInput_byJS(url,targethtml):
             browser.maximize_window()
             start = time.time()

-            data = browser.execute_script(scripts_common+scripts)
+            # data = browser.execute_script(scripts_common+scripts)
+            data = get_js_rs(browser, scripts_common+scripts)
             input_x,list_inner = dealWithScriptOut(data)
             list_label = []
             for item in list_inner:
@@ -352,7 +353,7 @@ def encodeInput_byJS(url,targethtml):
     args = {"url":url,"targethtml":targethtml}
     hd.executeMethod(_method, args)

-def getInput_byJS(url):
+def getInput_byJS(browser, url):
     def label(innerhtml,target_source):
         target_source =re.sub("[\r\n\s]","",str(target_source))
         pattern = ">(.*)<"
@@ -365,12 +366,14 @@ def getInput_byJS(url):
             return 1
         return 0
     try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser, url)
-        browser.maximize_window()
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser, url)
+        # browser.maximize_window()
         
         
-        data,data_time = browser.execute_script(scripts_common+scripts)
+        # data,data_time = browser.execute_script(scripts_common+scripts)
+        data,data_time = get_js_rs(browser, scripts_common+scripts)
+        log('content/time extraction script finished')
         input_x,list_inner,list_xpath = dealWithScriptOut(data)
         if input_x is not None:
             #return [np.expand_dims(np.transpose(pad_sequences(np.transpose(input_x,(1,0)), 155,padding="post", truncating="post", value=0,dtype="float32"),(1,0)),0)],list_inner
@@ -383,9 +386,9 @@ def getInput_byJS(url):
         if re.search("frame",str(e)) is not None:
             err_msg = "#iframe#"
         return None,err_msg
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # finally:
+    #     hd.adddriver(browser)
+    #     debug("release driver")


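Reviewer note: getInput_byJS no longer acquires or releases a driver itself; the caller now owns the driver lifecycle. A hedged sketch of the new calling convention (mirroring extractor.py in this commit):

    browser = hd.getdriver()
    if not hd.loadPage(browser, url):   # loadPage now returns False on timeout
        browser = hd.getdriver()        # fetch a fresh driver and carry on
    flag, data = featureEngine_content.getInput_byJS(browser, url)
    hd.adddriver(browser)               # return the driver to the pool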

+ 54 - 27
module/detail/extractor.py

@@ -87,11 +87,31 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
             continue
         list_legal_time = []
         _flag = -2
-        flag,data = featureEngine_content.getInput_byJS(_url)
+        browser = hd.getdriver()
+        debug("get driver")
+        loadsucess = hd.loadPage(browser, _url)
+        if not loadsucess:
+            browser = hd.getdriver()
+        # browser.maximize_window()
+        flag,data = featureEngine_content.getInput_byJS(browser,_url)
         hasGotten = True
         if flag:
-            x,_,list_xpath,data_time = data
+            x,inner_html,list_xpath,data_time = data
             _index = detailContentPredictor.predict(x)
+
+            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
+            total_annex = len(re.findall(pt, browser.page_source))
+            extract_annex = len(re.findall(pt, inner_html[_index]))
+            if total_annex > extract_annex and _index>5 and len(list_xpath[_index])>0:
+                extract_xpath = list_xpath[_index][0][0]
+                for i in range(_index-1, _index-5, -1):
+                    if len(re.findall(pt, inner_html[i]))== total_annex:
+                        log('rule adjustment: attachments incomplete in model-extracted content')
+                        _index = i
+                        break
+                    elif len(list_xpath[i])>0 and list_xpath[i][0][0] not in extract_xpath:
+                        break
+
             _xpath = list_xpath[_index]
             _xpath.reverse()
             list_xpath_remove_content.append(_xpath)
@@ -102,10 +122,12 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
         else:
             hasGotten = False
             add_err_msg(dict_rule_detail, data)
-             
-        flag,data_title = featureEngine_title.getInput_byJS(_url)
+        flag,data_title = featureEngine_title.getInput_byJS(browser,_url)
+        hd.adddriver(browser)
+        debug("release driver")
         if flag:
             x,_,list_xpath,list_top = data_title
+            log('detail title extracted successfully')
             _index = detailTitlePredictor.predict(x)
             _xpath = list_xpath[_index]
             _xpath.reverse()
@@ -130,7 +152,7 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
             _xpath.append(_xpath_remove[0])
         list_xpaths_content.append(_xpath)
     dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
-    
+
     set_remove_list = None
     for item in list_xpath_remove_content:
         for _xpath_remove in item:
@@ -139,31 +161,36 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
                     set_remove_list = set(_xpath_remove[1])
                 else:
                     set_remove_list = set(_xpath_remove[1])&set_remove_list
-    dict_rule_detail["detail_removeList"] = list(set_remove_list)
-    
+    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list!=None else []
     dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
     dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)

-    try:
-        browser = hd.getdriver()
-        debug("get driver")
-        if len(list_hrefs)>0:
-            hd.loadPage(browser, list_hrefs[-1],)
-            dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
-                                                                                                    {"type":"xpath","rule":dict_rule_detail["detail_date"]},
-                                                                                                    {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
-        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
-            log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
-            dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
-            log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
-        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
-            dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
-        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
-            dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")  
-        
+    # try:
+    browser = hd.getdriver()
+    debug("get driver")
+    if len(list_hrefs)>0:
+        loadsucess = hd.loadPage(browser, list_hrefs[-1],)
+        log('loadPage done')
+        if loadsucess==False:
+            browser = hd.getdriver()
+        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
+                                                                                            {"type":"xpath","rule":dict_rule_detail["detail_date"]},
+                                                                                            {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
+    if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
+        log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
+        # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
+        dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_content"])
+        log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
+    if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
+        # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
+        dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_date"])
+    if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
+        # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
+        dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_title"])
+    # finally:
+    hd.adddriver(browser)
+    debug("release driver")
+
     if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
         dict_rule_detail["flag"] = True
     else:
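Reviewer note: the new attachment check above guards against the model picking a content node that drops downloadable-file links. A condensed, hypothetical restatement of the heuristic (the real code additionally requires `_index>5` and compatible xpaths before walking back):

    import re
    PT = r'<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'

    def adjust_for_attachments(page_source, inner_html, index):
        total = len(re.findall(PT, page_source))
        if total <= len(re.findall(PT, inner_html[index])):
            return index                       # candidate already holds every attachment link
        for i in range(index - 1, max(index - 5, -1), -1):
            if len(re.findall(PT, inner_html[i])) == total:
                return i                       # wider candidate holds all attachments
        return index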

+ 14 - 12
module/detail/title/featureEngine.py

@@ -48,12 +48,12 @@ function statistic(node,deepth){
                 node.counts_communicateTags += 1;
             }
         }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);                
     }
     var innertext = node.innerText;
     if(innertext){
@@ -223,13 +223,14 @@ def dealWithScriptOut(data,sort_index=3):
     else:
         return None

-def getInput_byJS(url):
+def getInput_byJS(browser,url):
     try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser, url)
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser, url)
-        data = browser.execute_script(scripts_common+scripts_title)
+        # data = browser.execute_script(scripts_common+scripts_title)
+        data = get_js_rs(browser, scripts_common+scripts_title)
         deal_data = dealWithScriptOut(data)
         if deal_data is None:
             return False,""
@@ -242,9 +243,9 @@ def getInput_byJS(url):
         if re.search("frame",str(e)) is not None:
             err_msg = "#iframe#"
         return None,err_msg
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # finally:
+        # hd.adddriver(browser)
+        # debug("release driver")

 def encodeInput_byJS(url,targethtml):
     def label(innerhtml,target_source):
@@ -267,7 +268,8 @@ def encodeInput_byJS(url,targethtml):
         browser.maximize_window()
         start = time.time()

-        data = browser.execute_script(scripts_common+scripts_title)
+        # data = browser.execute_script(scripts_common+scripts_title)
+        data = get_js_rs(browser, scripts_common+scripts_title)
         input_x,list_inner,_,_ = dealWithScriptOut(data)
         list_label = []
         for item in list_inner:

+ 5 - 0
module/extractFlow.py

@@ -24,7 +24,10 @@ def ruleExtract(listpage_url):
             result["status_code"] = "404"
             add_err_msg(result, "#网页打不开#")
             return result
+        print('about to extract list-page rules')
         data_listpage = ext_listpage.getRule_listpage(listpage_url)
+        print('list-page processing finished')
+        # print('data_listpage:', data_listpage)
         if data_listpage is None:
             log("data_listpage is None")
             rule_listpage = None
@@ -34,7 +37,9 @@ def ruleExtract(listpage_url):
             result["status_code"] = "201"
         else:
             rule_listpage,list_hrefs = data_listpage
+            print('about to process detail pages')
             rule_detail = ext_detail.getRule_detail(list_hrefs)
+            print('detail-page processing finished')
             result = mergeDict([rule_listpage,rule_detail])
             result["status_code"] = "201"
     except Exception as e:
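Reviewer note: ruleExtract composes the final rule dict from the list-page and detail-page extractors. A hedged sketch of the result shape (keys taken from this diff; the URL is hypothetical):

    result = ruleExtract('http://example.com/news/list')
    # result["status_code"]: "404" if the page cannot be opened, "201" otherwise
    # on success, result merges rule_listpage and rule_detail, with keys such as
    # "listpage_A", "listpage_Date", "detail_content", "detail_date", "detail_title", "flag"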

+ 54 - 30
module/htmlDrawing.py

@@ -14,25 +14,28 @@ import time

 header={
     "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
-    "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
-                cn%2Fuser%2FsimpleSSOLogin",    
+    # "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
+    #             cn%2Fuser%2FsimpleSSOLogin",
     "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
     "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
     "Content-Type": "application/x-www-form-urlencoded",
     "Content-Type": "application/x-www-form-urlencoded",
     "Connection": "Keep-Alive",
     "Connection": "Keep-Alive",
     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
      AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
      AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
     #"Accept-Encoding": "gzip, deflate",
     #"Accept-Encoding": "gzip, deflate",
-    "Origin": "http://uia.hnist.cn",
+    # "Origin": "http://uia.hnist.cn",
     "Upgrade-Insecure-Requests": "1",
     "Upgrade-Insecure-Requests": "1",
     }  
     }  
 
 
 TYPE = "phantomjs"
 TYPE = "phantomjs"
+# TYPE = "chrome"
 current_path = os.path.abspath("/".join(__file__.split("\\")[:-1]))
 driver_paths = {"phantomjs_linux":current_path+"/../driver/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
                 "phantomjs_window":current_path+"/../driver/phantomjs/phantomjs-2.1.1-windows/bin/phantomjs.exe",
                 "chrome_linux":current_path+"/../driver/chromedriver/chromedriver_linux64/chromedriver",
                 "chrome_window":current_path+"/../driver/chromedriver/chromedriver_win32/chromedriver.exe"}

+print(driver_paths)
+


 def getBrowser_phantomJS(platform="linux",straight=False):

@@ -41,12 +44,13 @@ def getBrowser_phantomJS(platform="linux",straight=False):
     else:
         executable_path = driver_paths["phantomjs_window"]
     desired_capabilities= DesiredCapabilities.PHANTOMJS.copy()
+    print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
     for key, value in header.items():
         desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
-    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
+    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
     desired_capabilities["phantomjs.page.settings.loadImages"] = False
     desired_capabilities["phantomjs.page.settings.loadImages"] = False
     desired_capabilities["phantomjs.page.settings.disk-cache"] = False
     desired_capabilities["phantomjs.page.settings.disk-cache"] = False
-    browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])
+    browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=TLSv1'])
     browser_phantomjs.implicitly_wait(10)
     browser_phantomjs.set_script_timeout(20)
     browser_phantomjs.set_page_load_timeout(10)
@@ -61,14 +65,15 @@ def getBrowser_chrome(platform="linux",straight=False):
     chrome_options = webdriver.ChromeOptions()
     prefs = {"profile.managed_default_content_settings.images":2}
     chrome_options.add_experimental_option("prefs",prefs)
-    chrome_options.add_argument('--headless') 
+    chrome_options.add_argument('--headless')
     chrome_options.add_argument('--no-sandbox')
-    chrome_options.add_argument('--user-agent=iphoneMozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
+    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36')
     desired_capabilities= DesiredCapabilities.CHROME.copy()
     desired_capabilities['loggingPrefs'] = { 'performance':'ALL' }
-    browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])
-    browser_chrome.implicitly_wait(10)
-    browser_chrome.set_page_load_timeout(10)
+    browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])  # '--ssl-protocol=any'  TLSv1
+    browser_chrome.implicitly_wait(15)
+    browser_chrome.set_page_load_timeout(15)
+    # browser_chrome = webdriver.Chrome(executable_path=executable_path)

     return browser_chrome

@@ -88,8 +93,9 @@ def getBrowser(type=TYPE,straight=False):

 def getStatus(url):
     try:
-        r = requests.get(url, headers=header, allow_redirects = False,timeout=10)
+        r = requests.get(url, headers=header, allow_redirects = False,timeout=15)
     except Exception as e:
+        log('requests.get error :%s'%e)
         return 404    
     return r.status_code

@@ -106,9 +112,13 @@ def releaseAllDriver():
             try:
                 lock.acquire()
                 wait_count = 0
+                t0 = time.time()
                 while(True):
                     if _queue.full():
                         break
+                    elif time.time()-t0>60:
+                        log('timed out waiting for drivers to be returned; force-releasing all drivers')
+                        break
                     else:
                         wait_count += 1
                         log("waitting for drivers all back..."+str(wait_count)+"qsize:"+str(_queue.qsize()))
@@ -119,6 +129,7 @@ def releaseAllDriver():
                 lock_kill.release()
     t = Thread(target=_method)
     t.start()
+    t.join(100)


@@ -155,7 +166,7 @@ def getdriver():
     global _get_count
     _get_count += 1
     if _get_count>1000:
-        log("get driver 达到调用次数,重新进行初始化")
+        log("get_driver 达到调用次数,重新进行初始化")
         releaseAllDriver()
         _get_count = 0
     lock.acquire()
@@ -198,23 +209,29 @@ def hasDrew(url,list_rule):
     @summary: decide from the xpath rules whether the page needs JS rendering
     @param: url: page URL, list_rule: array of xpath rules
     '''
-    try:
-        r = requests.get(url, headers=header, allow_redirects = False)
-        _encoding = r.encoding
-        if _encoding is None:
-            _encoding = "utf8"
-        dom = html.fromstring(r.content.decode(_encoding))
-        for item in list_rule:
-            if item["type"]=="xpath":
-                if item["rule"] is not None:
-                    list_nodes = dom.xpath(item["rule"])
-                    if len(list_nodes)==0:
-                        return True
-    except Exception as e:
-        error(str(e))
-    return False
+    def hasdrew(url,list_rule):
+        try:
+            r = requests.get(url, headers=header, allow_redirects = False, timeout=10)
+            _encoding = r.encoding
+            if _encoding is None:
+                _encoding = "utf8"
+            dom = html.fromstring(r.content.decode(_encoding))
+            for item in list_rule:
+                if item["type"]=="xpath":
+                    if item["rule"] is not None:
+                        list_nodes = dom.xpath(item["rule"])
+                        if len(list_nodes)==0:
+                            return True
+        except Exception as e:
+            error(str(e))
+        return False
+    rs = thread_run(hasdrew, url,list_rule)
+    if rs != None:
+        return rs
+    else:
+        return False
-def loadPage(browser,url,timeout=20):
+def loadPage(browser,url,timeout=30):
     '''
     @summary: work around selenium page loads that never return: run the load in a thread with a timeout
     '''
@@ -225,7 +242,9 @@ def loadPage(browser,url,timeout=20):
             debug("load "+url+" done")
             debug("load "+url+" done")
         except Exception as e:
         except Exception as e:
             error(str(e))
             error(str(e))
+            log('page load raised an exception: '+str(e))
             if re.search("由于目标计算机积极拒绝",str(e)) is not None:
             if re.search("由于目标计算机积极拒绝",str(e)) is not None:
+                log('log page exception')
                 releaseAllDriver()

     t = Thread(target=_thread_load,args=(browser,url))
@@ -239,9 +258,14 @@ def loadPage(browser,url,timeout=20):
         '''
         #run the thread that releases all resources
         error("driver get方法卡住,强制释放所有资源")
         error("driver get方法卡住,强制释放所有资源")
-        releaseAllDriver()
         stop_thread(t)
-        raise NameError("超时加载"+str(url))
+        log('stop_loadpage thread return false')
+        adddriver(browser)
+        debug("release driver")
+        releaseAllDriver()
+        return False
+        # raise NameError("超时加载"+str(url))
+    return True


 def getSource(url):
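Reviewer note: releaseAllDriver now gives up after 60 s instead of spinning forever waiting for drivers to come back, and the caller joins the worker thread with a 100 s cap; loadPage likewise returns False rather than raising on a stuck get(). A hedged sketch of the timeout pattern used here (the sleep interval is an assumption):

    import time
    t0 = time.time()
    while not _queue.full():        # wait for every driver to be returned
        if time.time() - t0 > 60:   # bail out and force-release after 60 s
            break
        time.sleep(1)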

+ 141 - 130
module/listpage/content/featureEngine.py

@@ -86,13 +86,13 @@ function statistic(node,deepth){
             if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
                 node.counts_communicateTags += 1;
             }
-        }
-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
+        }        
+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
         }else{
             node.counts_communicateTags += statistic(child,deepth+1);
-        }
-            
+        }*/
+        node.counts_communicateTags += statistic(child,deepth+1);    
     }
     node.counts_tagType = set_tag.size();
     var sum_width = 0;
@@ -379,23 +379,29 @@ function clustering_xpath(array_xpath){


 function search(content_xpath){
-    content_node = getNode_listContent(content_xpath)
-    if(content_node!=null){
-        var array_a_href = statistic_A(content_node);
-        var array_a = array_a_href[0];
-        var array_href = new Array();
-        var array_date = new Array();
-        statistic_time(content_node,array_date);
-        var _clustered_a = clustering_xpath(array_a);
-        var _clustered_date = clustering_xpath(array_date);
-        for(var i=0;i<array_a.length;i++){
-            if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
-                array_href.push(array_a_href[1][i]);
+    try{
+        content_node = getNode_listContent(content_xpath) //get the list-page content node
+        if(content_node!=null){
+            var array_a_href = statistic_A(content_node);
+            var array_a = array_a_href[0];
+            var array_href = new Array();
+            var array_date = new Array();
+            statistic_time(content_node,array_date);
+            var _clustered_a = clustering_xpath(array_a);
+            var _clustered_date = clustering_xpath(array_date);
+            for(var i=0;i<array_a.length;i++){
+                if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
+                    array_href.push(array_a_href[1][i]);
+                }
             }
             }
         }
         }
+        return null;
     }
     }
+    catch(e){
+        return null
+    }
+
 }
 }
 return search(arguments[0]);
 '''
         browser = hd.getdriver()
         browser = hd.getdriver()
         debug("get driver")
         hd.loadPage(browser, url)
+        # data = browser.execute_script(scripts_common+script_content,str_href)
+        data = get_js_rs(browser, scripts_common+script_content,str_href)
         deal_data = dealWithScriptOut(data)
         deal_data = dealWithScriptOut(data)

         if deal_data is None:
@@ -453,8 +460,10 @@ def encodeInput_byJS(url,str_href):

 def getInput_byJS(browser,url,str_href):
     try:
-        hd.loadPage(browser,url)
-        data = browser.execute_script(scripts_common+script_content,str_href)
+        # hd.loadPage(browser,url)
+        # data = browser.execute_script(scripts_common+script_content,str_href)
+        data = get_js_rs(browser, scripts_common+script_content,str_href)
+
         deal_data = dealWithScriptOut(data)
         if deal_data is None:
             return None
@@ -465,8 +474,7 @@ def getInput_byJS(browser,url,str_href):
         error(str(e))
     return None

-def getRule_A_Date(url,content_xpath):
-    
+def getRule_A_Date(browser, url,content_xpath):
     def appendXpath(list_xpath,_xpath):
         if len(list_xpath)==0:
             list_xpath.append(_xpath)
@@ -477,119 +485,122 @@ def getRule_A_Date(url,content_xpath):
                         "listpage_Date":None,
                         "listpage_Date":None,
                         "flag":True,
                         "flag":True,
                         "hasDrew":False}
                         "hasDrew":False}
-    try:
-        browser = hd.getdriver()
-        debug("get driver")
-        hd.loadPage(browser,url)
-        
-        list_a = None
-        for _content_xpath in [content_xpath,"/html"]:
+    # try:
+        # browser = hd.getdriver()
+        # debug("get driver")
+        # hd.loadPage(browser,url)
-            
-            data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
-            if data is None:
-                log("A_Date not found with xpath:"+_content_xpath)
-                continue
-            if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
-                list_a = data[0]
-                list_date = data[1]
-                list_hrefs = data[2]
-            if list_a is not None and len(list_a[1])==len(list_date[1]):
-                break
-            else:
-                log("different length of A and Date:with xpath:"+_content_xpath)
-            
-        if list_a is None:
-            log("A_Date not found with all xpath")
-            return None;
-        log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
-        log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
-
-        log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
-        if len(list_a[1])!=len(list_date[1]):
-            dict_Rule_A_Date["flag"] = False
-            add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
-            return dict_Rule_A_Date,list_hrefs
+    list_a = None
+    for _content_xpath in [content_xpath,"/html"]:
+        # data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
+        data = get_js_rs(browser, scripts_common+script_get_A_Date,_content_xpath)
+        if data is None:
+            log("A_Date not found with xpath:"+_content_xpath)
+            continue
+        if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
+            list_a = data[0]
+            list_date = data[1]
+            list_hrefs = data[2]
+        if list_a is not None and len(list_a[1])==len(list_date[1]):
+            log('list_a is not None and len(list_a[1])==len(list_date[1])')
+            break
         else:
-            list_diffindex = list_a[0]
-            _xpath = list_a[1][0]
-            listpage_a = []
-            begin = 0
-            list_diffindex.sort(key=lambda x:x)
-            _jump_flag = False
-            
-            dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
-            _xpath_split = re.split("(\d+)",_xpath)
-            for i in range(len(list_diffindex)):
-                _index = list_diffindex[i]
-                if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
-                    add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
-                    dict_Rule_A_Date["flag"] = False
-                    return dict_Rule_A_Date,list_hrefs
+            log("different length of A and Date:with xpath:"+_content_xpath)
+
+    if list_a is None:
+        log("A_Date not found with all xpath")
+        return None;
+    log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
+    log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
+
+    log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
+    if len(list_a[1])!=len(list_date[1]):
+        dict_Rule_A_Date["flag"] = False
+        add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
+        return dict_Rule_A_Date,list_hrefs
+    else:
+        list_diffindex = list_a[0]
+        _xpath = list_a[1][0]
+        listpage_a = []
+        begin = 0
+        list_diffindex.sort(key=lambda x:x)
+        _jump_flag = False
+
+        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
+        _xpath_split = re.split("(\d+)",_xpath)
+        for i in range(len(list_diffindex)):
+            _index = list_diffindex[i]
+            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
+                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
+                dict_Rule_A_Date["flag"] = False
+                return dict_Rule_A_Date,list_hrefs
+            else:
+                if i==0:
+                    appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
+                    begin = _index+1
+                elif i<len(list_diffindex):
+                    appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
+                    begin = _index+1
                 else:
-                    if i==0:
-                        appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
-                        begin = _index+1
-                    elif i<len(list_diffindex):
-                        appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
-                        begin = _index+1
-                    else:
-                        appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
-                        
-                    
-                    if i==len(list_diffindex)-1:
-                        _group = re.search("/(.*)","".join(_xpath_split[begin:]))
-                        if _group is not None:
-                            appendXpath(listpage_a,_group.group(1))
-                
-            for i in range(len(listpage_a)):
-                if len(listpage_a[i].split("/"))>6:
-                    listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
-            dict_Rule_A_Date["listpage_A"] = listpage_a
-            list_diffindex = list_date[0]
-            _xpath = list_date[1][0]
-            listpage_date = []
-            begin = 0
-            list_diffindex.sort(key=lambda x:x)
-            _jump_flag = False
-            
-            dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
-            _xpath_split = re.split("(\d+)",_xpath)
-            for i in range(len(list_diffindex)):
-                _index = list_diffindex[i]
-                if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
-                    add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
-                    dict_Rule_A_Date["flag"] = False
-                    return dict_Rule_A_Date,list_hrefs
+                    appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
+
+
+                if i==len(list_diffindex)-1:
+                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
+                    if _group is not None:
+                        appendXpath(listpage_a,_group.group(1))
+
+        for i in range(len(listpage_a)):
+            if len(listpage_a[i].split("/"))>6:
+                # listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
+                listpage_a[i] = get_js_rs(browser, scripts_replaceXpath,listpage_a[i])
+        dict_Rule_A_Date["listpage_A"] = listpage_a
+        list_diffindex = list_date[0]
+        _xpath = list_date[1][0]
+        listpage_date = []
+        begin = 0
+        list_diffindex.sort(key=lambda x:x)
+        _jump_flag = False
+
+        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
+        _xpath_split = re.split("(\d+)",_xpath)
+        for i in range(len(list_diffindex)):
+            _index = list_diffindex[i]
+            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
+                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
+                dict_Rule_A_Date["flag"] = False
+                return dict_Rule_A_Date,list_hrefs
+            else:
+                if i==0:
+                    appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
+                    begin = _index+1
+                elif i<len(list_diffindex):
+                    appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
+                    begin = _index+1
                 else:
-                    if i==0:
-                        appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
-                        begin = _index+1
-                    elif i<len(list_diffindex):
-                        appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
-                        begin = _index+1
-                    else:
-                        appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
-                    
-                    if i==len(list_diffindex)-1:
-                        _group = re.search("/(.*)","".join(_xpath_split[begin:]))
-                        if _group is not None:
-                            appendXpath(listpage_date,_group.group(1))
-            
-            
-            for i in range(len(listpage_date)):
-                if len(listpage_date[i].split("/"))>6:
-                    listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])        
-            dict_Rule_A_Date["listpage_Date"] = listpage_date
-            
-        return dict_Rule_A_Date,list_hrefs
+                    appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
+
+                if i==len(list_diffindex)-1:
+                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
+                    if _group is not None:
+                        appendXpath(listpage_date,_group.group(1))
+
+        for i in range(len(listpage_date)):
+            if len(listpage_date[i].split("/"))>6:
+                # listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])
+                listpage_date[i] = get_js_rs(browser, scripts_replaceXpath,listpage_date[i])
+
+        dict_Rule_A_Date["listpage_Date"] = listpage_date
+
+    return dict_Rule_A_Date,list_hrefs


-    except Exception as e:
-        error(str(e))
-    finally:
-        hd.adddriver(browser)
-        debug("release driver")
+    # except Exception as e:
+    #     error(str(e))
+    # finally:
+    #     # hd.adddriver(browser)
+    #     # debug("release driver")
+    #     log('getRule_A_Date done')
     return None
     return None

 def dumpLinkContent():
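Reviewer note: the xpath generalization in getRule_A_Date relies on re.split keeping captured digit groups, so the child indices that vary across rows can be located and cut out. A small illustration with a hypothetical xpath:

    import re
    parts = re.split(r'(\d+)', '/html/body/div[2]/ul/li[3]/a')
    # ['/html/body/div[', '2', ']/ul/li[', '3', ']/a']
    # list_diffindex holds the positions of the indices that differ between rows (here, the '3' of li)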

+ 26 - 7
module/listpage/extractor.py

@@ -83,28 +83,47 @@ def getRule_listpage(listpage_url,try_times=3):
     for i in range(try_times):
         browser = hd.getdriver()
         debug("get driver")
+        loadsuccess = hd.loadPage(browser, listpage_url)
+        if not loadsuccess:
+            log('failed to load the list page; retrying')
+            continue
+        log('about to run the list-page content-node script')
+        # with open('d:/html/home_page.html', 'w', encoding='utf-8') as f:
+        #     f.write(browser.page_source)
         data_listpage = featureEngine.getInput_byJS(browser,listpage_url,"")
+        log('list-page content node fetched')
         #print(browser.page_source)
-        hd.adddriver(browser)
-        debug("release driver")
+        # hd.adddriver(browser)
+        # debug("release driver")
         if data_listpage is not None:
             x,_,list_xpath = data_listpage
             _index = listpageContentPredictor.predict(x)
+            log('model prediction of list-page content node done')
             if len(list_xpath[_index])>0:
                 content_xpath = list_xpath[_index][0]
                 #content_xpath = "/html"
                 log("the content_xpath of listpage is "+str(content_xpath))
-                data_rule = featureEngine.getRule_A_Date(listpage_url,content_xpath)
+                data_rule = featureEngine.getRule_A_Date(browser,listpage_url,content_xpath)
+                log('script for list-page links and dates done')
                 if data_rule is not None:
                     dict_rule_A_Date,list_hrefs = data_rule
-                    browser = hd.getdriver()
-                    debug("get driver")
+                    # if dict_rule_A_Date.get('flag', '') == False:
+                    #     return None
+                    # browser = hd.getdriver()
+                    # debug("get driver")
+                    log('begin getTurnRule')
                     turn_data = engine.getTurnRule(browser,listpage_url)
-                    hd.adddriver(browser)
-                    debug("release driver")
+                    log('pagination rule extraction done')
+                    # hd.adddriver(browser)
+                    # debug("release driver")
                     dict_rule_pageTurn,list_listpage_url = turn_data
                     dict_rule_recog = getRecognize_detail_listpage(list_listpage_url, list_hrefs)
+                    log('list-page rule parsing done')
+                    hd.adddriver(browser)
+                    debug("release driver")
                     return mergeDict([dict_rule_A_Date,dict_rule_pageTurn,dict_rule_recog]),list_hrefs
+        hd.adddriver(browser)
+        debug("release driver")
     return None


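Reviewer note: getRule_listpage now threads a single driver through content detection, link/date extraction, and pagination discovery, releasing it once at the end. A condensed sketch of the new flow (names from this diff):

    browser = hd.getdriver()
    if hd.loadPage(browser, listpage_url):
        data_listpage = featureEngine.getInput_byJS(browser, listpage_url, "")
        data_rule = featureEngine.getRule_A_Date(browser, listpage_url, content_xpath)
        turn_data = engine.getTurnRule(browser, listpage_url)
    hd.adddriver(browser)   # single release, instead of acquire/release per step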

+ 166 - 93
module/listpage/pageTurn/engine.py

@@ -14,15 +14,16 @@ script = '''

 function click_bt(type_click){
     var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
-    var pattern_nextPage = /^\s*.?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
+    var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
-    var pattern_tailPage = /^\s*.?(最?[尾末]一?页|tail|>\|).?s\s*$/
+    var pattern_tailPage = /^\s*(最[尾末]一?页|tail|>\|).?s\s*$/
     list_cluster = clustering_turnPage();
     var pageNum = null;
     var pageNum_jsoup = null;
     var _node_xpath = null;
     var _node_jsoup = null;
     var _node_click = null;
+    var click_message = '';
     for(var i=0;i<list_cluster.length;i++){
     for(var i=0;i<list_cluster.length;i++){
         _node = list_cluster[i][0]
         _node = list_cluster[i][0]
         _type = list_cluster[i][1]
         _type = list_cluster[i][1]
@@ -60,17 +61,42 @@ function click_bt(type_click){
                     }
                     
                 }
+                if(_href==null || _href=="" || _href=="#"){
+                    click_message = 'page-turn link is empty or "#"';
+                }
+                if(_href!=null && _href.indexOf('javascript')>=0){
+                    click_message = 'page-turn link is javascript';
+                }
                 if(_node_click==null){
                     _node_click = _node;
+                }
+
+            }
+            else if(_node.getAttribute("type")=='button'){
+                _node_click = _node;
+                click_message = 'page-turn via element with type="button"';
+            }
+            else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){
+                _href = _node.parentNode.getAttribute("href")
+                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
+                    if(_node_xpath==null){
+                        _node_xpath = getXpath(_node.parentNode);
+                    }
+                    if(_node_jsoup==null){
+                        _node_jsoup = getJsoup(_node.parentNode);
+                    }
+
+                }
+                if(_node_click==null){
+                    _node_click = _node.parentNode;
                 }
-                
-                
+                click_message = 'parent node is the page-turn link';
             }
         }
     }
     if(_node_click!=null){
         _node_click.click();
-        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
     }else{
         var _pattern = null;
         if(type_click=="nextPage"){
@@ -88,11 +114,13 @@ function click_bt(type_click){
                     _node_jsoup = getJsoup(_node);
                 }
                 _node.click();
-                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+                click_message = 'no page-turn button found; an <a> tag is the page-turn link';
+                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
             }
         }
     }
-    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
+    if(click_message==''){click_message = 'no page-turn button found at all';}
+    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
 }
 return click_bt(arguments[0]);
 '''
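The tightened `pattern_nextPage` above fixes a mis-click: with the old leading `.?`, the optional character could swallow 最 in 最后一页 ("last page") and the remainder would match 后一页, so the last-page button was clicked as if it were "next page". A quick Python sketch of the revised pattern's intent (the production pattern is the JS regex above):

```python
import re

# Python rendering of the revised JS pattern: the optional leading char must not be 最
pattern_next = re.compile('^\\s*[^最]?([下后]一?页|[下后]一?页\\s*»|»|>|[Nn]ext).?\\s*$')

for label in ['下一页', '后一页', 'Next', '>', '最后一页']:
    print(label, bool(pattern_next.match(label)))
# 下一页/后一页/Next/> match; 最后一页 no longer matches (the old .? version matched it)
```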
@@ -119,7 +147,8 @@ return turnpage_by_pattern(arguments[0]);
 def click_bt_lastPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"lastPage")
+    # _result = browser.execute_script(scripts_common+script,"lastPage")
+    _result = get_js_rs(browser, scripts_common+script,"lastPage")
     if _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
@@ -133,8 +162,9 @@ def click_bt_lastPage(browser):
 def click_bt_nextPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"nextPage")
-    if _result[0]:
+    # _result = browser.execute_script(scripts_common+script,"nextPage")
+    _result = get_js_rs(browser, scripts_common+script,"nextPage", timeout=30)
+    if _result!=None and _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
         for i in range(4):
@@ -147,8 +177,9 @@ def click_bt_nextPage(browser):
 def click_bt_tailPage(browser):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script,"tailPage")
-    if _result[0]:
+    # _result = browser.execute_script(scripts_common+script,"tailPage")
+    _result = get_js_rs(browser, scripts_common+script,"tailPage")
+    if _result!=None and  _result[0]:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
         for i in range(4):
@@ -161,7 +192,8 @@ def click_bt_tailPage(browser):
 def click_bt_pattern(browser,pattern):
     _url = browser.current_url
     _window_handles = len(browser.window_handles)
-    _result = browser.execute_script(scripts_common+script_pattern,pattern)
+    # _result = browser.execute_script(scripts_common+script_pattern,pattern)
+    _result = get_js_rs(browser, scripts_common+script_pattern,pattern)
     if _result:
         if len(browser.window_handles)>_window_handles:
             switch_window(browser)
@@ -191,6 +223,13 @@ def getRuleOfUrl(first_url,second_url):
     log("pageTurn first_url:\t"+first_url)
     log("pageTurn first_url:\t"+first_url)
     log("pageTurn second_url:\t"+second_url)
     log("pageTurn second_url:\t"+second_url)
     if len(split_all_first)!=len(split_all_second):
     if len(split_all_first)!=len(split_all_second):
+        split_url = second_url.split('/')
+        if split_url[-1]== 'index_2.html':
+            dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
+            dict_rule["listpage_turn_after"] = '.html'
+            dict_rule["listpage_pageBegin"] = 2
+            dict_rule["listpage_pageStep"] = 1
+            return dict_rule
         add_err_msg(dict_rule, "#翻页链接不匹配#")
         add_err_msg(dict_rule, "#翻页链接不匹配#")
         dict_rule["flag"] = False
         dict_rule["flag"] = False
         return dict_rule
         return dict_rule
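The new special case recognizes sites whose second list page is `.../index_2.html` and emits a concatenation rule instead of failing with a mismatch. A sketch of how such a rule would plausibly be consumed, assuming the crawler simply joins `listpage_turn_before + page + listpage_turn_after`, starting at `listpage_pageBegin` and stepping by `listpage_pageStep` (key names come from this hunk; the consuming code is not part of this diff):

```python
def build_listpage_urls(rule, n_pages):
    # expand a concatenation-style page-turn rule into concrete list-page urls
    page = rule["listpage_pageBegin"]
    urls = []
    for _ in range(n_pages):
        urls.append("%s%d%s" % (rule["listpage_turn_before"], page, rule["listpage_turn_after"]))
        page += rule["listpage_pageStep"]
    return urls

rule = {"listpage_turn_before": "http://example.gov.cn/tzgg/index_",  # hypothetical site
        "listpage_turn_after": ".html",
        "listpage_pageBegin": 2,
        "listpage_pageStep": 1}
print(build_listpage_urls(rule, 3))
# ['http://example.gov.cn/tzgg/index_2.html', '...index_3.html', '...index_4.html']
```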
@@ -226,86 +265,119 @@ def getRuleOfUrl(first_url,second_url):
     return dict_rule
 
 def getTurnRule(browser,listpage_url):
-    try:
-        hd.loadPage(browser,listpage_url)
-        first_url = browser.current_url
-        list_listpage_url = []
-        click_flag = True
-        # click next page
-        click_next_1 = click_bt_nextPage(browser)
-        
-        url1 = browser.current_url
-        log("click next bt:"+str(click_next_1))
-        # click next page
-        click_next_2 = click_bt_nextPage(browser)
-        log("click next bt:"+str(click_next_2))
-        list_pageNum1 = click_next_1[1]
-        list_node1 = click_next_1[2]
-        list_pageNum2 = click_next_2[1]
-        list_node2 = click_next_2[2]
-        dict_rule = None
-        url2 = browser.current_url
-        
-        # did a next-page click succeed
-        #click_flag = click_next_1[0] or click_next_2[0]
-        click_flag = click_next_2[0]
-        
-        
-        
-        # click numeric page buttons
-        if not click_flag:
-            # the first next-page click succeeded but the second did not
-            if click_next_1[0]:
-                click_last_1 = click_bt_lastPage(browser)
-                url2 = browser.current_url
-            if not click_next_1[0] or not click_last_1[0]:
-                click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
-                if click_pattern_2:
-                    url2 = browser.current_url
-                click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
-                if click_pattern_1:
-                    url1 = browser.current_url
-                    if url1==first_url:
-                        click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
-                        if click_pattern_3:
-                            url1 = url2
-                            url2 = browser.current_url
-        
-        dict_rule = getRuleOfUrl(url1, url2)
-        list_listpage_url.append(url1)
-        list_listpage_url.append(url2)
-    
-        if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
-            dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
-        elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
-            dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
-        else:
-            dict_rule["listpage_pageNum"] = None
-        dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
-        '''
-        # if pageNum was not recognized, set flag to False
-        if dict_rule["listpage_pageNum"] is None:
-            dict_rule["flag"] = False
-        '''
-        # prefer jsoup, then xpath
-        if list_node1[1]==list_node2[1] and list_node1[1] is not None:
-            dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
-        # adaptation for sites with only 2 pages
-        elif list_node1[1] is not None and list_node2[1] is None:
-            dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
-        elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
-            dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
-        else:
-            dict_rule["listpage_nextPage"] = None
-        
-        # either a next-page button or a concatenation rule is enough
-        if dict_rule["listpage_nextPage"] is not None:
-            dict_rule["flag"] = True
+    '''
+    Derive the page-turn rule (page count, next-page path, etc.) by clicking "next page"
+    or numeric page buttons, plus list_listpage_url (urls of the visited list pages)
+    :param browser: browser object
+    :param listpage_url: list-page url
+    :return:
+    '''
+    # try:
+    # hd.loadPage(browser,listpage_url)
+    first_url = browser.current_url
+    list_listpage_url = []
+    click_flag = True
+    # click next page
+    # click_next_1 = click_bt_nextPage(browser)
+    click_next_1 = thread_run(click_bt_nextPage, browser)
+    url1 = ''
+    url2 = browser.current_url
+    log("click next bt:"+str(click_next_1))
+    # click next page
+    # click_next_2 = click_bt_nextPage(browser)
+    click_next_2 = thread_run(click_bt_nextPage, browser)
+    if click_next_1==None:
+        click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
+                        [None, None]]
+    if click_next_2==None:
+        click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
+                        [None, None]]
+    log("click next bt:"+str(click_next_2))
+    list_pageNum1 = click_next_1[1]
+    list_node1 = click_next_1[2]
+    list_pageNum2 = click_next_2[1]
+    list_node2 = click_next_2[2]
+    dict_rule = None
+    url3 = browser.current_url
+
+    # did a next-page click succeed
+    #click_flag = click_next_1[0] or click_next_2[0]
+    click_flag = click_next_2[0]
+
+
+
+    # click numeric page buttons
+    # if not click_flag:
+    #     # the first next-page click succeeded but the second did not
+    #     log('start numeric paging')
+        # if click_next_1[0]:
+        #     click_last_1 = click_bt_lastPage(browser)
+        #     url2 = browser.current_url
+        #     log('first page-turn succeeded, take the last page as the second page')
+    if not click_next_1[0]: # or not click_last_1[0]
+        log('start numeric paging')
+        # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
+        click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
+        if click_pattern_2:
+            url2 = browser.current_url
+            log('numeric paging, page 2: %s'%url2)
+        # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
+        click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
+        if click_pattern_3:
+            url3 = browser.current_url
+            log('numeric paging, page 3: %s'%url3)
         else:
-            add_err_msg(dict_rule, "#next-page rule not obtained#")
-        return dict_rule,list_listpage_url
-    except Exception as e:
-        error(str(e))
+            # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
+            click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
+            if click_pattern_1:
+                url1 = browser.current_url
+                log('numeric paging, page 1: %s'%url1)
+    if url2 != url3:
+        dict_rule = getRuleOfUrl(url2, url3)
+    elif url1!='' and url2 != url1:
+        dict_rule = getRuleOfUrl(url1, url2)
+    else:
+        dict_rule = getRuleOfUrl(first_url, url2)
+    if click_next_1 != None and len(click_next_1)==4:
+        click_message = click_next_1[3]
+        if click_message!="":
+            add_err_msg(dict_rule, '#%s#'%click_message)
+    if not click_flag:
+        add_err_msg(dict_rule, "#fell back to numeric paging#")
+    list_listpage_url.append(url1)
+    list_listpage_url.append(url2)
+
+    if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
+        dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
+    elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
+        dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
+    else:
+        dict_rule["listpage_pageNum"] = None
+    dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
+    '''
+    # if pageNum was not recognized, set flag to False
+    if dict_rule["listpage_pageNum"] is None:
+        dict_rule["flag"] = False
+    '''
+    # prefer jsoup, then xpath
+    if list_node1[1]==list_node2[1] and list_node1[1] is not None:
+        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
+    # adaptation for sites with only 2 pages
+    elif list_node1[1] is not None and list_node2[1] is None:
+        log('only two pages, applying the two-page adaptation')
+        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
+    elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
+        dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
+    else:
+        dict_rule["listpage_nextPage"] = None
+
+    # either a next-page button or a concatenation rule is enough
+    if dict_rule["listpage_nextPage"] is not None:
+        dict_rule["flag"] = True
+    else:
+        add_err_msg(dict_rule, "#next-page rule not obtained#")
+    return dict_rule,list_listpage_url
+    # except Exception as e:
+    #     error(str(e))
 
 if __name__=="__main__":
     browser = hd.getBrowser()
@@ -323,7 +395,8 @@ if __name__=="__main__":
     return _array
     '''
     
-    data = browser.execute_script(scripts_common+script1)
+    # data = browser.execute_script(scripts_common+script1)
+    data = get_js_rs(browser, scripts_common+script1)
     #browser.maximize_window()
     browser.save_screenshot("112.png")
     for item in data:

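For orientation, a minimal caller sketch for the reworked `getTurnRule`, using only names visible in this diff (`hd.getBrowser`, `hd.loadPage`); note that page loading was moved out of the function, so the caller must load the list page first. Treating `err_msg` as the key written by `add_err_msg` is an assumption:

```python
browser = hd.getBrowser()
url = 'http://www.lzwhg.com/tongzhigonggao/'  # sample url from testInterface.py below
hd.loadPage(browser, url)  # getTurnRule no longer calls hd.loadPage itself
dict_rule, pages = getTurnRule(browser, url)
if dict_rule.get("flag"):
    print("next-page rule:", dict_rule["listpage_nextPage"])
    print("observed list pages:", pages)
else:
    print("no page-turn rule:", dict_rule.get("err_msg"))  # assumed error-message key
```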
+ 34 - 18
module/run_single_server.py

@@ -1,21 +1,25 @@
+# -*- coding: utf-8 -*-
+import sys
+import json
+import re
+import os
+sys.path.append(os.path.abspath("../.."))
 
-from module.Utils import log# -*- coding: utf-8 -*-
+os.environ['KERAS_BACKEND']='tensorflow'
+from module.Utils import log
 """
 """
 Created on Fri Jun  1 18:03:03 2018
 Created on Fri Jun  1 18:03:03 2018
 
 
 @author: DONG
 @author: DONG
 """
 """
-import sys
-import os
-import json
-import re
-sys.path.append(os.path.abspath("../.."))
+
 from module import extractFlow
 from flask import Flask, jsonify
 from flask import abort
 from flask import request
 import time
 import uuid
+from module.Utils import xpath2css
 
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
@@ -30,9 +34,12 @@ def transformInterface(_dict):
     if listpage_a  and listpage_date:
         if listpage_a[0]==listpage_date[0]:
             ruleValue = listpage_a[0]
-            trans_dict["listPageNode"] = {"ruleType":"xpath",
-                                          "ruleValue":ruleValue,
-                                          "ruleKey":""}
+            # trans_dict["listPageNode"] = {"ruleType":"xpath",
+            #                               "ruleValue":ruleValue,
+            #                               "ruleKey":""}
+            trans_dict["listPageNode"] = {"ruleType": "css",
+                                          "ruleValue": xpath2css(ruleValue),
+                                          "ruleKey": ""}
         else:
             flag = False
     else:
@@ -67,8 +74,11 @@ def transformInterface(_dict):
     detail_date = _dict.get("detail_date")
     detail_date = _dict.get("detail_date")
     trans_dict["needDetailTime"] = False
     trans_dict["needDetailTime"] = False
     if detail_date:
     if detail_date:
-        trans_dict["detailDateNode"] = {"ruleType": "xpath",
-                                        "ruleValue": detail_date
+        # trans_dict["detailDateNode"] = {"ruleType": "xpath",
+        #                                 "ruleValue": detail_date
+        #                                 }
+        trans_dict["detailDateNode"] = {"ruleType": "css",
+                                        "ruleValue": xpath2css(detail_date)
                                         }
         trans_dict["needDetailTime"] = True
     else:
@@ -76,16 +86,22 @@ def transformInterface(_dict):
     detail_title = _dict.get("detail_title")
     detail_title = _dict.get("detail_title")
     trans_dict["needDetailTitle"] = False
     trans_dict["needDetailTitle"] = False
     if detail_title:
     if detail_title:
-        trans_dict["detailTitleNode"] = {"ruleType": "xpath",
-                                         "ruleValue": detail_title
+        # trans_dict["detailTitleNode"] = {"ruleType": "xpath",
+        #                                  "ruleValue": detail_title
+        #                                  }
+        trans_dict["detailTitleNode"] = {"ruleType": "css",
+                                         "ruleValue": xpath2css(detail_title)
                                          }
         trans_dict["needDetailTitle"] = True
     else:
         flag = False
     detail_content = _dict.get("detail_content")
     if detail_content:
-        trans_dict["detailContentNode"] = {"ruleType": "xpath",
-                                           "ruleValue": detail_content
+        # trans_dict["detailContentNode"] = {"ruleType": "xpath",
+        #                                    "ruleValue": detail_content
+        #                                    }
+        trans_dict["detailContentNode"] = {"ruleType": "css",
+                                           "ruleValue": xpath2css(detail_content)
                                            }
     else:
         flag = False
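`transformInterface` now hands out css selectors instead of raw xpath, via the `xpath2css` helper added to `module/Utils.py` in this commit. A usage sketch; the expected output assumes the helper's simple substitution behavior (strip `//` and `@`, turn `/` into `>`, and rewrite single-digit `[n]` indexes as `:nth-child(n)`, while attribute predicates pass through as css attribute selectors):

```python
from module.Utils import xpath2css

print(xpath2css('//*[@class="wp"]/div[2]/div[1]/a[1]'))
# *[class="wp"]>div:nth-child(2)>div:nth-child(1)>a:nth-child(1)
```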
@@ -117,7 +133,7 @@ def text_predict():
                 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",listpage_url) is None:
                 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",listpage_url) is None:
                     data["status_code"] = 400
                     data["status_code"] = 400
                     abort(400)
                     abort(400)
-                else:   
+                else:
                     data = extractFlow.ruleExtract(listpage_url)
                 log("done for setting result of listpage:"+str(listpage_url))
                 data["listpage_url"] = listpage_url
@@ -130,7 +146,7 @@ def text_predict():
         log(" time from receive to send: "+str(time.time()-start_time))
         log(" time from receive to send: "+str(time.time()-start_time))
 
 
         data = transformInterface(data)
         data = transformInterface(data)
-        log(str(data))
+        # log(str(data))
 
         _resp = jsonify(data)
         #log(str(data["flag"])+str(data))
@@ -138,5 +154,5 @@ def text_predict():
 
 
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=15015, threaded=True, debug=False)
+    app.run(host='192.168.2.65', port=15015, threaded=True, debug=False) #15015  2.65
     log("ContentExtractor running")
     log("ContentExtractor running")

+ 157 - 19
module/testInterface.py

@@ -345,23 +345,161 @@ list_url = ["http://www.csssyxx.com/xwgk/tzgg",
 _sum = 0
 _count = 0
 ''' '''
-with codecs.open("errorLink.txt","r",encoding="utf8") as f:
-    while(True):
-        line = f.readline().strip()
-        if not line:
-            break
-       
-        a = time.time()
-        # user = {"listpage_url":list_url[0]}
-        user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
-        #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
-        _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
-        resp_json = _resp.content.decode("utf-8")
-        _resp = json.loads(resp_json)
-        print(resp_json)
-        _sum += 1
-        if "flag" in _resp and _resp["flag"]:
-            _count += 1
-            print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
-        print(_count,_sum)
+# with codecs.open("errorLink.txt","r",encoding="utf8") as f:
+#     while(True):
+#         line = f.readline().strip()
+#         if not line:
+#             break
+#
+#         a = time.time()
+#         # user = {"listpage_url":list_url[0]}
+#         user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
+#         #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
+#         _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
+#         resp_json = _resp.content.decode("utf-8")
+#         _resp = json.loads(resp_json)
+#         print(resp_json)
+#         _sum += 1
+#         if "flag" in _resp and _resp["flag"]:
+#             _count += 1
+#             print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
+#         print(_count,_sum)
 
+
+def get_rs(url):
+    user = {"listpage_url": url}
+    _resp = requests.post("http://192.168.2.177:15015/content_extract", json=user, verify=True) #127.0.0.1  177
+    resp_json = _resp.content.decode("utf-8")
+    return resp_json
+
+    # _resp = json.loads(resp_json)
+    # print(resp_json)
+    # print(_resp)
+
+# url = 'http://www.clrmyy.com/Newslist/NewsList.aspx?code=ZPXX'
+# url = 'http://ec.chongchi.com.cn:8080/Ec468Web/ysxjcggg.jsp' # list page too long, js overflowed  # timeout now set
+# url = 'https://tyj.huangshan.gov.cn/content/column/6794951?pageIndex=1'
+# url = 'http://www.yangdong.gov.cn/xwzx/gggs/index.html'  # error when fetching the detail page
+# url = 'https://www.guit.edu.cn/xwzx/tzgg.htm ' # error in the log
+
+# rs = get_rs(url)
+# print(rs)
+
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # page-turn extraction failed
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # all elements failed to extract, normal on rerun
+url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # all elements failed to extract, normal on rerun
+# url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # all elements failed to extract, bug fixed
+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # all elements failed to extract, normal on rerun
+# url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # all elements failed to extract, bug fixed
+# url = 'http://hsxzwgk.mas.gov.cn/opennessTarget/?branch_id=57a3df762c262ea9a00aadae&column_code=280200' # homepage extraction failed  # page will not open # 404
+# url = 'http://www.crra.org.cn/news/tongzhi/o1' # hung after "js done / getRule_A_Date done", fixed
+
+# url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # page-turn timeout error, fixed, extraction normal
+
+# # url = 'https://www.neias.cn/news_list.jsp?id=10775' # returns 201, opens fine in a browser; re-extract: #page-turn links do not match##next-page rule not obtained#
+# # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # returns 201, browser opens it very slowly, sometimes normal
+# # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #list-page rule not obtained#  page errors with 504
+#
+# # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # all elements failed to extract, bug fixed; list-page xpath predicted wrong
+# # url = 'http://sz.nxeduyun.com/index.php?r=space/school/portal/content/index&sid=6ce9765e85694be7838c7f7272199346&cid=50160' # list page fetch failed, fixed
+# # url = 'https://www.nbzjy.cn/list-gsgg.html' # #list-page rule not obtained# resolved
+# # url = 'http://www.gdhealth.net.cn/index.php?m=content&c=index&a=lists&catid=38' # # #list-page rule not obtained# Chrome opens it abnormally, another browser is fine
+# # url = 'http://www.kbs.gov.cn/ywdt/tzgg/index.html' #list-page rule not obtained# iframe error, handled
+# # url = 'http://www.xs9z.com/News.asp?PageNo=1&classid=17' # contains an iframe, raised an error, handled
+# # url = 'http://www.tdxbmj.cn/html/qyxw1/index.html' #list-page rule not obtained# optimized; detail-page time without a date raised an error; duplicate tag ids meant only one link was extracted
+# # url = 'http://www.sxsltlyy.com/newslist.php?cid=29'  # list page fetch failed, detail-page xpath wrong; page rendered for selenium differs from the browser; UA issue fixed
+# # url = 'http://view.landtz.com:8092/jj/index' # #list-page rule not obtained# auction site with several vertical icon lists; content_xpath of listpage is //*[@class="wp"]/div[2]/div[1]/a[1]/div[2], predicted wrong
+# # url = 'http://www.hbbidcloud.cn/suizhou/jyxx/004003/004003006/about.html' # #page-turn links do not match##next-page rule not obtained#  the page has no paging mechanism at all
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=3'  #page-turn links do not match##next-page rule not obtained##detail/list page length distinction not recognized#
+# # url = 'https://www.sxeec.com/gpgg/p4.html' ##page-turn links do not match##next-page rule not obtained#  "next page" sits in an <i> tag, the link is on the parent <a>
+# # url = 'http://sthjj.liaoyuan.gov.cn/xxgk/tzgg/' #page-turn links do not match  pattern starts from page 2; page-turn timeout prevented getting the rule; headless mode times out opening the page, normal mode does not
+# # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/index_3.html'  #page-turn links do not match
+# # url = 'http://bj.sxggzyjy.cn/jydt/001001/001001004/001001004001/subPage.html'  #page-turn links do not match##next-page rule not obtained#
+# # url = 'http://www.tlgljs.com/cpzs.html'
+# # url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
+# # url = 'http://www.zqcyl.cn/zlzx/ggl/' # an exception caused the result return to fail
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3'
+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=1'
+# # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
+# # url = 'http://www.sxeec.com/gpgg.html'
+# url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
+# url = 'http://bbkx.bb.ah.cn/kxxw/tzgg/index.html'
+# url = 'http://www.lzwhg.com/tongzhigonggao/'
+# url = 'http://www.slwr.gov.cn/zfxxgk/gkml/216/240/257/list_640.htm'  # list-page script exception
+# url = 'http://view.landtz.com:8091/xh/index?resourceStatus=0&useType=&orderBy=0&title='
+# url = 'http://ggzy.yueqing.gov.cn/yqwebnew/jyxx/001009/001009010/'
+# url = 'http://ggzy.xjbt.gov.cn/TPFront/bt5/083003/083003002/083003002006/'
+# url = 'http://www.longmen.gov.cn/xzfbm/xcl/zwgk/bmwj/tzgg/index.html'
+# url = 'http://nyncj.yq.gov.cn/tzgg/'
+url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
+url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
+url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
+url = 'http://www.lzwhg.com/tongzhigonggao/'  # page-turn failed
+rs = get_rs(url)
+print(rs)
+
+
+
+import pandas as pd
+import time
+l = []
+def get_url_root(text):
+    url = re.search('https?:[a-z0-9-./]+\.(cn|com|org|net|gov|edu|biz|cc|mil|top|pub|info)', text)
+    if url:
+        return url.group(0)
+    else:
+        return ''
+def get_url(text):
+    try:
+        url = json.loads(text).get('ruleLink', '')
+        return url
+    except:
+        print('CRAWLER_LINK json loads error:', text)
+        return ''
+
+# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8.csv')[:]
+# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
+
+# df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
+df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
+# df.drop_duplicates(subset=['首页网址'], inplace=True)
+
+#
+# df['url_root'] = df['CRAWLER_LINK '].apply(lambda x:get_url_root(x))
+# df['url'] = df['CRAWLER_LINK '].apply(lambda x:get_url(x))
+# df = df[df['url']!=""]
+# print(len(df))
+# df.drop_duplicates(subset=['url_root'], inplace=True)
+# print(len(df))
+# df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
+# # df = df[100:200]
+df.reset_index(drop=True, inplace=True)
+print(len(df), df.columns)
+t0 = time.time()
+for i in df.index:
+    # if '#list-page rule not obtained#' not in df.loc[i, 'rs']:
+    #     continue
+    t1 = time.time()
+    # url = df.loc[i, 'url']
+    url = df.loc[i, '列表页链接']
+    if not re.match('http', url):
+        l.append('')
+        print(url)
+        continue
+    print(url)
+    rs = get_rs(url)
+    # try:
+    #     url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
+    #     print(url)
+    #     rs = get_rs(url)
+    # except:
+    #     rs = json.dumps({'err_msg': 'json loads link error'})
+    print('elapsed:', time.time()-t1)
+    print(rs)
+    l.append(rs)
+df['rs3'] = pd.Series(l)
+print('done, total elapsed:', time.time()-t0)
+# # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
+# df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
+print('write done, total elapsed:', time.time()-t0)
+# #
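The commit message reports success rates over the 199 test sites (122 fully extracted, 61.3%). A sketch of how such a tally could be computed from the `rs3` column above, assuming the (currently commented-out) `to_excel` step has written the file and each stored response is the JSON returned by `/content_extract` with a boolean `flag` field:

```python
import json
import pandas as pd

df = pd.read_excel('E:/crawl_data/20220526new_url_0531.xlsx')  # output path from the script above
total = len(df)
ok = 0
for rs in df['rs3'].fillna(''):
    try:
        if json.loads(rs).get('flag'):
            ok += 1
    except ValueError:
        pass  # empty or non-JSON rows count as failures
print('%d/%d complete (%.1f%%)' % (ok, total, 100.0 * ok / total))
```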