4 年之前 · 2949be0410
--- a/driver/chromedriver/chromedriver_win32/chromedriver.exe
+++ b/driver/chromedriver/chromedriver_win32/chromedriver.exe
--- a/module/Utils.py
+++ b/module/Utils.py
@@ -1,3 +1,4 @@
 
				+# -*- coding: utf-8 -*-
			
 
				 '''
			
 
				 Created on 2018年12月20日
			
 
				 
			
@@ -20,10 +21,70 @@ import logging
 
				 import pickle
			
 
				 import tensorflow as tf
			
 
				 from keras import losses
			
 
				+import threading
			
 
				 
			
 
				 __author__ = 'baniu.yao'
			
 
				 
			
 
				+class MyThread(threading.Thread):
			
 
				+    def __init__(self, func, args=()):
			
 
				+        super(MyThread, self).__init__()
			
 
				+        self.func = func
			
 
				+        self.args = args
			
 
				 
			
 
				+    def run(self):
			
 
				+        self.result = self.func(*self.args)
			
 
				+
			
 
				+    def get_result(self):
			
 
				+        try:
			
 
				+            return self.result
			
 
				+        except Exception as e:
			
 
				+            print('执行js抛出异常：', e)
			
 
				+            return None
			
 
				+
			
 
				+def get_js_rs(browser, script, *arg, timeout=20):
			
 
				+    '''
			
 
				+    浏览器执行脚本，返回结果，超时中断
			
 
				+    :param browser:浏览器对象
			
 
				+    :param script: 脚本
			
 
				+    :param arg:参数
			
 
				+    :param timeout:超时时间
			
 
				+    :return:
			
 
				+    '''
			
 
				+    def execute_js():
			
 
				+        data = browser.execute_script(script, *arg)
			
 
				+        return data
			
 
				+    t = MyThread(func=execute_js, args=())
			
 
				+    t.setDaemon(True)
			
 
				+    t.start()
			
 
				+    t.join(timeout)
			
 
				+    if t.isAlive():
			
 
				+        print('执行js超时')
			
 
				+        stop_thread(t)
			
 
				+        return None
			
 
				+    data = t.get_result()
			
 
				+    return data
			
 
				+
			
 
				+import time
			
 
				+def thread_run(func, *arg, timeout=30):
			
 
				+    t = MyThread(func=func, args=(*arg,))
			
 
				+    t.setDaemon(True)
			
 
				+    t.start()
			
 
				+    t.join(timeout)
			
 
				+    if t.isAlive():
			
 
				+        print('thread_run time out')
			
 
				+    result = t.get_result()
			
 
				+    return result
			
 
				+
			
 
				+def xpath2css(xpath):
			
 
				+    '''
			
 
				+    把xpath路径转为css路径
			
 
				+    :param xpath:
			
 
				+    :return:
			
 
				+    '''
			
 
				+    xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
			
 
				+    for it in re.finditer('\[(\d)\]', xpath):
			
 
				+        xpath = xpath.replace(it.group(0), ':nth-child(%s)'%it.group(1))
			
 
				+    return xpath
			
 
				 
			
 
				 def get_class_from_frame(fr):
			
 
				     args, _, _, value_dict = inspect.getargvalues(fr)
			
@@ -520,6 +581,56 @@ def print_metrics(history):
 
				     plt.show()
			
 
				 
			
 
				 scripts_common = '''
			
 
				+document.getElementsByClassName = function (Name,e,tag) {
			
 
				+            var ele = [],
			
 
				+                allEle,
			
 
				+                length,
			
 
				+                i = 0;
			
 
				+ 
			
 
				+            if (typeof tag === "undefined" ){
			
 
				+                tag = "*"
			
 
				+            }
			
 
				+ 
			
 
				+            if (typeof e === "undefined"){
			
 
				+                e = document;
			
 
				+            }
			
 
				+ 
			
 
				+            allEle = e.getElementsByTagName(tag);
			
 
				+ 
			
 
				+            for (length = allEle.length;i < length;i = i + 1){
			
 
				+                if (allEle[i].className === Name) {
			
 
				+                    ele.push(allEle[i]);
			
 
				+                }
			
 
				+            }
			
 
				+ 
			
 
				+            return ele;
			
 
				+        }
			
 
				+
			
 
				+document.countElementById = function (id,e,tag) {
			
 
				+            var ele = [],
			
 
				+                allEle,
			
 
				+                length,
			
 
				+                i = 0;
			
 
				+ 
			
 
				+            if (typeof tag === "undefined" ){
			
 
				+                tag = "*"
			
 
				+            }
			
 
				+ 
			
 
				+            if (typeof e === "undefined"){
			
 
				+                e = document;
			
 
				+            }
			
 
				+ 
			
 
				+            allEle = e.getElementsByTagName(tag);
			
 
				+ 
			
 
				+            for (length = allEle.length;i < length;i = i + 1){
			
 
				+                if (allEle[i].id === id) {
			
 
				+                    ele.push(allEle[i]);
			
 
				+                }
			
 
				+            }
			
 
				+ 
			
 
				+            return ele;
			
 
				+        }
			
 
				+
			
 
				 /*js集合set类的实现*/
			
 
				 function Set() {
			
 
				     this.dataStore = [];
			
@@ -664,7 +775,7 @@ function getRemoveList(node,recurse,list_remove){
 
				 }
			
 
				 
			
 
				 function getListXpath(el,list_xpath,getRemove){
			
 
				-    if (el==document.body){
			
 
				+    if (el==document || el==document.body){
			
 
				         return list_xpath;
			
 
				     }
			
 
				     if(getRemove){
			
@@ -678,7 +789,7 @@ function getListXpath(el,list_xpath,getRemove){
 
				     return getListXpath(el.parentNode,list_xpath,getRemove);
			
 
				 }
			
 
				 function getXpath(el,b,notfirst){
			
 
				-    if (el.id !=""){
			
 
				+    if (el.id !="" && document.countElementById(el.id).length==1){
			
 
				         var _jump_flag = false;
			
 
				         if(b!=null){
			
 
				             for(var i=0;i<b.length;i++){
			
@@ -691,14 +802,16 @@ function getXpath(el,b,notfirst){
 
				             _jump_flag = true;
			
 
				         }
			
 
				         if(!_jump_flag){
			
 
				-            return '//*[@id=\"'+el.id+'\"]';
			
 
				+            //return '//*[@id=\"'+el.id+'\"]';
			
 
				+            return '//'+el.tagName.toLowerCase()+'[@id=\"'+el.id+'\"]';
			
 
				         }
			
 
				         
			
 
				     }
			
 
				     
			
 
				     if (el.getAttribute("class")!=null && document.getElementsByClassName(el.getAttribute("class")).length==1){
			
 
				         if(!notfirst){
			
 
				-            return '//*[@class=\"'+el.getAttribute("class")+'\"]';
			
 
				+            //return '//*[@class=\"'+el.getAttribute("class")+'\"]';
			
 
				+            return '//'+el.tagName.toLowerCase()+'[@class=\"'+el.getAttribute("class")+'\"]';
			
 
				         }
			
 
				         
			
 
				     }
			
@@ -823,7 +936,7 @@ function clustering(list_hitTag){
 
				 
			
 
				 function clustering_turnPage(){
			
 
				     //var pattern_page = /((?<nextPage>下一?页|>>|>)|(?<lastPage>上一?页|<<|<)|(?<firstPage>首页|第一页)|(?<tailPage>尾页)|(?<other>\.{1,2}|共\d[条页]|\d+\/\d+))/ //phantomjs不支持命名分组
			
 
				-    var pattern_page = /^\s*.?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
			
 
				+    var pattern_page = /^\s*[^最]?\s*([下后]\s*一?\s*页?|[下后]\s*一?\s*页\s*»|[下后]\s*一?\s*页\s*>|[下后]\s*一?\s*页\s*>>|»|>>|>|[Nn]ext)\s*.?\s*$|^\s*.?([前上]\s*一?\s*页?|«\s*[前上]\s*一?\s*页|«|<<|<|[Pp]revious).?\s*$|^\s*.?(首\s*页?|第\s*一\s*页|first|\|<).?\s*$|^\s*.?([尾末]\s*一?\s*页?|tail|>\|).?s\s*$|(^\s*\.{1,2}\s*$|^.{,10}共\s*\d+\s*[条页].{,10}$|^.{,10}\d+\/\d+.{,3}$|^\s*\.{0,2}\d+\s*$|^\s*[gG][oO]\s*$|^.{0,2}跳?转到?)/
			
 
				     var pattern_nextPage = /[Nn]ext/
			
 
				     var list_hitTag = new Array();
			
 
				     
			
--- a/module/detail/content/featureEngine.py
+++ b/module/detail/content/featureEngine.py
@@ -54,12 +54,12 @@ function statistic(node,deepth){
 
				                 node.counts_communicateTags += 1;
			
 
				             }
			
 
				         }
			
 
				-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
			
 
				+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
			
 
				             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
			
 
				         }else{
			
 
				             node.counts_communicateTags += statistic(child,deepth+1);
			
 
				-        }
			
 
				-            
			
 
				+        }*/
			
 
				+        node.counts_communicateTags += statistic(child,deepth+1);    
			
 
				     }
			
 
				     var innertext = node.innerText;
			
 
				     if(innertext){
			
@@ -133,7 +133,7 @@ function stastic_time(node,_array){
 
				         }
			
 
				     }
			
 
				 
			
 
				-    if (!_find_flag){
			
 
				+    if (!_find_flag && node!=document){
			
 
				         _array_fontSize = new Array();
			
 
				         getListFontSize(node,_array_fontSize);
			
 
				         _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min(_array_fontSize)]);
			
@@ -334,7 +334,8 @@ def encodeInput_byJS(url,targethtml):
 
				             browser.maximize_window()
			
 
				             start = time.time()
			
 
				             
			
 
				-            data = browser.execute_script(scripts_common+scripts)
			
 
				+            # data = browser.execute_script(scripts_common+scripts)
			
 
				+            data = get_js_rs(browser, scripts_common+scripts)
			
 
				             input_x,list_inner = dealWithScriptOut(data)
			
 
				             list_label = []
			
 
				             for item in list_inner:
			
@@ -352,7 +353,7 @@ def encodeInput_byJS(url,targethtml):
 
				     args = {"url":url,"targethtml":targethtml}
			
 
				     hd.executeMethod(_method, args)
			
 
				     
			
 
				-def getInput_byJS(url):
			
 
				+def getInput_byJS(browser, url):
			
 
				     def label(innerhtml,target_source):
			
 
				         target_source =re.sub("[\r\n\s]","",str(target_source))
			
 
				         pattern = ">(.*)<"
			
@@ -365,12 +366,14 @@ def getInput_byJS(url):
 
				             return 1
			
 
				         return 0
			
 
				     try:
			
 
				-        browser = hd.getdriver()
			
 
				-        debug("get driver")
			
 
				-        hd.loadPage(browser, url)
			
 
				-        browser.maximize_window()
			
 
				+        # browser = hd.getdriver()
			
 
				+        # debug("get driver")
			
 
				+        # hd.loadPage(browser, url)
			
 
				+        # browser.maximize_window()
			
 
				         
			
 
				-        data,data_time = browser.execute_script(scripts_common+scripts)
			
 
				+        # data,data_time = browser.execute_script(scripts_common+scripts)
			
 
				+        data,data_time = get_js_rs(browser, scripts_common+scripts)
			
 
				+        log('获取正文、时间脚本执行完毕')
			
 
				         input_x,list_inner,list_xpath = dealWithScriptOut(data)
			
 
				         if input_x is not None:
			
 
				             #return [np.expand_dims(np.transpose(pad_sequences(np.transpose(input_x,(1,0)), 155,padding="post", truncating="post", value=0,dtype="float32"),(1,0)),0)],list_inner
			
@@ -383,9 +386,9 @@ def getInput_byJS(url):
 
				         if re.search("frame",str(e)) is not None:
			
 
				             err_msg = "#iframe#"
			
 
				         return None,err_msg
			
 
				-    finally:
			
 
				-        hd.adddriver(browser)
			
 
				-        debug("release driver")
			
 
				+    # finally:
			
 
				+    #     hd.adddriver(browser)
			
 
				+    #     debug("release driver")
			
 
				 
			
 
				 
			
 
				 
			
--- a/module/detail/extractor.py
+++ b/module/detail/extractor.py
@@ -87,11 +87,31 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
 
				             continue
			
 
				         list_legal_time = []
			
 
				         _flag = -2
			
 
				-        flag,data = featureEngine_content.getInput_byJS(_url)
			
 
				+        browser = hd.getdriver()
			
 
				+        debug("get driver")
			
 
				+        loadsucess = hd.loadPage(browser, _url)
			
 
				+        if not loadsucess:
			
 
				+            browser = hd.getdriver()
			
 
				+        # browser.maximize_window()
			
 
				+        flag,data = featureEngine_content.getInput_byJS(browser,_url)
			
 
				         hasGotten = True
			
 
				         if flag:
			
 
				-            x,_,list_xpath,data_time = data
			
 
				+            x,inner_html,list_xpath,data_time = data
			
 
				             _index = detailContentPredictor.predict(x)
			
 
				+
			
 
				+            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
			
 
				+            total_annex = len(re.findall(pt, browser.page_source))
			
 
				+            extract_annex = len(re.findall(pt, inner_html[_index]))
			
 
				+            if total_annex > extract_annex and _index>5 and len(list_xpath[_index])>0:
			
 
				+                extract_xpath = list_xpath[_index][0][0]
			
 
				+                for i in range(_index-1, _index-5, -1):
			
 
				+                    if len(re.findall(pt, inner_html[i]))== total_annex:
			
 
				+                        log('规格调整模型正文提取附件不完整')
			
 
				+                        _index = i
			
 
				+                        break
			
 
				+                    elif len(list_xpath[i])>0 and list_xpath[i][0][0] not in extract_xpath:
			
 
				+                        break
			
 
				+
			
 
				             _xpath = list_xpath[_index]
			
 
				             _xpath.reverse()
			
 
				             list_xpath_remove_content.append(_xpath)
			
@@ -102,10 +122,12 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
 
				         else:
			
 
				             hasGotten = False
			
 
				             add_err_msg(dict_rule_detail, data)
			
 
				-             
			
 
				-        flag,data_title = featureEngine_title.getInput_byJS(_url)
			
 
				+        flag,data_title = featureEngine_title.getInput_byJS(browser,_url)
			
 
				+        hd.adddriver(browser)
			
 
				+        debug("release driver")
			
 
				         if flag:
			
 
				             x,_,list_xpath,list_top = data_title
			
 
				+            log('详情标题获取成功')
			
 
				             _index = detailTitlePredictor.predict(x)
			
 
				             _xpath = list_xpath[_index]
			
 
				             _xpath.reverse()
			
@@ -130,7 +152,7 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
 
				             _xpath.append(_xpath_remove[0])
			
 
				         list_xpaths_content.append(_xpath)
			
 
				     dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
			
 
				-    
			
 
				+
			
 
				     set_remove_list = None
			
 
				     for item in list_xpath_remove_content:
			
 
				         for _xpath_remove in item:
			
@@ -139,31 +161,36 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
 
				                     set_remove_list = set(_xpath_remove[1])
			
 
				                 else:
			
 
				                     set_remove_list = set(_xpath_remove[1])&set_remove_list
			
 
				-    dict_rule_detail["detail_removeList"] = list(set_remove_list)
			
 
				-    
			
 
				+    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list!=None else []
			
 
				     dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
			
 
				     dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)
			
 
				     
			
 
				-    try:
			
 
				-        browser = hd.getdriver()
			
 
				-        debug("get driver")
			
 
				-        if len(list_hrefs)>0:
			
 
				-            hd.loadPage(browser, list_hrefs[-1],)
			
 
				-            dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
			
 
				-                                                                                                    {"type":"xpath","rule":dict_rule_detail["detail_date"]},
			
 
				-                                                                                                    {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
			
 
				-        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
			
 
				-            log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
			
 
				-            dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
			
 
				-            log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
			
 
				-        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
			
 
				-            dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
			
 
				-        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
			
 
				-            dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
			
 
				-    finally:
			
 
				-        hd.adddriver(browser)
			
 
				-        debug("release driver")  
			
 
				-        
			
 
				+    # try:
			
 
				+    browser = hd.getdriver()
			
 
				+    debug("get driver")
			
 
				+    if len(list_hrefs)>0:
			
 
				+        loadsucess = hd.loadPage(browser, list_hrefs[-1],)
			
 
				+        log('logPage: ')
			
 
				+        if loadsucess==False:
			
 
				+            browser = hd.getdriver()
			
 
				+        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
			
 
				+                                                                                            {"type":"xpath","rule":dict_rule_detail["detail_date"]},
			
 
				+                                                                                            {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
			
 
				+    if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
			
 
				+        log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
			
 
				+        # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
			
 
				+        dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_content"])
			
 
				+        log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
			
 
				+    if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
			
 
				+        # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
			
 
				+        dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_date"])
			
 
				+    if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
			
 
				+        # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
			
 
				+        dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_title"])
			
 
				+    # finally:
			
 
				+    hd.adddriver(browser)
			
 
				+    debug("release driver")
			
 
				+
			
 
				     if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
			
 
				         dict_rule_detail["flag"] = True
			
 
				     else:
			
--- a/module/detail/title/featureEngine.py
+++ b/module/detail/title/featureEngine.py
@@ -48,12 +48,12 @@ function statistic(node,deepth){
 
				                 node.counts_communicateTags += 1;
			
 
				             }
			
 
				         }
			
 
				-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
			
 
				+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
			
 
				             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
			
 
				         }else{
			
 
				             node.counts_communicateTags += statistic(child,deepth+1);
			
 
				-        }
			
 
				-            
			
 
				+        }*/
			
 
				+        node.counts_communicateTags += statistic(child,deepth+1);                
			
 
				     }
			
 
				     var innertext = node.innerText;
			
 
				     if(innertext){
			
@@ -223,13 +223,14 @@ def dealWithScriptOut(data,sort_index=3):
 
				     else:
			
 
				         return None
			
 
				 
			
 
				-def getInput_byJS(url):
			
 
				+def getInput_byJS(browser,url):
			
 
				     try:
			
 
				-        browser = hd.getdriver()
			
 
				-        debug("get driver")
			
 
				-        hd.loadPage(browser, url)
			
 
				+        # browser = hd.getdriver()
			
 
				+        # debug("get driver")
			
 
				+        # hd.loadPage(browser, url)
			
 
				     
			
 
				-        data = browser.execute_script(scripts_common+scripts_title)
			
 
				+        # data = browser.execute_script(scripts_common+scripts_title)
			
 
				+        data = get_js_rs(browser, scripts_common+scripts_title)
			
 
				         deal_data = dealWithScriptOut(data)
			
 
				         if deal_data is None:
			
 
				             return False,""
			
@@ -242,9 +243,9 @@ def getInput_byJS(url):
 
				         if re.search("frame",str(e)) is not None:
			
 
				             err_msg = "#iframe#"
			
 
				         return None,err_msg
			
 
				-    finally:
			
 
				-        hd.adddriver(browser)
			
 
				-        debug("release driver")
			
 
				+    # finally:
			
 
				+        # hd.adddriver(browser)
			
 
				+        # debug("release driver")
			
 
				 
			
 
				 def encodeInput_byJS(url,targethtml):
			
 
				     def label(innerhtml,target_source):
			
@@ -267,7 +268,8 @@ def encodeInput_byJS(url,targethtml):
 
				         browser.maximize_window()
			
 
				         start = time.time()
			
 
				         
			
 
				-        data = browser.execute_script(scripts_common+scripts_title)
			
 
				+        # data = browser.execute_script(scripts_common+scripts_title)
			
 
				+        data = get_js_rs(browser, scripts_common+scripts_title)
			
 
				         input_x,list_inner,_,_ = dealWithScriptOut(data)
			
 
				         list_label = []
			
 
				         for item in list_inner:
			
--- a/module/extractFlow.py
+++ b/module/extractFlow.py
@@ -24,7 +24,10 @@ def ruleExtract(listpage_url):
 
				             result["status_code"] = "404"
			
 
				             add_err_msg(result, "#网页打不开#")
			
 
				             return result
			
 
				+        print('准备取列表页 ')
			
 
				         data_listpage = ext_listpage.getRule_listpage(listpage_url)
			
 
				+        print('完成列表页处理')
			
 
				+        # print('data_listpage:', data_listpage)
			
 
				         if data_listpage is None:
			
 
				             log("data_listpage is None")
			
 
				             rule_listpage = None
			
@@ -34,7 +37,9 @@ def ruleExtract(listpage_url):
 
				             result["status_code"] = "201"
			
 
				         else:
			
 
				             rule_listpage,list_hrefs = data_listpage
			
 
				+            print('准备处理详情页')
			
 
				             rule_detail = ext_detail.getRule_detail(list_hrefs)
			
 
				+            print('详情页处理完毕')
			
 
				             result = mergeDict([rule_listpage,rule_detail])
			
 
				             result["status_code"] = "201"
			
 
				     except Exception as e:
			
--- a/module/htmlDrawing.py
+++ b/module/htmlDrawing.py
@@ -14,25 +14,28 @@ import time
 
				 
			
 
				 header={
			
 
				     "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
			
 
				-    "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
			
 
				-                cn%2Fuser%2FsimpleSSOLogin",    
			
 
				+    # "Referer": "http://uia.hnist.cn/sso/login?service=http%3A%2F%2Fportal.hnist.\
			
 
				+    #             cn%2Fuser%2FsimpleSSOLogin",
			
 
				     "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
			
 
				     "Content-Type": "application/x-www-form-urlencoded",
			
 
				     "Connection": "Keep-Alive",
			
 
				     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
			
 
				      AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
			
 
				     #"Accept-Encoding": "gzip, deflate",
			
 
				-    "Origin": "http://uia.hnist.cn",
			
 
				+    # "Origin": "http://uia.hnist.cn",
			
 
				     "Upgrade-Insecure-Requests": "1",
			
 
				     }  
			
 
				 
			
 
				 TYPE = "phantomjs"
			
 
				+# TYPE = "chrome"
			
 
				 current_path = os.path.abspath("/".join(__file__.split("\\")[:-1]))
			
 
				 driver_paths = {"phantomjs_linux":current_path+"/../driver/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
			
 
				                 "phantomjs_window":current_path+"/../driver/phantomjs/phantomjs-2.1.1-windows/bin/phantomjs.exe",
			
 
				                 "chrome_linux":current_path+"/../driver/chromedriver/chromedriver_linux64/chromedriver",
			
 
				                 "chrome_window":current_path+"/../driver/chromedriver/chromedriver_win32/chromedriver.exe"}
			
 
				 
			
 
				+print(driver_paths)
			
 
				+
			
 
				 
			
 
				 def getBrowser_phantomJS(platform="linux",straight=False):
			
 
				     
			
@@ -41,12 +44,13 @@ def getBrowser_phantomJS(platform="linux",straight=False):
 
				     else:
			
 
				         executable_path = driver_paths["phantomjs_window"]
			
 
				     desired_capabilities= DesiredCapabilities.PHANTOMJS.copy()
			
 
				+    print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
			
 
				     for key, value in header.items():
			
 
				         desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
			
 
				-    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
			
 
				+    desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
			
 
				     desired_capabilities["phantomjs.page.settings.loadImages"] = False
			
 
				     desired_capabilities["phantomjs.page.settings.disk-cache"] = False
			
 
				-    browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])
			
 
				+    browser_phantomjs = webdriver.PhantomJS(executable_path=executable_path,desired_capabilities=desired_capabilities,service_args=['--ignore-ssl-errors=true','--ssl-protocol=TLSv1'])
			
 
				     browser_phantomjs.implicitly_wait(10)
			
 
				     browser_phantomjs.set_script_timeout(20)
			
 
				     browser_phantomjs.set_page_load_timeout(10)
			
@@ -61,14 +65,15 @@ def getBrowser_chrome(platform="linux",straight=False):
 
				     chrome_options = webdriver.ChromeOptions()
			
 
				     prefs = {"profile.managed_default_content_settings.images":2}
			
 
				     chrome_options.add_experimental_option("prefs",prefs)
			
 
				-    chrome_options.add_argument('--headless') 
			
 
				+    chrome_options.add_argument('--headless')
			
 
				     chrome_options.add_argument('--no-sandbox')
			
 
				-    chrome_options.add_argument('--user-agent=iphoneMozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
			
 
				+    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36')
			
 
				     desired_capabilities= DesiredCapabilities.CHROME.copy()
			
 
				     desired_capabilities['loggingPrefs'] = { 'performance':'ALL' }
			
 
				-    browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])
			
 
				-    browser_chrome.implicitly_wait(10)
			
 
				-    browser_chrome.set_page_load_timeout(10)
			
 
				+    browser_chrome = webdriver.Chrome(desired_capabilities=desired_capabilities,executable_path=executable_path, chrome_options=chrome_options,service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'])  # '--ssl-protocol=any'  TLSv1
			
 
				+    browser_chrome.implicitly_wait(15)
			
 
				+    browser_chrome.set_page_load_timeout(15)
			
 
				+    # browser_chrome = webdriver.Chrome(executable_path=executable_path)
			
 
				     
			
 
				     return browser_chrome
			
 
				 
			
@@ -88,8 +93,9 @@ def getBrowser(type=TYPE,straight=False):
 
				 
			
 
				 def getStatus(url):
			
 
				     try:
			
 
				-        r = requests.get(url, headers=header, allow_redirects = False,timeout=10)
			
 
				+        r = requests.get(url, headers=header, allow_redirects = False,timeout=15)
			
 
				     except Exception as e:
			
 
				+        log('requests.get error :%s'%e)
			
 
				         return 404    
			
 
				     return r.status_code
			
 
				 
			
@@ -106,9 +112,13 @@ def releaseAllDriver():
 
				             try:
			
 
				                 lock.acquire()
			
 
				                 wait_count = 0
			
 
				+                t0 = time.time()
			
 
				                 while(True):
			
 
				                     if _queue.full():
			
 
				                         break
			
 
				+                    elif time.time()-t0>60:
			
 
				+                        log('等待放回浏览器超时，强制释放所有driver')
			
 
				+                        break
			
 
				                     else:
			
 
				                         wait_count += 1
			
 
				                         log("waitting for drivers all back..."+str(wait_count)+"qsize:"+str(_queue.qsize()))
			
@@ -119,6 +129,7 @@ def releaseAllDriver():
 
				                 lock_kill.release()
			
 
				     t = Thread(target=_method)
			
 
				     t.start()
			
 
				+    t.join(100)
			
 
				             
			
 
				     
			
 
				 
			
@@ -155,7 +166,7 @@ def getdriver():
 
				     global _get_count
			
 
				     _get_count += 1
			
 
				     if _get_count>1000:
			
 
				-        log("get driver 达到调用次数，重新进行初始化")
			
 
				+        log("get_driver 达到调用次数，重新进行初始化")
			
 
				         releaseAllDriver()
			
 
				         _get_count = 0
			
 
				     lock.acquire()
			
@@ -198,23 +209,29 @@ def hasDrew(url,list_rule):
 
				     @summary: 根据规则判断是否渲染
			
 
				     @param: url:网页链接，list_rule: xpath规则数组 
			
 
				     '''
			
 
				-    try:
			
 
				-        r = requests.get(url, headers=header, allow_redirects = False)
			
 
				-        _encoding = r.encoding
			
 
				-        if _encoding is None:
			
 
				-            _encoding = "utf8"
			
 
				-        dom = html.fromstring(r.content.decode(_encoding))
			
 
				-        for item in list_rule:
			
 
				-            if item["type"]=="xpath":
			
 
				-                if item["rule"] is not None:
			
 
				-                    list_nodes = dom.xpath(item["rule"])
			
 
				-                    if len(list_nodes)==0:
			
 
				-                        return True
			
 
				-    except Exception as e:
			
 
				-        error(str(e))
			
 
				-    return False
			
 
				+    def hasdrew(url,list_rule):
			
 
				+        try:
			
 
				+            r = requests.get(url, headers=header, allow_redirects = False, timeout=10)
			
 
				+            _encoding = r.encoding
			
 
				+            if _encoding is None:
			
 
				+                _encoding = "utf8"
			
 
				+            dom = html.fromstring(r.content.decode(_encoding))
			
 
				+            for item in list_rule:
			
 
				+                if item["type"]=="xpath":
			
 
				+                    if item["rule"] is not None:
			
 
				+                        list_nodes = dom.xpath(item["rule"])
			
 
				+                        if len(list_nodes)==0:
			
 
				+                            return True
			
 
				+        except Exception as e:
			
 
				+            error(str(e))
			
 
				+        return False
			
 
				+    rs = thread_run(hasdrew, url,list_rule)
			
 
				+    if rs != None:
			
 
				+        return rs
			
 
				+    else:
			
 
				+        return False
			
 
				 
			
 
				-def loadPage(browser,url,timeout=20):
			
 
				+def loadPage(browser,url,timeout=30):
			
 
				     '''
			
 
				     @summary: 解决selenium加载网页不返回的问题，设置线程进行加载，对线程设置超时时间
			
 
				     '''
			
@@ -225,7 +242,9 @@ def loadPage(browser,url,timeout=20):
 
				             debug("load "+url+" done")
			
 
				         except Exception as e:
			
 
				             error(str(e))
			
 
				+            log('加载页面抛出异常：'+str(e))
			
 
				             if re.search("由于目标计算机积极拒绝",str(e)) is not None:
			
 
				+                log('log page exception')
			
 
				                 releaseAllDriver()
			
 
				         
			
 
				     t = Thread(target=_thread_load,args=(browser,url))
			
@@ -239,9 +258,14 @@ def loadPage(browser,url,timeout=20):
 
				         '''
			
 
				         #执行释放资源的线程
			
 
				         error("driver get方法卡住，强制释放所有资源")
			
 
				-        releaseAllDriver()
			
 
				         stop_thread(t)
			
 
				-        raise NameError("超时加载"+str(url))
			
 
				+        log('stop_loadpage thread return false')
			
 
				+        adddriver(browser)
			
 
				+        debug("release driver")
			
 
				+        releaseAllDriver()
			
 
				+        return False
			
 
				+        # raise NameError("超时加载"+str(url))
			
 
				+    return True
			
 
				     
			
 
				     
			
 
				 def getSource(url):
			
--- a/module/listpage/content/featureEngine.py
+++ b/module/listpage/content/featureEngine.py
@@ -86,13 +86,13 @@ function statistic(node,deepth){
 
				             if (child.tagName.toLowerCase() in {a:"",input:"",select:""} || child.onclick!=null){
			
 
				                 node.counts_communicateTags += 1;
			
 
				             }
			
 
				-        }
			
 
				-        if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
			
 
				+        }        
			
 
				+        /*if(child.tagName!=null && child.tagName.toLowerCase()=="iframe" && child.contentWindow.document!=null){
			
 
				             node.counts_communicateTags += statisticIframe(child.contentWindow.document.all);
			
 
				         }else{
			
 
				             node.counts_communicateTags += statistic(child,deepth+1);
			
 
				-        }
			
 
				-            
			
 
				+        }*/
			
 
				+        node.counts_communicateTags += statistic(child,deepth+1);    
			
 
				     }
			
 
				     node.counts_tagType = set_tag.size();
			
 
				     var sum_width = 0;
			
@@ -379,23 +379,29 @@ function clustering_xpath(array_xpath){
 
				 
			
 
				 
			
 
				 function search(content_xpath){
			
 
				-    content_node = getNode_listContent(content_xpath)
			
 
				-    if(content_node!=null){
			
 
				-        var array_a_href = statistic_A(content_node);
			
 
				-        var array_a = array_a_href[0];
			
 
				-        var array_href = new Array();
			
 
				-        var array_date = new Array();
			
 
				-        statistic_time(content_node,array_date);
			
 
				-        var _clustered_a = clustering_xpath(array_a);
			
 
				-        var _clustered_date = clustering_xpath(array_date);
			
 
				-        for(var i=0;i<array_a.length;i++){
			
 
				-            if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
			
 
				-                array_href.push(array_a_href[1][i]);
			
 
				+    try{
			
 
				+        content_node = getNode_listContent(content_xpath) //获取列表页标签节点
			
 
				+        if(content_node!=null){
			
 
				+            var array_a_href = statistic_A(content_node);
			
 
				+            var array_a = array_a_href[0];
			
 
				+            var array_href = new Array();
			
 
				+            var array_date = new Array();
			
 
				+            statistic_time(content_node,array_date);
			
 
				+            var _clustered_a = clustering_xpath(array_a);
			
 
				+            var _clustered_date = clustering_xpath(array_date);
			
 
				+            for(var i=0;i<array_a.length;i++){
			
 
				+                if(_clustered_a[1].indexOf(array_a_href[0][i])>=0){
			
 
				+                    array_href.push(array_a_href[1][i]);
			
 
				+                }
			
 
				             }
			
 
				+            return [_clustered_a,_clustered_date,array_href]
			
 
				         }
			
 
				-        return [_clustered_a,_clustered_date,array_href]
			
 
				+        return null;
			
 
				     }
			
 
				-    return null;
			
 
				+    catch(e){
			
 
				+        return null
			
 
				+    }
			
 
				+
			
 
				 }
			
 
				 return search(arguments[0]);
			
 
				 '''
			
@@ -433,7 +439,8 @@ def encodeInput_byJS(url,str_href):
 
				         browser = hd.getdriver()
			
 
				         debug("get driver")
			
 
				         hd.loadPage(browser, url)
			
 
				-        data = browser.execute_script(scripts_common+script_content,str_href)
			
 
				+        # data = browser.execute_script(scripts_common+script_content,str_href)
			
 
				+        data = get_js_rs(browser, scripts_common+script_content,str_href)
			
 
				         deal_data = dealWithScriptOut(data)
			
 
				         
			
 
				         if deal_data is None:
			
@@ -453,8 +460,10 @@ def encodeInput_byJS(url,str_href):
 
				 
			
 
				 def getInput_byJS(browser,url,str_href):
			
 
				     try:
			
 
				-        hd.loadPage(browser,url)
			
 
				-        data = browser.execute_script(scripts_common+script_content,str_href)
			
 
				+        # hd.loadPage(browser,url)
			
 
				+        # data = browser.execute_script(scripts_common+script_content,str_href)
			
 
				+        data = get_js_rs(browser, scripts_common+script_content,str_href)
			
 
				+
			
 
				         deal_data = dealWithScriptOut(data)
			
 
				         if deal_data is None:
			
 
				             return None
			
@@ -465,8 +474,7 @@ def getInput_byJS(browser,url,str_href):
 
				         error(str(e))
			
 
				     return None
			
 
				         
			
 
				-def getRule_A_Date(url,content_xpath):
			
 
				-    
			
 
				+def getRule_A_Date(browser, url,content_xpath):
			
 
				     def appendXpath(list_xpath,_xpath):
			
 
				         if len(list_xpath)==0:
			
 
				             list_xpath.append(_xpath)
			
@@ -477,119 +485,122 @@ def getRule_A_Date(url,content_xpath):
 
				                         "listpage_Date":None,
			
 
				                         "flag":True,
			
 
				                         "hasDrew":False}
			
 
				-    try:
			
 
				-        browser = hd.getdriver()
			
 
				-        debug("get driver")
			
 
				-        hd.loadPage(browser,url)
			
 
				-        
			
 
				-        list_a = None
			
 
				-        for _content_xpath in [content_xpath,"/html"]:
			
 
				+    # try:
			
 
				+        # browser = hd.getdriver()
			
 
				+        # debug("get driver")
			
 
				+        # hd.loadPage(browser,url)
			
 
				         
			
 
				-            
			
 
				-            data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
			
 
				-            if data is None:
			
 
				-                log("A_Date not found with xpath:"+_content_xpath)
			
 
				-                continue
			
 
				-            if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
			
 
				-                list_a = data[0]
			
 
				-                list_date = data[1]
			
 
				-                list_hrefs = data[2]
			
 
				-            if list_a is not None and len(list_a[1])==len(list_date[1]):
			
 
				-                break
			
 
				-            else:
			
 
				-                log("different length of A and Date:with xpath:"+_content_xpath)
			
 
				-            
			
 
				-        if list_a is None:
			
 
				-            log("A_Date not found with all xpath")
			
 
				-            return None;
			
 
				-        log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
			
 
				-        log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
			
 
				-
			
 
				-        log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
			
 
				-        if len(list_a[1])!=len(list_date[1]):
			
 
				-            dict_Rule_A_Date["flag"] = False
			
 
				-            add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
			
 
				-            return dict_Rule_A_Date,list_hrefs
			
 
				+    list_a = None
			
 
				+    for _content_xpath in [content_xpath,"/html"]:
			
 
				+        # data = browser.execute_script(scripts_common+script_get_A_Date,_content_xpath)
			
 
				+        data = get_js_rs(browser, scripts_common+script_get_A_Date,_content_xpath)
			
 
				+        if data is None:
			
 
				+            log("A_Date not found with xpath:"+_content_xpath)
			
 
				+            continue
			
 
				+        if _content_xpath==content_xpath or len(data[0][1])==len(data[1][1]):
			
 
				+            list_a = data[0]
			
 
				+            list_date = data[1]
			
 
				+            list_hrefs = data[2]
			
 
				+        if list_a is not None and len(list_a[1])==len(list_date[1]):
			
 
				+            log('list_a is not None and len(list_a[1])==len(list_date[1])')
			
 
				+            break
			
 
				         else:
			
 
				-            list_diffindex = list_a[0]
			
 
				-            _xpath = list_a[1][0]
			
 
				-            listpage_a = []
			
 
				-            begin = 0
			
 
				-            list_diffindex.sort(key=lambda x:x)
			
 
				-            _jump_flag = False
			
 
				-            
			
 
				-            dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
			
 
				-            _xpath_split = re.split("(\d+)",_xpath)
			
 
				-            for i in range(len(list_diffindex)):
			
 
				-                _index = list_diffindex[i]
			
 
				-                if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
			
 
				-                    add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
			
 
				-                    dict_Rule_A_Date["flag"] = False
			
 
				-                    return dict_Rule_A_Date,list_hrefs
			
 
				+            log("different length of A and Date:with xpath:"+_content_xpath)
			
 
				+
			
 
				+    if list_a is None:
			
 
				+        log("A_Date not found with all xpath")
			
 
				+        return None;
			
 
				+    log("xpath of a:\t"+str(list_a[1][0])+"-"+str(list_a[0]))
			
 
				+    log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
			
 
				+
			
 
				+    log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
			
 
				+    if len(list_a[1])!=len(list_date[1]):
			
 
				+        dict_Rule_A_Date["flag"] = False
			
 
				+        add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
			
 
				+        return dict_Rule_A_Date,list_hrefs
			
 
				+    else:
			
 
				+        list_diffindex = list_a[0]
			
 
				+        _xpath = list_a[1][0]
			
 
				+        listpage_a = []
			
 
				+        begin = 0
			
 
				+        list_diffindex.sort(key=lambda x:x)
			
 
				+        _jump_flag = False
			
 
				+
			
 
				+        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
			
 
				+        _xpath_split = re.split("(\d+)",_xpath)
			
 
				+        for i in range(len(list_diffindex)):
			
 
				+            _index = list_diffindex[i]
			
 
				+            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
			
 
				+                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
			
 
				+                dict_Rule_A_Date["flag"] = False
			
 
				+                return dict_Rule_A_Date,list_hrefs
			
 
				+            else:
			
 
				+                if i==0:
			
 
				+                    appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
			
 
				+                    begin = _index+1
			
 
				+                elif i<len(list_diffindex):
			
 
				+                    appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
			
 
				+                    begin = _index+1
			
 
				                 else:
			
 
				-                    if i==0:
			
 
				-                        appendXpath(listpage_a,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
			
 
				-                        begin = _index+1
			
 
				-                    elif i<len(list_diffindex):
			
 
				-                        appendXpath(listpage_a,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
			
 
				-                        begin = _index+1
			
 
				-                    else:
			
 
				-                        appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
			
 
				-                        
			
 
				-                    
			
 
				-                    if i==len(list_diffindex)-1:
			
 
				-                        _group = re.search("/(.*)","".join(_xpath_split[begin:]))
			
 
				-                        if _group is not None:
			
 
				-                            appendXpath(listpage_a,_group.group(1))
			
 
				-                
			
 
				-            for i in range(len(listpage_a)):
			
 
				-                if len(listpage_a[i].split("/"))>6:
			
 
				-                    listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
			
 
				-            dict_Rule_A_Date["listpage_A"] = listpage_a
			
 
				-            list_diffindex = list_date[0]
			
 
				-            _xpath = list_date[1][0]
			
 
				-            listpage_date = []
			
 
				-            begin = 0
			
 
				-            list_diffindex.sort(key=lambda x:x)
			
 
				-            _jump_flag = False
			
 
				-            
			
 
				-            dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
			
 
				-            _xpath_split = re.split("(\d+)",_xpath)
			
 
				-            for i in range(len(list_diffindex)):
			
 
				-                _index = list_diffindex[i]
			
 
				-                if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
			
 
				-                    add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
			
 
				-                    dict_Rule_A_Date["flag"] = False
			
 
				-                    return dict_Rule_A_Date,list_hrefs
			
 
				+                    appendXpath(listpage_a,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
			
 
				+
			
 
				+
			
 
				+                if i==len(list_diffindex)-1:
			
 
				+                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
			
 
				+                    if _group is not None:
			
 
				+                        appendXpath(listpage_a,_group.group(1))
			
 
				+
			
 
				+        for i in range(len(listpage_a)):
			
 
				+            if len(listpage_a[i].split("/"))>6:
			
 
				+                # listpage_a[i] = browser.execute_script(scripts_replaceXpath,listpage_a[i])
			
 
				+                listpage_a[i] = get_js_rs(browser, scripts_replaceXpath,listpage_a[i])
			
 
				+        dict_Rule_A_Date["listpage_A"] = listpage_a
			
 
				+        list_diffindex = list_date[0]
			
 
				+        _xpath = list_date[1][0]
			
 
				+        listpage_date = []
			
 
				+        begin = 0
			
 
				+        list_diffindex.sort(key=lambda x:x)
			
 
				+        _jump_flag = False
			
 
				+
			
 
				+        dict_Rule_A_Date["hasDrew"] = dict_Rule_A_Date["hasDrew"] or hd.hasDrew(url, [{"rule":_xpath,"type":"xpath"}])
			
 
				+        _xpath_split = re.split("(\d+)",_xpath)
			
 
				+        for i in range(len(list_diffindex)):
			
 
				+            _index = list_diffindex[i]
			
 
				+            if not (_xpath_split[_index-1][-1]=="[" and _xpath_split[_index+1][0]=="]"):
			
 
				+                add_err_msg(dict_Rule_A_Date, "#列表页链接xpath无法分割#")
			
 
				+                dict_Rule_A_Date["flag"] = False
			
 
				+                return dict_Rule_A_Date,list_hrefs
			
 
				+            else:
			
 
				+                if i==0:
			
 
				+                    appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
			
 
				+                    begin = _index+1
			
 
				+                elif i<len(list_diffindex):
			
 
				+                    appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
			
 
				+                    begin = _index+1
			
 
				                 else:
			
 
				-                    if i==0:
			
 
				-                        appendXpath(listpage_date,re.search("(.*)\[","".join(_xpath_split[:_index])).group(1))
			
 
				-                        begin = _index+1
			
 
				-                    elif i<len(list_diffindex):
			
 
				-                        appendXpath(listpage_date,re.search("/(.*)\[","".join(_xpath_split[begin:_index])).group(1))
			
 
				-                        begin = _index+1
			
 
				-                    else:
			
 
				-                        appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
			
 
				-                    
			
 
				-                    if i==len(list_diffindex)-1:
			
 
				-                        _group = re.search("/(.*)","".join(_xpath_split[begin:]))
			
 
				-                        if _group is not None:
			
 
				-                            appendXpath(listpage_date,_group.group(1))
			
 
				-            
			
 
				-            
			
 
				-            for i in range(len(listpage_date)):
			
 
				-                if len(listpage_date[i].split("/"))>6:
			
 
				-                    listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])        
			
 
				-            dict_Rule_A_Date["listpage_Date"] = listpage_date
			
 
				-            
			
 
				-        return dict_Rule_A_Date,list_hrefs
			
 
				+                    appendXpath(listpage_date,re.search("/(.*)","".join(_xpath_split[begin:])).group(1))
			
 
				+
			
 
				+                if i==len(list_diffindex)-1:
			
 
				+                    _group = re.search("/(.*)","".join(_xpath_split[begin:]))
			
 
				+                    if _group is not None:
			
 
				+                        appendXpath(listpage_date,_group.group(1))
			
 
				+
			
 
				+        for i in range(len(listpage_date)):
			
 
				+            if len(listpage_date[i].split("/"))>6:
			
 
				+                # listpage_date[i] = browser.execute_script(scripts_replaceXpath,listpage_date[i])
			
 
				+                listpage_date[i] = get_js_rs(browser, scripts_replaceXpath,listpage_date[i])
			
 
				+
			
 
				+        dict_Rule_A_Date["listpage_Date"] = listpage_date
			
 
				+
			
 
				+    return dict_Rule_A_Date,list_hrefs
			
 
				                 
			
 
				                 
			
 
				-    except Exception as e:
			
 
				-        error(str(e))
			
 
				-    finally:
			
 
				-        hd.adddriver(browser)
			
 
				-        debug("release driver")
			
 
				+    # except Exception as e:
			
 
				+    #     error(str(e))
			
 
				+    # finally:
			
 
				+    #     # hd.adddriver(browser)
			
 
				+    #     # debug("release driver")
			
 
				+    #     log('getRule_A_Date done')
			
 
				     return None
			
 
				         
			
 
				 def dumpLinkContent():
			
--- a/module/listpage/extractor.py
+++ b/module/listpage/extractor.py
@@ -83,28 +83,47 @@ def getRule_listpage(listpage_url,try_times=3):
 
				     for i in range(try_times):
			
 
				         browser = hd.getdriver()
			
 
				         debug("get driver")
			
 
				+        loadsuccess = hd.loadPage(browser, listpage_url)
			
 
				+        if not loadsuccess:
			
 
				+            log('加载列表主页失败， 重新请求网页。')
			
 
				+            continue
			
 
				+        log('准备执行获取列表页内容标签脚本')
			
 
				+        # with open('d:/html/home_page.html', 'w', encoding='utf-8') as f:
			
 
				+        #     f.write(browser.page_source)
			
 
				         data_listpage = featureEngine.getInput_byJS(browser,listpage_url,"")
			
 
				+        log('获取列表页内容标签成功')
			
 
				         #print(browser.page_source)
			
 
				-        hd.adddriver(browser)
			
 
				-        debug("release driver")
			
 
				+        # hd.adddriver(browser)
			
 
				+        # debug("release driver")
			
 
				         if data_listpage is not None:
			
 
				             x,_,list_xpath = data_listpage
			
 
				             _index = listpageContentPredictor.predict(x)
			
 
				+            log('模型预测列表页标签完毕')
			
 
				             if len(list_xpath[_index])>0:
			
 
				                 content_xpath = list_xpath[_index][0]
			
 
				                 #content_xpath = "/html"
			
 
				                 log("the content_xpath of listpage is "+str(content_xpath))
			
 
				-                data_rule = featureEngine.getRule_A_Date(listpage_url,content_xpath)
			
 
				+                data_rule = featureEngine.getRule_A_Date(browser,listpage_url,content_xpath)
			
 
				+                log('执行脚本获取列表页链接及日期完毕')
			
 
				                 if data_rule is not None:
			
 
				                     dict_rule_A_Date,list_hrefs = data_rule
			
 
				-                    browser = hd.getdriver()
			
 
				-                    debug("get driver")
			
 
				+                    # if dict_rule_A_Date.get('flag', '') == False:
			
 
				+                    #     return None
			
 
				+                    # browser = hd.getdriver()
			
 
				+                    # debug("get driver")
			
 
				+                    log('begin getTurnRule')
			
 
				                     turn_data = engine.getTurnRule(browser,listpage_url)
			
 
				-                    hd.adddriver(browser)
			
 
				-                    debug("release driver")
			
 
				+                    log('获取翻页内容完毕')
			
 
				+                    # hd.adddriver(browser)
			
 
				+                    # debug("release driver")
			
 
				                     dict_rule_pageTurn,list_listpage_url = turn_data
			
 
				                     dict_rule_recog = getRecognize_detail_listpage(list_listpage_url, list_hrefs)
			
 
				+                    log('解析列表页规则完毕')
			
 
				+                    hd.adddriver(browser)
			
 
				+                    debug("release driver")
			
 
				                     return mergeDict([dict_rule_A_Date,dict_rule_pageTurn,dict_rule_recog]),list_hrefs
			
 
				+        hd.adddriver(browser)
			
 
				+        debug("release driver")
			
 
				     return None
			
 
				     
			
 
				     
			
--- a/module/listpage/pageTurn/engine.py
+++ b/module/listpage/pageTurn/engine.py
@@ -14,15 +14,16 @@ script = '''
 
				 
			
 
				 function click_bt(type_click){
			
 
				     var pattern_pageNum = /[共\/]\s*(\d+)\s*页|\d+\s*\/\s*(\d+)|\.{2}\s*(\d+)/
			
 
				-    var pattern_nextPage = /^\s*.?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
			
 
				+    var pattern_nextPage = /^\s*[^最]?([下后]一?页|[下后]一?页\s*»|»|>|[Nn]ext).?\s*$/
			
 
				     
			
 
				-    var pattern_tailPage = /^\s*.?(最?[尾末]一?页|tail|>\|).?s\s*$/
			
 
				+    var pattern_tailPage = /^\s*(最[尾末后]一?页|tail|>\|).?s\s*$/
			
 
				     list_cluster = clustering_turnPage();
			
 
				     var pageNum = null;
			
 
				     var pageNum_jsoup = null;
			
 
				     var _node_xpath = null;
			
 
				     var _node_jsoup = null;
			
 
				     var _node_click = null;
			
 
				+    var click_message = '';
			
 
				     for(var i=0;i<list_cluster.length;i++){
			
 
				         _node = list_cluster[i][0]
			
 
				         _type = list_cluster[i][1]
			
@@ -60,17 +61,42 @@ function click_bt(type_click){
 
				                     }
			
 
				                     
			
 
				                 }
			
 
				+                if(_href==null || _href=="" || _href=="#"){
			
 
				+                    click_message = '翻页链接为空或#异常';
			
 
				+                }
			
 
				+                if(_href!=null && _href.indexOf('javascript')>=0){
			
 
				+                    click_message = '翻页链接为javascript';
			
 
				+                }
			
 
				                 if(_node_click==null){
			
 
				                     _node_click = _node;
			
 
				+                }               
			
 
				+               
			
 
				+            }
			
 
				+            else if(_node.getAttribute("type")=='button'){
			
 
				+                _node_click = _node;
			
 
				+                click_message = '标签属性type为button的翻页';
			
 
				+            }            
			
 
				+            else if(_node.parentNode.tagName.toLowerCase() in {a:"",button:""} || _node.parentNode.onclick!=null){
			
 
				+                _href = _node.parentNode.getAttribute("href")
			
 
				+                if(_href!=null && _href!="" && _href!="#" && _href.indexOf('javascript')<0){
			
 
				+                    if(_node_xpath==null){
			
 
				+                        _node_xpath = getXpath(_node.parentNode);
			
 
				+                    }
			
 
				+                    if(_node_jsoup==null){
			
 
				+                        _node_jsoup = getJsoup(_node.parentNode);
			
 
				+                    }
			
 
				+                    
			
 
				+                }
			
 
				+                if(_node_click==null){
			
 
				+                    _node_click = _node.parentNode;
			
 
				                 }
			
 
				-                
			
 
				-                
			
 
				+                click_message = '父节点为翻页链接';				
			
 
				             }
			
 
				         }
			
 
				     }
			
 
				     if(_node_click!=null){
			
 
				         _node_click.click();
			
 
				-        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
			
 
				+        return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
			
 
				     }else{
			
 
				         var _pattern = null;
			
 
				         if(type_click=="nextPage"){
			
@@ -88,11 +114,13 @@ function click_bt(type_click){
 
				                     _node_jsoup = getJsoup(_node);
			
 
				                 }
			
 
				                 _node.click();
			
 
				-                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
			
 
				+                click_message = '找不到翻页按钮，a标签为翻页链接';
			
 
				+                return [true,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
			
 
				             }
			
 
				         }
			
 
				     }
			
 
				-    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup]];
			
 
				+    if(click_message==''){click_message = '最终没找到翻页按钮';}
			
 
				+    return [false,[pattern_pageNum.toString(),pageNum,pageNum_jsoup],[_node_xpath,_node_jsoup], click_message];
			
 
				 }
			
 
				 return click_bt(arguments[0]);
			
 
				 '''
			
@@ -119,7 +147,8 @@ return turnpage_by_pattern(arguments[0]);
 
				 def click_bt_lastPage(browser):
			
 
				     _url = browser.current_url
			
 
				     _window_handles = len(browser.window_handles)
			
 
				-    _result = browser.execute_script(scripts_common+script,"lastPage")
			
 
				+    # _result = browser.execute_script(scripts_common+script,"lastPage")
			
 
				+    _result = get_js_rs(browser, scripts_common+script,"lastPage")
			
 
				     if _result[0]:
			
 
				         if len(browser.window_handles)>_window_handles:
			
 
				             switch_window(browser)
			
@@ -133,8 +162,9 @@ def click_bt_lastPage(browser):
 
				 def click_bt_nextPage(browser):
			
 
				     _url = browser.current_url
			
 
				     _window_handles = len(browser.window_handles)
			
 
				-    _result = browser.execute_script(scripts_common+script,"nextPage")
			
 
				-    if _result[0]:
			
 
				+    # _result = browser.execute_script(scripts_common+script,"nextPage")
			
 
				+    _result = get_js_rs(browser, scripts_common+script,"nextPage", timeout=30)
			
 
				+    if _result!=None and _result[0]:
			
 
				         if len(browser.window_handles)>_window_handles:
			
 
				             switch_window(browser)
			
 
				         for i in range(4):
			
@@ -147,8 +177,9 @@ def click_bt_nextPage(browser):
 
				 def click_bt_tailPage(browser):
			
 
				     _url = browser.current_url
			
 
				     _window_handles = len(browser.window_handles)
			
 
				-    _result = browser.execute_script(scripts_common+script,"tailPage")
			
 
				-    if _result[0]:
			
 
				+    # _result = browser.execute_script(scripts_common+script,"tailPage")
			
 
				+    _result = get_js_rs(browser, scripts_common+script,"tailPage")
			
 
				+    if _result!=None and  _result[0]:
			
 
				         if len(browser.window_handles)>_window_handles:
			
 
				             switch_window(browser)
			
 
				         for i in range(4):
			
@@ -161,7 +192,8 @@ def click_bt_tailPage(browser):
 
				 def click_bt_pattern(browser,pattern):
			
 
				     _url = browser.current_url
			
 
				     _window_handles = len(browser.window_handles)
			
 
				-    _result = browser.execute_script(scripts_common+script_pattern,pattern)
			
 
				+    # _result = browser.execute_script(scripts_common+script_pattern,pattern)
			
 
				+    _result = get_js_rs(browser, scripts_common+script_pattern,pattern)
			
 
				     if _result:
			
 
				         if len(browser.window_handles)>_window_handles:
			
 
				             switch_window(browser)
			
@@ -191,6 +223,13 @@ def getRuleOfUrl(first_url,second_url):
 
				     log("pageTurn first_url:\t"+first_url)
			
 
				     log("pageTurn second_url:\t"+second_url)
			
 
				     if len(split_all_first)!=len(split_all_second):
			
 
				+        split_url = second_url.split('/')
			
 
				+        if split_url[-1]== 'index_2.html':
			
 
				+            dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
			
 
				+            dict_rule["listpage_turn_after"] = '.html'
			
 
				+            dict_rule["listpage_pageBegin"] = 2
			
 
				+            dict_rule["listpage_pageStep"] = 1
			
 
				+            return dict_rule
			
 
				         add_err_msg(dict_rule, "#翻页链接不匹配#")
			
 
				         dict_rule["flag"] = False
			
 
				         return dict_rule
			
@@ -226,86 +265,119 @@ def getRuleOfUrl(first_url,second_url):
 
				     return dict_rule
			
 
				 
			
 
				 def getTurnRule(browser,listpage_url):
			
 
				-    try:
			
 
				-        hd.loadPage(browser,listpage_url)
			
 
				-        first_url = browser.current_url
			
 
				-        list_listpage_url = []
			
 
				-        click_flag = True
			
 
				-        #点击下一页
			
 
				-        click_next_1 = click_bt_nextPage(browser)
			
 
				-        
			
 
				-        url1 = browser.current_url
			
 
				-        log("click next bt:"+str(click_next_1))
			
 
				-        #点击下一页
			
 
				-        click_next_2 = click_bt_nextPage(browser)
			
 
				-        log("click next bt:"+str(click_next_2))
			
 
				-        list_pageNum1 = click_next_1[1]
			
 
				-        list_node1 = click_next_1[2]
			
 
				-        list_pageNum2 = click_next_2[1]
			
 
				-        list_node2 = click_next_2[2]
			
 
				-        dict_rule = None
			
 
				-        url2 = browser.current_url
			
 
				-        
			
 
				-        #是否有点击到下一页
			
 
				-        #click_flag = click_next_1[0] or click_next_2[0]
			
 
				-        click_flag = click_next_2[0]
			
 
				-        
			
 
				-        
			
 
				-        
			
 
				-        #点击数字翻页
			
 
				-        if not click_flag:
			
 
				-            #第一个下一页点击到而第二个未点击到
			
 
				-            if click_next_1[0]:
			
 
				-                click_last_1 = click_bt_lastPage(browser)
			
 
				-                url2 = browser.current_url
			
 
				-            if not click_next_1[0] or not click_last_1[0]:
			
 
				-                click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
			
 
				-                if click_pattern_2:
			
 
				-                    url2 = browser.current_url
			
 
				-                click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
			
 
				-                if click_pattern_1:
			
 
				-                    url1 = browser.current_url
			
 
				-                    if url1==first_url:
			
 
				-                        click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
			
 
				-                        if click_pattern_3:
			
 
				-                            url1 = url2
			
 
				-                            url2 = browser.current_url
			
 
				-        
			
 
				-        dict_rule = getRuleOfUrl(url1, url2)
			
 
				-        list_listpage_url.append(url1)
			
 
				-        list_listpage_url.append(url2)
			
 
				-    
			
 
				-        if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
			
 
				-            dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
			
 
				-        elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
			
 
				-            dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
			
 
				-        else:
			
 
				-            dict_rule["listpage_pageNum"] = None
			
 
				-        dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
			
 
				-        '''
			
 
				-        #若是未识别到pageNum则flag为False
			
 
				-        if dict_rule["listpage_pageNum"] is None:
			
 
				-            dict_rule["flag"] = False
			
 
				-        '''
			
 
				-        #优先jsoup，后xpath
			
 
				-        if list_node1[1]==list_node2[1] and list_node1[1] is not None:
			
 
				-            dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
			
 
				-        #只有2页的适配
			
 
				-        elif list_node1[1] is not None and list_node2[1] is None:
			
 
				-            dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
			
 
				-        elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
			
 
				-            dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
			
 
				-        else:
			
 
				-            dict_rule["listpage_nextPage"] = None
			
 
				-        
			
 
				-        #翻页按钮或者是拼接规则有一个即可
			
 
				-        if dict_rule["listpage_nextPage"] is not None:
			
 
				-            dict_rule["flag"] = True
			
 
				+    '''
			
 
				+    通过点击下一页或数字翻页得到下一页规则（页数，下一页路径等），list_listpage_url(前后列表页url)
			
 
				+    :param browser: 浏览器对象
			
 
				+    :param listpage_url: 列表页url
			
 
				+    :return:
			
 
				+    '''
			
 
				+    # try:
			
 
				+    # hd.loadPage(browser,listpage_url)
			
 
				+    first_url = browser.current_url
			
 
				+    list_listpage_url = []
			
 
				+    click_flag = True
			
 
				+    #点击下一页
			
 
				+    # click_next_1 = click_bt_nextPage(browser)
			
 
				+    click_next_1 = thread_run(click_bt_nextPage, browser)
			
 
				+    url1 = ''
			
 
				+    url2 = browser.current_url
			
 
				+    log("click next bt:"+str(click_next_1))
			
 
				+    #点击下一页
			
 
				+    # click_next_2 = click_bt_nextPage(browser)
			
 
				+    click_next_2 = thread_run(click_bt_nextPage, browser)
			
 
				+    if click_next_1==None:
			
 
				+        click_next_1 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
			
 
				+                        [None, None]]
			
 
				+    if click_next_2==None:
			
 
				+        click_next_2 = [False, ['/[共\\/]\\s*(\\d+)\\s*页|\\d+\\s*\\/\\s*(\\d+)|\\.{2}\\s*(\\d+)/', None, None],
			
 
				+                        [None, None]]
			
 
				+    log("click next bt:"+str(click_next_2))
			
 
				+    list_pageNum1 = click_next_1[1]
			
 
				+    list_node1 = click_next_1[2]
			
 
				+    list_pageNum2 = click_next_2[1]
			
 
				+    list_node2 = click_next_2[2]
			
 
				+    dict_rule = None
			
 
				+    url3 = browser.current_url
			
 
				+
			
 
				+    #是否有点击到下一页
			
 
				+    #click_flag = click_next_1[0] or click_next_2[0]
			
 
				+    click_flag = click_next_2[0]
			
 
				+
			
 
				+
			
 
				+
			
 
				+    #点击数字翻页
			
 
				+    # if not click_flag:
			
 
				+    #     #第一个下一页点击到而第二个未点击到
			
 
				+    #     log('开始数字翻页')
			
 
				+        # if click_next_1[0]:
			
 
				+        #     click_last_1 = click_bt_lastPage(browser)
			
 
				+        #     url2 = browser.current_url
			
 
				+        #     log('第一次翻页成功，最后一页作为第二页')
			
 
				+    if not click_next_1[0]: # or not click_last_1[0]
			
 
				+        log('开始数字翻页')
			
 
				+        # click_pattern_2 = click_bt_pattern(browser, "^\\s*2\\s*$")
			
 
				+        click_pattern_2 = thread_run(click_bt_pattern, browser, "^\\s*2\\s*$")
			
 
				+        if click_pattern_2:
			
 
				+            url2 = browser.current_url
			
 
				+            log('数字翻页第二页%s'%url2)
			
 
				+        # click_pattern_3 = click_bt_pattern(browser, "^\\s*3\\s*$")
			
 
				+        click_pattern_3 = thread_run(click_bt_pattern , browser, "^\\s*3\\s*$")
			
 
				+        if click_pattern_3:
			
 
				+            url3 = browser.current_url
			
 
				+            log('数字翻页第三页%s'%url3)
			
 
				         else:
			
 
				-            add_err_msg(dict_rule, "#下一页规则未获取#")
			
 
				-        return dict_rule,list_listpage_url
			
 
				-    except Exception as e:
			
 
				-        error(str(e))
			
 
				+            # click_pattern_1 = click_bt_pattern(browser, "^\\s*1\\s*$")
			
 
				+            click_pattern_1 = thread_run(click_bt_pattern, browser, "^\\s*1\\s*$")
			
 
				+            if click_pattern_1:
			
 
				+                url1 = browser.current_url
			
 
				+                log('数字翻页第一页%s'%url1)
			
 
				+    if url2 != url3:
			
 
				+        dict_rule = getRuleOfUrl(url2, url3)
			
 
				+    elif url1!='' and url2 != url1:
			
 
				+        dict_rule = getRuleOfUrl(url1, url2)
			
 
				+    else:
			
 
				+        dict_rule = getRuleOfUrl(first_url, url2)
			
 
				+    if click_next_1 != None and len(click_next_1)==4:
			
 
				+        click_message = click_next_1[3]
			
 
				+        if click_message!="":
			
 
				+            add_err_msg(dict_rule, '#%s#'%click_message)
			
 
				+    if not click_flag:
			
 
				+        add_err_msg(dict_rule, "#进行数字翻页#")
			
 
				+    list_listpage_url.append(url1)
			
 
				+    list_listpage_url.append(url2)
			
 
				+
			
 
				+    if list_pageNum1[2]==list_pageNum2[2] and list_pageNum1[2] is not None:
			
 
				+        dict_rule["listpage_pageNum"] = [list_pageNum1[2],"jsoup"]
			
 
				+    elif list_pageNum1[1]==list_pageNum2[1] and list_pageNum1[1] is not None:
			
 
				+        dict_rule["listpage_pageNum"] = [list_pageNum1[1],"xpath"]
			
 
				+    else:
			
 
				+        dict_rule["listpage_pageNum"] = None
			
 
				+    dict_rule["listpage_pageNum_pattern"] = list_pageNum1[0]
			
 
				+    '''
			
 
				+    #若是未识别到pageNum则flag为False
			
 
				+    if dict_rule["listpage_pageNum"] is None:
			
 
				+        dict_rule["flag"] = False
			
 
				+    '''
			
 
				+    #优先jsoup，后xpath
			
 
				+    if list_node1[1]==list_node2[1] and list_node1[1] is not None:
			
 
				+        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
			
 
				+    #只有2页的适配
			
 
				+    elif list_node1[1] is not None and list_node2[1] is None:
			
 
				+        log('只有两页更新适配 ')
			
 
				+        dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
			
 
				+    elif list_node1[0]==list_node2[0] and list_node1[0] is not None:
			
 
				+        dict_rule["listpage_nextPage"] = [list_node1[0],"xpath"]
			
 
				+    else:
			
 
				+        dict_rule["listpage_nextPage"] = None
			
 
				+
			
 
				+    #翻页按钮或者是拼接规则有一个即可
			
 
				+    if dict_rule["listpage_nextPage"] is not None:
			
 
				+        dict_rule["flag"] = True
			
 
				+    else:
			
 
				+        add_err_msg(dict_rule, "#下一页规则未获取#")
			
 
				+    return dict_rule,list_listpage_url
			
 
				+    # except Exception as e:
			
 
				+    #     error(str(e))
			
 
				 
			
 
				 if __name__=="__main__":
			
 
				     browser = hd.getBrowser()
			
@@ -323,7 +395,8 @@ if __name__=="__main__":
 
				     return _array
			
 
				     '''
			
 
				     
			
 
				-    data = browser.execute_script(scripts_common+script1)
			
 
				+    # data = browser.execute_script(scripts_common+script1)
			
 
				+    data = get_js_rs(browser, scripts_common+script1)
			
 
				     #browser.maximize_window()
			
 
				     browser.save_screenshot("112.png")
			
 
				     for item in data:
			
--- a/module/run_single_server.py
+++ b/module/run_single_server.py
@@ -1,21 +1,25 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+import sys
			
 
				+import json
			
 
				+import re
			
 
				+import os
			
 
				+sys.path.append(os.path.abspath("../.."))
			
 
				 
			
 
				-from module.Utils import log# -*- coding: utf-8 -*-
			
 
				+os.environ['KERAS_BACKEND']='tensorflow'
			
 
				+from module.Utils import log
			
 
				 """
			
 
				 Created on Fri Jun  1 18:03:03 2018
			
 
				 
			
 
				 @author: DONG
			
 
				 """
			
 
				-import sys
			
 
				-import os
			
 
				-import json
			
 
				-import re
			
 
				-sys.path.append(os.path.abspath("../.."))
			
 
				+
			
 
				 from module import extractFlow
			
 
				 from flask import Flask, jsonify
			
 
				 from flask import abort
			
 
				 from flask import request
			
 
				 import time
			
 
				 import uuid
			
 
				+from module.Utils import xpath2css
			
 
				 
			
 
				 app = Flask(__name__)
			
 
				 app.config['JSON_AS_ASCII'] = False
			
@@ -30,9 +34,12 @@ def transformInterface(_dict):
 
				     if listpage_a  and listpage_date:
			
 
				         if listpage_a[0]==listpage_date[0]:
			
 
				             ruleValue = listpage_a[0]
			
 
				-            trans_dict["listPageNode"] = {"ruleType":"xpath",
			
 
				-                                          "ruleValue":ruleValue,
			
 
				-                                          "ruleKey":""}
			
 
				+            # trans_dict["listPageNode"] = {"ruleType":"xpath",
			
 
				+            #                               "ruleValue":ruleValue,
			
 
				+            #                               "ruleKey":""}
			
 
				+            trans_dict["listPageNode"] = {"ruleType": "css",
			
 
				+                                          "ruleValue": xpath2css(ruleValue),
			
 
				+                                          "ruleKey": ""}
			
 
				         else:
			
 
				             flag = False
			
 
				     else:
			
@@ -67,8 +74,11 @@ def transformInterface(_dict):
 
				     detail_date = _dict.get("detail_date")
			
 
				     trans_dict["needDetailTime"] = False
			
 
				     if detail_date:
			
 
				-        trans_dict["detailDateNode"] = {"ruleType": "xpath",
			
 
				-                                        "ruleValue": detail_date
			
 
				+        # trans_dict["detailDateNode"] = {"ruleType": "xpath",
			
 
				+        #                                 "ruleValue": detail_date
			
 
				+        #                                 }
			
 
				+        trans_dict["detailDateNode"] = {"ruleType": "css",
			
 
				+                                        "ruleValue": xpath2css(detail_date)
			
 
				                                         }
			
 
				         trans_dict["needDetailTime"] = True
			
 
				     else:
			
@@ -76,16 +86,22 @@ def transformInterface(_dict):
 
				     detail_title = _dict.get("detail_title")
			
 
				     trans_dict["needDetailTitle"] = False
			
 
				     if detail_title:
			
 
				-        trans_dict["detailTitleNode"] = {"ruleType": "xpath",
			
 
				-                                         "ruleValue": detail_title
			
 
				+        # trans_dict["detailTitleNode"] = {"ruleType": "xpath",
			
 
				+        #                                  "ruleValue": detail_title
			
 
				+        #                                  }
			
 
				+        trans_dict["detailTitleNode"] = {"ruleType": "css",
			
 
				+                                         "ruleValue": xpath2css(detail_title)
			
 
				                                          }
			
 
				         trans_dict["needDetailTitle"] = True
			
 
				     else:
			
 
				         flag = False
			
 
				     detail_content = _dict.get("detail_content")
			
 
				     if detail_content:
			
 
				-        trans_dict["detailContentNode"] = {"ruleType": "xpath",
			
 
				-                                           "ruleValue": detail_content
			
 
				+        # trans_dict["detailContentNode"] = {"ruleType": "xpath",
			
 
				+        #                                    "ruleValue": detail_content
			
 
				+        #                                    }
			
 
				+        trans_dict["detailContentNode"] = {"ruleType": "css",
			
 
				+                                           "ruleValue": xpath2css(detail_content)
			
 
				                                            }
			
 
				     else:
			
 
				         flag = False
			
@@ -117,7 +133,7 @@ def text_predict():
 
				                 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",listpage_url) is None:
			
 
				                     data["status_code"] = 400
			
 
				                     abort(400)
			
 
				-                else:   
			
 
				+                else:
			
 
				                     data = extractFlow.ruleExtract(listpage_url)
			
 
				                 log("done for setting result of listpage:"+str(listpage_url))
			
 
				                 data["listpage_url"] = listpage_url
			
@@ -130,7 +146,7 @@ def text_predict():
 
				         log(" time from receive to send: "+str(time.time()-start_time))
			
 
				 
			
 
				         data = transformInterface(data)
			
 
				-        log(str(data))
			
 
				+        # log(str(data))
			
 
				 
			
 
				         _resp = jsonify(data)
			
 
				         #log(str(data["flag"])+str(data))
			
@@ -138,5 +154,5 @@ def text_predict():
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    app.run(host='0.0.0.0', port=15015, threaded=True, debug=False)
			
 
				+    app.run(host='192.168.2.65', port=15015, threaded=True, debug=False) #15015  2.65
			
 
				     log("ContentExtractor running")
			
--- a/module/testInterface.py
+++ b/module/testInterface.py
@@ -345,23 +345,161 @@ list_url = ["http://www.csssyxx.com/xwgk/tzgg",
 
				 _sum = 0
			
 
				 _count = 0
			
 
				 ''' '''
			
 
				-with codecs.open("errorLink.txt","r",encoding="utf8") as f:
			
 
				-    while(True):
			
 
				-        line = f.readline().strip()
			
 
				-        if not line:
			
 
				-            break
			
 
				-       
			
 
				-        a = time.time()
			
 
				-        # user = {"listpage_url":list_url[0]}
			
 
				-        user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
			
 
				-        #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
			
 
				-        _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
			
 
				-        resp_json = _resp.content.decode("utf-8")
			
 
				-        _resp = json.loads(resp_json)
			
 
				-        print(resp_json)
			
 
				-        _sum += 1
			
 
				-        if "flag" in _resp and _resp["flag"]:
			
 
				-            _count += 1
			
 
				-            print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
			
 
				-        print(_count,_sum)
			
 
				+# with codecs.open("errorLink.txt","r",encoding="utf8") as f:
			
 
				+#     while(True):
			
 
				+#         line = f.readline().strip()
			
 
				+#         if not line:
			
 
				+#             break
			
 
				+#
			
 
				+#         a = time.time()
			
 
				+#         # user = {"listpage_url":list_url[0]}
			
 
				+#         user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
			
 
				+#         #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
			
 
				+#         _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
			
 
				+#         resp_json = _resp.content.decode("utf-8")
			
 
				+#         _resp = json.loads(resp_json)
			
 
				+#         print(resp_json)
			
 
				+#         _sum += 1
			
 
				+#         if "flag" in _resp and _resp["flag"]:
			
 
				+#             _count += 1
			
 
				+#             print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
			
 
				+#         print(_count,_sum)
			
 
				 
			
 
				+
			
 
				+def get_rs(url):
			
 
				+    user = {"listpage_url": url}
			
 
				+    _resp = requests.post("http://192.168.2.177:15015/content_extract", json=user, verify=True) #127.0.0.1  177
			
 
				+    resp_json = _resp.content.decode("utf-8")
			
 
				+    return resp_json
			
 
				+
			
 
				+    # _resp = json.loads(resp_json)
			
 
				+    # print(resp_json)
			
 
				+    # print(_resp)
			
 
				+
			
 
				+# url = 'http://www.clrmyy.com/Newslist/NewsList.aspx?code=ZPXX'
			
 
				+# url = 'http://ec.chongchi.com.cn:8080/Ec468Web/ysxjcggg.jsp' # 列表页太长 js 溢出  #已设置超时
			
 
				+# url = 'https://tyj.huangshan.gov.cn/content/column/6794951?pageIndex=1'
			
 
				+# url = 'http://www.yangdong.gov.cn/xwzx/gggs/index.html'  # 获取详情页报错
			
 
				+# url = 'https://www.guit.edu.cn/xwzx/tzgg.htm ' # 日志报错
			
 
				+
			
 
				+# rs = get_rs(url)
			
 
				+# print(rs)
			
 
				+
			
 
				+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # 翻页提取失败
			
 
				+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
			
 
				+url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
			
 
				+# url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复
			
 
				+# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
			
 
				+# url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # 所有要素提取失败, bug 已修复
			
 
				+# url = 'http://hsxzwgk.mas.gov.cn/opennessTarget/?branch_id=57a3df762c262ea9a00aadae&column_code=280200' #主页提取失败  #网页打不开# 404
			
 
				+# url = 'http://www.crra.org.cn/news/tongzhi/o1' # 执行js完毕  getRule_A_Date done 后卡住 已修复
			
 
				+
			
 
				+# url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # 翻页超时错误 已修复 提取正常
			
 
				+
			
 
				+# # url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常  重新提取 #翻页链接不匹配##下一页规则未获取#
			
 
				+# # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢，有时正常
			
 
				+# # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #列表页规则未获取#  网页打开报错 504
			
 
				+#
			
 
				+# # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复  列表页xpath预测错误
			
 
				+# # url = 'http://sz.nxeduyun.com/index.php?r=space/school/portal/content/index&sid=6ce9765e85694be7838c7f7272199346&cid=50160' #列表页获取失败 已修复
			
 
				+# # url = 'https://www.nbzjy.cn/list-gsgg.html' # #列表页规则未获取# 已解决
			
 
				+# # url = 'http://www.gdhealth.net.cn/index.php?m=content&c=index&a=lists&catid=38' # # #列表页规则未获取# chome浏览器打开异常 换另一个浏览器正常
			
 
				+# # url = 'http://www.kbs.gov.cn/ywdt/tzgg/index.html' #列表页规则未获取# iframe报错 已处理
			
 
				+# # url = 'http://www.xs9z.com/News.asp?PageNo=1&classid=17' #包含iframe 报错  已处理
			
 
				+# # url = 'http://www.tdxbmj.cn/html/qyxw1/index.html' #列表页规则未获取# 已优化处理，详情页时间没日期报错，标签id重复导致只提取到一个链接
			
 
				+# # url = 'http://www.sxsltlyy.com/newslist.php?cid=29'  # 列表页获取失败，详情页xpath错误  浏览器打开界面与selenium 的不一样  ua问题已修复
			
 
				+# # url = 'http://view.landtz.com:8092/jj/index' # #列表页规则未获取# 拍卖多个图标纵向列表   content_xpath of listpage is //*[@class="wp"]/div[2]/div[1]/a[1]/div[2] 预测错误
			
 
				+# # url = 'http://www.hbbidcloud.cn/suizhou/jyxx/004003/004003006/about.html' # #翻页链接不匹配##下一页规则未获取#  网页本身无翻页机制
			
 
				+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=3'  #翻页链接不匹配##下一页规则未获取##详情页列表页区分长度未识别#
			
 
				+# # url = 'https://www.sxeec.com/gpgg/p4.html' ##翻页链接不匹配##下一页规则未获取#  下一页在标签<i>，链接在父节点<a>标签
			
 
				+# # url = 'http://sthjj.liaoyuan.gov.cn/xxgk/tzgg/' #翻页链接不匹配  第二页开始规律  翻页超时导致拿不到翻页规则 无头模式打开网页超时， 正常模式不超时
			
 
				+# # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/index_3.html'  #翻页链接不匹配
			
 
				+# # url = 'http://bj.sxggzyjy.cn/jydt/001001/001001004/001001004001/subPage.html'  #翻页链接不匹配##下一页规则未获取#
			
 
				+# # url = 'http://www.tlgljs.com/cpzs.html'
			
 
				+# # url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
			
 
				+# # url = 'http://www.zqcyl.cn/zlzx/ggl/' #抛出异常导致返回结果失败，
			
 
				+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3'
			
 
				+# # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=1'
			
 
				+# # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
			
 
				+# # url = 'http://www.sxeec.com/gpgg.html'
			
 
				+# url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
			
 
				+# url = 'http://bbkx.bb.ah.cn/kxxw/tzgg/index.html'
			
 
				+# url = 'http://www.lzwhg.com/tongzhigonggao/'
			
 
				+# url = 'http://www.slwr.gov.cn/zfxxgk/gkml/216/240/257/list_640.htm'  # 列表页脚本异常
			
 
				+# url = 'http://view.landtz.com:8091/xh/index?resourceStatus=0&useType=&orderBy=0&title='
			
 
				+# url = 'http://ggzy.yueqing.gov.cn/yqwebnew/jyxx/001009/001009010/'
			
 
				+# url = 'http://ggzy.xjbt.gov.cn/TPFront/bt5/083003/083003002/083003002006/'
			
 
				+# url = 'http://www.longmen.gov.cn/xzfbm/xcl/zwgk/bmwj/tzgg/index.html'
			
 
				+# url = 'http://nyncj.yq.gov.cn/tzgg/'
			
 
				+url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
			
 
				+url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
			
 
				+url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
			
 
				+url = 'http://www.lzwhg.com/tongzhigonggao/'  #翻页失败
			
 
				+rs = get_rs(url)
			
 
				+print(rs)
			
 
				+
			
 
				+
			
 
				+
			
 
				+import pandas as pd
			
 
				+import time
			
 
				+l = []
			
 
				+def get_url_root(text):
			
 
				+    url = re.search('https?:[a-z0-9-./]+\.(cn|com|org|net|gov|edu|biz|cc|mil|top|pub|info)', text)
			
 
				+    if url:
			
 
				+        return url.group(0)
			
 
				+    else:
			
 
				+        return ''
			
 
				+def get_url(text):
			
 
				+    try:
			
 
				+        url = json.loads(text).get('ruleLink', '')
			
 
				+        return url
			
 
				+    except:
			
 
				+        print('CRAWLER_LINK json loads 出错：', text)
			
 
				+        return ''
			
 
				+
			
 
				+# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8.csv')[:]
			
 
				+# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
			
 
				+
			
 
				+# df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
			
 
				+df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
			
 
				+# df.drop_duplicates(subset=['首页网址'], inplace=True)
			
 
				+
			
 
				+#
			
 
				+# df['url_root'] = df['CRAWLER_LINK '].apply(lambda x:get_url_root(x))
			
 
				+# df['url'] = df['CRAWLER_LINK '].apply(lambda x:get_url(x))
			
 
				+# df = df[df['url']!=""]
			
 
				+# print(len(df))
			
 
				+# df.drop_duplicates(subset=['url_root'], inplace=True)
			
 
				+# print(len(df))
			
 
				+# df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
			
 
				+# # df = df[100:200]
			
 
				+df.reset_index(drop=True, inplace=True)
			
 
				+print(len(df), df.columns)
			
 
				+t0 = time.time()
			
 
				+for i in df.index:
			
 
				+    # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
			
 
				+    #     continue
			
 
				+    t1 = time.time()
			
 
				+    # url = df.loc[i, 'url']
			
 
				+    url = df.loc[i, '列表页链接']
			
 
				+    if not re.match('http', url):
			
 
				+        l.append('')
			
 
				+        print(url)
			
 
				+        continue
			
 
				+    print(url)
			
 
				+    rs = get_rs(url)
			
 
				+    # try:
			
 
				+    #     url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
			
 
				+    #     print(url)
			
 
				+    #     rs = get_rs(url)
			
 
				+    # except:
			
 
				+    #     rs = json.dumps({'err_msg': 'json loads link error'})
			
 
				+    print('耗时：', time.time()-t1)
			
 
				+    print(rs)
			
 
				+    l.append(rs)
			
 
				+df['rs3'] = pd.Series(l)
			
 
				+print('完成，总耗时：', time.time()-t0)
			
 
				+# # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
			
 
				+# df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
			
 
				+print('写入完成，总耗时：', time.time()-t0)
			
 
				+# #