
Handle cases where attachments may be missed from the extracted body; improve handling of list pages with only one or two pages, plus other optimizations

lsm committed 2 years ago
commit de425b8143

+ 8 - 1
module/Utils.py

@@ -84,6 +84,8 @@ def xpath2css(xpath):
     xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
     for it in re.finditer('\[(\d)\]', xpath):
         xpath = xpath.replace(it.group(0), ':nth-child(%s)'%it.group(1))
+    if xpath[0] == '>':
+        xpath = xpath[1:]
     return xpath

 def get_class_from_frame(fr):
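
The new guard covers absolute xpaths that begin with a single '/': after the '/' → '>' replacement the selector would otherwise start with a dangling '>'. A condensed, runnable sketch of the patched helper (the real function may contain extra handling not shown in this hunk; the sample xpaths are invented):

import re

def xpath2css(xpath):
    # Rewrite a simple xpath into a rough css path, as in module/Utils.py after this commit.
    xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
    for it in re.finditer('\[(\d)\]', xpath):
        xpath = xpath.replace(it.group(0), ':nth-child(%s)' % it.group(1))
    if xpath[0] == '>':
        xpath = xpath[1:]
    return xpath

print(xpath2css('//div[2]/ul/li[3]/a'))  # div:nth-child(2)>ul>li:nth-child(3)>a
print(xpath2css('/html/body/div[1]'))    # html>body>div:nth-child(1)  (leading '>' now stripped)
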
@@ -827,7 +829,12 @@ function getXpath(el,b,notfirst){
     for (var i=0,l=siblings.length;i<l;i++){
         var sibling = siblings[i];
         if (sibling==el){
-            return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
+            if(ix>1 || (ix==1 && i+1<siblings.length && siblings[i+1].tagName==el.tagName)){
+                return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
+            }else{
+                return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase();
+            }
+            //return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
         }else if (sibling.tagName==el.tagName){
             ix++;
         }
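
The getXpath change keeps the positional predicate only when the element actually shares its tag with a sibling (the JS checks the count of preceding same-tag siblings plus the immediately following sibling). A rough Python analogue of that rule, using lxml purely for illustration and not part of this commit:

from lxml import html

def xpath_of(el):
    # Build an xpath segment per ancestor; add [n] only when same-tag siblings exist.
    parent = el.getparent()
    if parent is None:
        return '/' + el.tag
    same_tag = [c for c in parent if c.tag == el.tag]
    if len(same_tag) > 1:
        ix = same_tag.index(el) + 1            # xpath positions are 1-based
        return '%s/%s[%d]' % (xpath_of(parent), el.tag, ix)
    return '%s/%s' % (xpath_of(parent), el.tag)

doc = html.fromstring('<div><span>t</span><a href="#">next</a></div>')
print(xpath_of(doc.xpath('//a')[0]))   # /div/a  -- no [1], the <a> has no same-tag sibling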

+ 1 - 1
module/detail/content/featureEngine.py

@@ -133,7 +133,7 @@ function stastic_time(node,_array){
         }
     }

-    if (!_find_flag && node!=document){
+    if (!_find_flag && node!=document && node.tagName.toLowerCase()!='script'){
         _array_fontSize = new Array();
         getListFontSize(node,_array_fontSize);
         _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min(_array_fontSize)]);

+ 16 - 6
module/detail/extractor.py

@@ -77,6 +77,7 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
     list_xpath_remove_content = []
     list_data_time = []
     list_xpaths_title = []
+    list_xpaths_time = []
     list_title_top = []
     count_hrefs = 0
     dict_rule_detail = dict()
@@ -99,25 +100,25 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
             x,inner_html,list_xpath,data_time = data
             _index = detailContentPredictor.predict(x)

-            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
+            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif|PDF|DOC|DOCX|XLS|XLSX).*?</a>|<a[^<]*?(中标通知书|合同|文件|附件).*?</a>'
             total_annex = len(re.findall(pt, browser.page_source))
             extract_annex = len(re.findall(pt, inner_html[_index]))
             if total_annex > extract_annex and _index>5 and len(list_xpath[_index])>0:
                 extract_xpath = list_xpath[_index][0][0]
                 for i in range(_index-1, _index-5, -1):
-                    if len(re.findall(pt, inner_html[i]))== total_annex:
-                        log('规格调整模型正文提取附件不完整')
+                    if len(re.findall(pt, inner_html[i]))== total_annex and inner_html[_index] in inner_html[i]:
+                        log('规则调整模型正文提取附件不完整情况')
                         _index = i
                         break
-                    elif len(list_xpath[i])>0 and list_xpath[i][0][0] not in extract_xpath:
-                        break

             _xpath = list_xpath[_index]
             _xpath.reverse()
             list_xpath_remove_content.append(_xpath)
-            
+            tmp_xpaths_time = []
             for item in data_time:
                 list_legal_time.append(item)
+                tmp_xpaths_time.append(item[2][0])
+            list_xpaths_time.append(tmp_xpaths_time)
             _flag += 1
         else:
             hasGotten = False
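
The widened pattern now also counts upper-case extensions and anchors whose text mentions 中标通知书/合同/文件/附件, so an extracted body with fewer attachment links than the full page triggers the fallback to an earlier candidate node. A small sketch of the counting step (the HTML snippets are invented for illustration):

import re

pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif|PDF|DOC|DOCX|XLS|XLSX).*?</a>' \
     '|<a[^<]*?(中标通知书|合同|文件|附件).*?</a>'

page_source = '<a href="/f/a.PDF">招标文件.PDF</a><p>正文</p><a href="/notice/1">中标通知书</a>'
body_html   = '<a href="/f/a.PDF">招标文件.PDF</a>'

total_annex   = len(re.findall(pt, page_source))   # 2: one matched by extension, one by keyword
extract_annex = len(re.findall(pt, body_html))     # 1: the extracted body misses a link
print(total_annex > extract_annex)                 # True -> try an earlier candidate node
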
@@ -161,6 +162,15 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
                     set_remove_list = set(_xpath_remove[1])
                 else:
                     set_remove_list = set(_xpath_remove[1])&set_remove_list
+
+    commonxpath_time = None
+    if len(list_xpaths_time)>2:
+        xpath_time_set = set(list_xpaths_time[0])
+        for i in range(1, len(list_xpaths_time)):
+            xpath_time_set = xpath_time_set&set(list_xpaths_time[i])
+        if len(xpath_time_set)==1:
+            commonxpath_time = xpath_time_set.pop()
+    # dict_rule_detail["detail_date"] = commonxpath_time
     dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list!=None else []
     dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
     dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)
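
The new block intersects the publish-time xpaths collected from each detail page and keeps a candidate only when exactly one xpath survives across all pages; for now the result only feeds the commented-out assignment, and detail_date still comes from getCommonXpath_time. A minimal sketch of the intersection (sample xpaths invented):

list_xpaths_time = [
    ['/html/body/div[2]/span', '/html/body/div[3]/em'],
    ['/html/body/div[2]/span'],
    ['/html/body/div[2]/span', '/html/body/p[1]'],
]

commonxpath_time = None
if len(list_xpaths_time) > 2:
    xpath_time_set = set(list_xpaths_time[0])
    for xpaths in list_xpaths_time[1:]:
        xpath_time_set &= set(xpaths)
    if len(xpath_time_set) == 1:
        commonxpath_time = xpath_time_set.pop()

print(commonxpath_time)   # -> /html/body/div[2]/span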

+ 1 - 1
module/htmlDrawing.py

@@ -44,7 +44,7 @@ def getBrowser_phantomJS(platform="linux",straight=False):
     else:
         executable_path = driver_paths["phantomjs_window"]
     desired_capabilities= DesiredCapabilities.PHANTOMJS.copy()
-    print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
+    # print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
     for key, value in header.items():
         desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
     desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'

+ 1 - 1
module/listpage/content/featureEngine.py

@@ -514,7 +514,7 @@ def getRule_A_Date(browser, url,content_xpath):
     log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))

     log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
-    if len(list_a[1])!=len(list_date[1]):
+    if len(list_a[1])!=len(list_date[1]) and len(list_hrefs)>2 and len(set(list_hrefs[0])-set(list_hrefs[1]))>1:
         dict_Rule_A_Date["flag"] = False
         add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
         return dict_Rule_A_Date,list_hrefs

+ 9 - 4
module/listpage/pageTurn/engine.py

@@ -224,10 +224,12 @@ def getRuleOfUrl(first_url,second_url):
     log("pageTurn second_url:\t"+second_url)
     if len(split_all_first)!=len(split_all_second):
         split_url = second_url.split('/')
-        if split_url[-1]== 'index_2.html':
+        if re.search('^index_[12].\w{3,5}$',split_url[-1]):
+            suffix = split_url[-1].split('.')[1]
+            page_begin = int(split_url[-1][6])
             dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
-            dict_rule["listpage_turn_after"] = '.html'
-            dict_rule["listpage_pageBegin"] = 2
+            dict_rule["listpage_turn_after"] = '.'+suffix
+            dict_rule["listpage_pageBegin"] = page_begin
             dict_rule["listpage_pageStep"] = 1
             return dict_rule
         add_err_msg(dict_rule, "#翻页链接不匹配#")
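
The rule previously only recognised a second-page URL that was literally index_2.html; it now accepts index_1/index_2 with any 3-5 character suffix and derives the suffix and starting page number from the URL itself. A runnable sketch with an invented sample URL:

import re

second_url = 'http://example.gov.cn/zwgk/zbgg/index_2.shtml'
split_url = second_url.split('/')

dict_rule = {}
if re.search('^index_[12].\w{3,5}$', split_url[-1]):
    suffix = split_url[-1].split('.')[1]          # 'shtml'
    page_begin = int(split_url[-1][6])            # character right after 'index_' -> 2
    dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1]) + '/index_'
    dict_rule["listpage_turn_after"] = '.' + suffix
    dict_rule["listpage_pageBegin"] = page_begin
    dict_rule["listpage_pageStep"] = 1

print(dict_rule)
# {'listpage_turn_before': 'http://example.gov.cn/zwgk/zbgg/index_',
#  'listpage_turn_after': '.shtml', 'listpage_pageBegin': 2, 'listpage_pageStep': 1}
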
@@ -359,7 +361,10 @@ def getTurnRule(browser,listpage_url):
         dict_rule["flag"] = False
     '''
     #优先jsoup,后xpath
-    if list_node1[1]==list_node2[1] and list_node1[1] is not None:
+    if list_node1[0]is not None and hd.hasDrew(first_url, [{"rule":list_node1[0],"type":"xpath"}])==True:
+        log('翻页链接经过渲染')
+        dict_rule["listpage_nextPage"] = None
+    elif list_node1[1]==list_node2[1] and list_node1[1] is not None:
         dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
     #只有2页的适配
     elif list_node1[1] is not None and list_node2[1] is None:

+ 11 - 3
module/run_single_server.py

@@ -3,6 +3,7 @@ import sys
 import json
 import re
 import os
+os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
 sys.path.append(os.path.abspath("../.."))

 os.environ['KERAS_BACKEND']='tensorflow'
@@ -53,8 +54,13 @@ def transformInterface(_dict):
     _nextPage = False
     if listpage_nextPage:
         _nextPage = True
-    trans_dict["needGetNextPage"] = _nextPage
-    if listpage_turn_before is not None and listpage_turn_after is not None and listpage_pageStep is not None:
+        ruleType = 1
+        ruleValue = xpath2css(listpage_nextPage[0]) if listpage_nextPage[1]=='xpath' else listpage_nextPage[0]
+        trans_dict["nextPageRule"] = {"ruleType": ruleType,
+                                      "paramOrder":1,
+                                      "ruleValue":ruleValue}
+        trans_dict["needGetNextPage"] = _nextPage
+    elif listpage_turn_before is not None and listpage_turn_after is not None and listpage_pageStep is not None:
         if listpage_pageStep>0:
             paramOrder = 1
         else:
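
When a next-page element was found, transformInterface now emits a nextPageRule object instead of only setting the flag, converting xpath-type rules to css via xpath2css. A condensed sketch with an invented jsoup-type rule (an xpath-type rule would pass through xpath2css first):

def build_next_page_rule(listpage_nextPage):
    # Mirrors the branch above for the simple case where the rule is already a
    # css/jsoup selector; a real xpath rule would be converted with xpath2css.
    trans_dict = {}
    _nextPage = False
    if listpage_nextPage:
        _nextPage = True
        ruleValue = listpage_nextPage[0]
        trans_dict["nextPageRule"] = {"ruleType": 1,
                                      "paramOrder": 1,
                                      "ruleValue": ruleValue}
        trans_dict["needGetNextPage"] = _nextPage
    return trans_dict

print(build_next_page_rule(['div.pagination > a.next', 'jsoup']))
# {'nextPageRule': {'ruleType': 1, 'paramOrder': 1, 'ruleValue': 'div.pagination > a.next'},
#  'needGetNextPage': True}
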
@@ -71,6 +77,7 @@

     else:
         flag = False
+        trans_dict["needGetNextPage"] = _nextPage
     detail_date = _dict.get("detail_date")
     trans_dict["needDetailTime"] = False
     if detail_date:
@@ -144,6 +151,7 @@ def text_predict():

         # 以json形式返回结果
         log(" time from receive to send: "+str(time.time()-start_time))
+        # print('返回结果: ',data)

         data = transformInterface(data)
         # log(str(data))
@@ -154,5 +162,5 @@


 if __name__ == '__main__':
-    app.run(host='192.168.2.65', port=15015, threaded=True, debug=False) #15015  2.65
+    app.run(host='192.168.2.102', port=15015, threaded=True, debug=False) #15015  2.65
     log("ContentExtractor running")

+ 50 - 37
module/testInterface.py

@@ -368,7 +368,7 @@ _count = 0

 def get_rs(url):
     user = {"listpage_url": url}
-    _resp = requests.post("http://192.168.2.177:15015/content_extract", json=user, verify=True) #127.0.0.1  177
+    _resp = requests.post("http://192.168.2.102:15015/content_extract", json=user, verify=True) #127.0.0.1  177
     resp_json = _resp.content.decode("utf-8")
     return resp_json

@@ -387,7 +387,7 @@ def get_rs(url):

 # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # 翻页提取失败
 # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
-url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
+# url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
 # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复
 # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
 # url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # 所有要素提取失败, bug 已修复
@@ -396,8 +396,8 @@ url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有

 # url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # 翻页超时错误 已修复 提取正常

-# # url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常  重新提取 #翻页链接不匹配##下一页规则未获取#
-# # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢,有时正常
+# url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常  重新提取 #翻页链接不匹配##下一页规则未获取#
+# url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢,有时正常
 # # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #列表页规则未获取#  网页打开报错 504
 #
 # # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复  列表页xpath预测错误
@@ -434,9 +434,22 @@ url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有
 url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
 url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
 url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
-url = 'http://www.lzwhg.com/tongzhigonggao/'  #翻页失败
+# url = 'http://www.lzwhg.com/tongzhigonggao/'  #翻页失败
+# url = 'http://jmxhfy.chinacourt.gov.cn/article/index/id/M8xNNjBINzAwNiACAAA.shtml'  # js报错 getpath
+# url = 'http://www.heshanshi.gov.cn/xxgk/zdlyxxgk/zdjsxmpzhss/ztbgcgk/' # 只有两页,翻页失败 ,已优化
+url = 'http://www.hustp.com/index.php?s=/Index/noticeInfoList/type_id/11.html' # 只有两页,且不规律
+# url = 'http://www.scncggzy.com.cn/TPFront/front_zfcg/071009/'
+# url = 'http://www.stjs.org.cn/zbtb/zbtb_zhongbiaogg.aspx?page=1'
+# url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
+# url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
+# url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
+# url = 'http://www.gztpc.com/category/bidding.html?id=230'  # 附件丢失,代优化
+# url = 'https://www.nbzjy.cn/list-gsgg.html' # #列表页规则未获取# 已解决
+t1 = time.time()
 rs = get_rs(url)
+t2 = time.time()
 print(rs)
+print("耗时:", t2-t1)



@@ -461,7 +474,7 @@ def get_url(text):
 # df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]

 # df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
-df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
+# df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
 # df.drop_duplicates(subset=['首页网址'], inplace=True)

 #
@@ -472,34 +485,34 @@ df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
 # df.drop_duplicates(subset=['url_root'], inplace=True)
 # print(len(df))
 # df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
-# # df = df[100:200]
-df.reset_index(drop=True, inplace=True)
-print(len(df), df.columns)
-t0 = time.time()
-for i in df.index:
-    # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
-    #     continue
-    t1 = time.time()
-    # url = df.loc[i, 'url']
-    url = df.loc[i, '列表页链接']
-    if not re.match('http', url):
-        l.append('')
-        print(url)
-        continue
-    print(url)
-    rs = get_rs(url)
-    # try:
-    #     url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
-    #     print(url)
-    #     rs = get_rs(url)
-    # except:
-    #     rs = json.dumps({'err_msg': 'json loads link error'})
-    print('耗时:', time.time()-t1)
-    print(rs)
-    l.append(rs)
-df['rs3'] = pd.Series(l)
-print('完成,总耗时:', time.time()-t0)
-# # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
-# df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
-print('写入完成,总耗时:', time.time()-t0)
-# #
+# # # df = df[100:200]
+# df.reset_index(drop=True, inplace=True)
+# print(len(df), df.columns)
+# t0 = time.time()
+# for i in df.index:
+#     # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
+#     #     continue
+#     t1 = time.time()
+#     # url = df.loc[i, 'url']
+#     url = df.loc[i, '列表页链接']
+#     if not re.match('http', url):
+#         l.append('')
+#         print(url)
+#         continue
+#     print(url)
+#     rs = get_rs(url)
+#     # try:
+#     #     url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
+#     #     print(url)
+#     #     rs = get_rs(url)
+#     # except:
+#     #     rs = json.dumps({'err_msg': 'json loads link error'})
+#     print('耗时:', time.time()-t1)
+#     print(rs)
+#     l.append(rs)
+# df['rs3'] = pd.Series(l)
+# print('完成,总耗时:', time.time()-t0)
+# # # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
+# # df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
+# print('写入完成,总耗时:', time.time()-t0)
+# # #