3 yıl önce · de425b8143
--- a/module/Utils.py
+++ b/module/Utils.py
@@ -84,6 +84,8 @@ def xpath2css(xpath):
 
															     xpath = xpath.replace('//', '').replace('@', '').replace('/', '>')
														
 
															     for it in re.finditer('\[(\d)\]', xpath):
														
 
															         xpath = xpath.replace(it.group(0), ':nth-child(%s)'%it.group(1))
														
 
															+    if xpath[0] == '>':
														
 
															+        xpath = xpath[1:]
														
 
															     return xpath
														
 
															 def get_class_from_frame(fr):
														
@@ -827,7 +829,12 @@ function getXpath(el,b,notfirst){
 
															     for (var i=0,l=siblings.length;i<l;i++){
														
 
															         var sibling = siblings[i];
														
 
															         if (sibling==el){
														
 
															-            return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
														
 
															+            if(ix>1 || (ix==1 && i+1<siblings.length && siblings[i+1].tagName==el.tagName)){
														
 
															+                return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
														
 
															+            }else{
														
 
															+                return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase();
														
 
															+            }
														
 
															+            //return getXpath(el.parentNode,b)+'/'+el.tagName.toLowerCase()+'['+(ix)+']';
														
 
															         }else if (sibling.tagName==el.tagName){
														
 
															             ix++;
														
 
															         }
														
--- a/module/detail/content/featureEngine.py
+++ b/module/detail/content/featureEngine.py
@@ -133,7 +133,7 @@ function stastic_time(node,_array){
 
															         }
														
 
															     }
														
 
															-    if (!_find_flag && node!=document){
														
 
															+    if (!_find_flag && node!=document && node.tagName.toLowerCase()!='script'){
														
 
															         _array_fontSize = new Array();
														
 
															         getListFontSize(node,_array_fontSize);
														
 
															         _array.push([getOffsetLeft(node),getOffsetTop(node),getListXpath(node,new Array()),Math.min(_array_fontSize)]);
														
--- a/module/detail/extractor.py
+++ b/module/detail/extractor.py
@@ -77,6 +77,7 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
 
															     list_xpath_remove_content = []
														
 
															     list_data_time = []
														
 
															     list_xpaths_title = []
														
 
															+    list_xpaths_time = []
														
 
															     list_title_top = []
														
 
															     count_hrefs = 0
														
 
															     dict_rule_detail = dict()
														
@@ -99,25 +100,25 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
 
															             x,inner_html,list_xpath,data_time = data
														
 
															             _index = detailContentPredictor.predict(x)
														
 
															-            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
														
 
															+            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif|PDF|DOC|DOCX|XLS|XLSX).*?</a>|<a[^<]*?(中标通知书|合同|文件|附件).*?</a>'
														
 
															             total_annex = len(re.findall(pt, browser.page_source))
														
 
															             extract_annex = len(re.findall(pt, inner_html[_index]))
														
 
															             if total_annex > extract_annex and _index>5 and len(list_xpath[_index])>0:
														
 
															                 extract_xpath = list_xpath[_index][0][0]
														
 
															                 for i in range(_index-1, _index-5, -1):
														
 
															-                    if len(re.findall(pt, inner_html[i]))== total_annex:
														
 
															-                        log('规格调整模型正文提取附件不完整')
														
 
															+                    if len(re.findall(pt, inner_html[i]))== total_annex and inner_html[_index] in inner_html[i]:
														
 
															+                        log('规则调整模型正文提取附件不完整情况')
														
 
															                         _index = i
														
 
															                         break
														
 
															-                    elif len(list_xpath[i])>0 and list_xpath[i][0][0] not in extract_xpath:
														
 
															-                        break
														
 
															             _xpath = list_xpath[_index]
														
 
															             _xpath.reverse()
														
 
															             list_xpath_remove_content.append(_xpath)
														
 
															-            
														
 
															+            tmp_xpaths_time = []
														
 
															             for item in data_time:
														
 
															                 list_legal_time.append(item)
														
 
															+                tmp_xpaths_time.append(item[2][0])
														
 
															+            list_xpaths_time.append(tmp_xpaths_time)
														
 
															             _flag += 1
														
 
															         else:
														
 
															             hasGotten = False
														
@@ -161,6 +162,15 @@ def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
 
															                     set_remove_list = set(_xpath_remove[1])
														
 
															                 else:
														
 
															                     set_remove_list = set(_xpath_remove[1])&set_remove_list
														
 
															+
														
 
															+    commonxpath_time = None
														
 
															+    if len(list_xpaths_time)>2:
														
 
															+        xpath_time_set = set(list_xpaths_time[0])
														
 
															+        for i in range(1, len(list_xpaths_time)):
														
 
															+            xpath_time_set = xpath_time_set&set(list_xpaths_time[i])
														
 
															+        if len(xpath_time_set)==1:
														
 
															+            commonxpath_time = xpath_time_set.pop()
														
 
															+    # dict_rule_detail["detail_date"] = commonxpath_time
														
 
															     dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list!=None else []
														
 
															     dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
														
 
															     dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)
														
--- a/module/htmlDrawing.py
+++ b/module/htmlDrawing.py
@@ -44,7 +44,7 @@ def getBrowser_phantomJS(platform="linux",straight=False):
 
															     else:
														
 
															         executable_path = driver_paths["phantomjs_window"]
														
 
															     desired_capabilities= DesiredCapabilities.PHANTOMJS.copy()
														
 
															-    print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
														
 
															+    # print('os.path.exists executable_path', executable_path, os.path.exists(executable_path))
														
 
															     for key, value in header.items():
														
 
															         desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
														
 
															     desired_capabilities['phantomjs.page.customHeaders.User-Agent'] ='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.366'
														
--- a/module/listpage/content/featureEngine.py
+++ b/module/listpage/content/featureEngine.py
@@ -514,7 +514,7 @@ def getRule_A_Date(browser, url,content_xpath):
 
															     log("xpath of date:\t"+str(list_date[1][0])+"-"+str(list_date[0]))
														
 
															     log("length of A and Date:"+str(len(list_a[1]))+"-"+str(len(list_date[1])))
														
 
															-    if len(list_a[1])!=len(list_date[1]):
														
 
															+    if len(list_a[1])!=len(list_date[1]) and len(list_hrefs)>2 and len(set(list_hrefs[0])-set(list_hrefs[1]))>1:
														
 
															         dict_Rule_A_Date["flag"] = False
														
 
															         add_err_msg(dict_Rule_A_Date, "#列表页链接和标题数量不一致#")
														
 
															         return dict_Rule_A_Date,list_hrefs
														
--- a/module/listpage/pageTurn/engine.py
+++ b/module/listpage/pageTurn/engine.py
@@ -224,10 +224,12 @@ def getRuleOfUrl(first_url,second_url):
 
															     log("pageTurn second_url:\t"+second_url)
														
 
															     if len(split_all_first)!=len(split_all_second):
														
 
															         split_url = second_url.split('/')
														
 
															-        if split_url[-1]== 'index_2.html':
														
 
															+        if re.search('^index_[12].\w{3,5}$',split_url[-1]):
														
 
															+            suffix = split_url[-1].split('.')[1]
														
 
															+            page_begin = int(split_url[-1][6])
														
 
															             dict_rule["listpage_turn_before"] = '/'.join(split_url[:-1])+'/index_'
														
 
															-            dict_rule["listpage_turn_after"] = '.html'
														
 
															-            dict_rule["listpage_pageBegin"] = 2
														
 
															+            dict_rule["listpage_turn_after"] = '.'+suffix
														
 
															+            dict_rule["listpage_pageBegin"] = page_begin
														
 
															             dict_rule["listpage_pageStep"] = 1
														
 
															             return dict_rule
														
 
															         add_err_msg(dict_rule, "#翻页链接不匹配#")
														
@@ -359,7 +361,10 @@ def getTurnRule(browser,listpage_url):
 
															         dict_rule["flag"] = False
														
 
															     '''
														
 
															     #优先jsoup，后xpath
														
 
															-    if list_node1[1]==list_node2[1] and list_node1[1] is not None:
														
 
															+    if list_node1[0]is not None and hd.hasDrew(first_url, [{"rule":list_node1[0],"type":"xpath"}])==True:
														
 
															+        log('翻页链接经过渲染')
														
 
															+        dict_rule["listpage_nextPage"] = None
														
 
															+    elif list_node1[1]==list_node2[1] and list_node1[1] is not None:
														
 
															         dict_rule["listpage_nextPage"] = [list_node1[1],"jsoup"]
														
 
															     #只有2页的适配
														
 
															     elif list_node1[1] is not None and list_node2[1] is None:
														
--- a/module/run_single_server.py
+++ b/module/run_single_server.py
@@ -3,6 +3,7 @@ import sys
 
															 import json
														
 
															 import re
														
 
															 import os
														
 
															+os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
														
 
															 sys.path.append(os.path.abspath("../.."))
														
 
															 os.environ['KERAS_BACKEND']='tensorflow'
														
@@ -53,8 +54,13 @@ def transformInterface(_dict):
 
															     _nextPage = False
														
 
															     if listpage_nextPage:
														
 
															         _nextPage = True
														
 
															-    trans_dict["needGetNextPage"] = _nextPage
														
 
															-    if listpage_turn_before is not None and listpage_turn_after is not None and listpage_pageStep is not None:
														
 
															+        ruleType = 1
														
 
															+        ruleValue = xpath2css(listpage_nextPage[0]) if listpage_nextPage[1]=='xpath' else listpage_nextPage[0]
														
 
															+        trans_dict["nextPageRule"] = {"ruleType": ruleType,
														
 
															+                                      "paramOrder":1,
														
 
															+                                      "ruleValue":ruleValue}
														
 
															+        trans_dict["needGetNextPage"] = _nextPage
														
 
															+    elif listpage_turn_before is not None and listpage_turn_after is not None and listpage_pageStep is not None:
														
 
															         if listpage_pageStep>0:
														
 
															             paramOrder = 1
														
 
															         else:
														
@@ -71,6 +77,7 @@ def transformInterface(_dict):
 
															     else:
														
 
															         flag = False
														
 
															+        trans_dict["needGetNextPage"] = _nextPage
														
 
															     detail_date = _dict.get("detail_date")
														
 
															     trans_dict["needDetailTime"] = False
														
 
															     if detail_date:
														
@@ -144,6 +151,7 @@ def text_predict():
 
															         # 以json形式返回结果
														
 
															         log(" time from receive to send: "+str(time.time()-start_time))
														
 
															+        # print('返回结果: ',data)
														
 
															         data = transformInterface(data)
														
 
															         # log(str(data))
														
@@ -154,5 +162,5 @@ def text_predict():
 
															 if __name__ == '__main__':
														
 
															-    app.run(host='192.168.2.65', port=15015, threaded=True, debug=False) #15015  2.65
														
 
															+    app.run(host='192.168.2.102', port=15015, threaded=True, debug=False) #15015  2.65
														
 
															     log("ContentExtractor running")
														
--- a/module/testInterface.py
+++ b/module/testInterface.py
@@ -368,7 +368,7 @@ _count = 0
 
															 def get_rs(url):
														
 
															     user = {"listpage_url": url}
														
 
															-    _resp = requests.post("http://192.168.2.177:15015/content_extract", json=user, verify=True) #127.0.0.1  177
														
 
															+    _resp = requests.post("http://192.168.2.102:15015/content_extract", json=user, verify=True) #127.0.0.1  177
														
 
															     resp_json = _resp.content.decode("utf-8")
														
 
															     return resp_json
														
@@ -387,7 +387,7 @@ def get_rs(url):
 
															 # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # 翻页提取失败
														
 
															 # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
														
 
															-url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
														
 
															+# url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
														
 
															 # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复
														
 
															 # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
														
 
															 # url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # 所有要素提取失败, bug 已修复
														
@@ -396,8 +396,8 @@ url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有
 
															 # url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # 翻页超时错误 已修复 提取正常
														
 
															-# # url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常  重新提取 #翻页链接不匹配##下一页规则未获取#
														
 
															-# # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢，有时正常
														
 
															+# url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常  重新提取 #翻页链接不匹配##下一页规则未获取#
														
 
															+# url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢，有时正常
														
 
															 # # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #列表页规则未获取#  网页打开报错 504
														
 
															 #
														
 
															 # # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复  列表页xpath预测错误
														
@@ -434,9 +434,22 @@ url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有
 
															 url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
														
 
															 url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
														
 
															 url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
														
 
															-url = 'http://www.lzwhg.com/tongzhigonggao/'  #翻页失败
														
 
															+# url = 'http://www.lzwhg.com/tongzhigonggao/'  #翻页失败
														
 
															+# url = 'http://jmxhfy.chinacourt.gov.cn/article/index/id/M8xNNjBINzAwNiACAAA.shtml'  # js报错 getpath
														
 
															+# url = 'http://www.heshanshi.gov.cn/xxgk/zdlyxxgk/zdjsxmpzhss/ztbgcgk/' # 只有两页，翻页失败 ，已优化
														
 
															+url = 'http://www.hustp.com/index.php?s=/Index/noticeInfoList/type_id/11.html' # 只有两页，且不规律
														
 
															+# url = 'http://www.scncggzy.com.cn/TPFront/front_zfcg/071009/'
														
 
															+# url = 'http://www.stjs.org.cn/zbtb/zbtb_zhongbiaogg.aspx?page=1'
														
 
															+# url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
														
 
															+# url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
														
 
															+# url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
														
 
															+# url = 'http://www.gztpc.com/category/bidding.html?id=230'  # 附件丢失，代优化
														
 
															+# url = 'https://www.nbzjy.cn/list-gsgg.html' # #列表页规则未获取# 已解决
														
 
															+t1 = time.time()
														
 
															 rs = get_rs(url)
														
 
															+t2 = time.time()
														
 
															 print(rs)
														
 
															+print("耗时：", t2-t1)
														
@@ -461,7 +474,7 @@ def get_url(text):
 
															 # df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
														
 
															 # df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
														
 
															-df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
														
 
															+# df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
														
 
															 # df.drop_duplicates(subset=['首页网址'], inplace=True)
														
 
															 #
														
@@ -472,34 +485,34 @@ df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
 
															 # df.drop_duplicates(subset=['url_root'], inplace=True)
														
 
															 # print(len(df))
														
 
															 # df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
														
 
															-# # df = df[100:200]
														
 
															-df.reset_index(drop=True, inplace=True)
														
 
															-print(len(df), df.columns)
														
 
															-t0 = time.time()
														
 
															-for i in df.index:
														
 
															-    # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
														
 
															-    #     continue
														
 
															-    t1 = time.time()
														
 
															-    # url = df.loc[i, 'url']
														
 
															-    url = df.loc[i, '列表页链接']
														
 
															-    if not re.match('http', url):
														
 
															-        l.append('')
														
 
															-        print(url)
														
 
															-        continue
														
 
															-    print(url)
														
 
															-    rs = get_rs(url)
														
 
															-    # try:
														
 
															-    #     url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
														
 
															-    #     print(url)
														
 
															-    #     rs = get_rs(url)
														
 
															-    # except:
														
 
															-    #     rs = json.dumps({'err_msg': 'json loads link error'})
														
 
															-    print('耗时：', time.time()-t1)
														
 
															-    print(rs)
														
 
															-    l.append(rs)
														
 
															-df['rs3'] = pd.Series(l)
														
 
															-print('完成，总耗时：', time.time()-t0)
														
 
															-# # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
														
 
															-# df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
														
 
															-print('写入完成，总耗时：', time.time()-t0)
														
 
															-# #
														
 
															+# # # df = df[100:200]
														
 
															+# df.reset_index(drop=True, inplace=True)
														
 
															+# print(len(df), df.columns)
														
 
															+# t0 = time.time()
														
 
															+# for i in df.index:
														
 
															+#     # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
														
 
															+#     #     continue
														
 
															+#     t1 = time.time()
														
 
															+#     # url = df.loc[i, 'url']
														
 
															+#     url = df.loc[i, '列表页链接']
														
 
															+#     if not re.match('http', url):
														
 
															+#         l.append('')
														
 
															+#         print(url)
														
 
															+#         continue
														
 
															+#     print(url)
														
 
															+#     rs = get_rs(url)
														
 
															+#     # try:
														
 
															+#     #     url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
														
 
															+#     #     print(url)
														
 
															+#     #     rs = get_rs(url)
														
 
															+#     # except:
														
 
															+#     #     rs = json.dumps({'err_msg': 'json loads link error'})
														
 
															+#     print('耗时：', time.time()-t1)
														
 
															+#     print(rs)
														
 
															+#     l.append(rs)
														
 
															+# df['rs3'] = pd.Series(l)
														
 
															+# print('完成，总耗时：', time.time()-t0)
														
 
															+# # # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
														
 
															+# # df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
														
 
															+# print('写入完成，总耗时：', time.time()-t0)
														
 
															+# # #