|
@@ -368,7 +368,7 @@ _count = 0
|
|
|
|
|
|
def get_rs(url):
|
|
def get_rs(url):
|
|
user = {"listpage_url": url}
|
|
user = {"listpage_url": url}
|
|
- _resp = requests.post("http://192.168.2.177:15015/content_extract", json=user, verify=True) #127.0.0.1 177
|
|
|
|
|
|
+ _resp = requests.post("http://192.168.2.102:15015/content_extract", json=user, verify=True) #127.0.0.1 177
|
|
resp_json = _resp.content.decode("utf-8")
|
|
resp_json = _resp.content.decode("utf-8")
|
|
return resp_json
|
|
return resp_json
|
|
|
|
|
|
@@ -387,7 +387,7 @@ def get_rs(url):
|
|
|
|
|
|
# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # 翻页提取失败
|
|
# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # 翻页提取失败
|
|
# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
|
|
# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
|
|
-url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
|
|
|
|
|
|
+# url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
|
|
# url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复
|
|
# url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复
|
|
# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
|
|
# url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
|
|
# url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # 所有要素提取失败, bug 已修复
|
|
# url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # 所有要素提取失败, bug 已修复
|
|
@@ -396,8 +396,8 @@ url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有
|
|
|
|
|
|
# url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # 翻页超时错误 已修复 提取正常
|
|
# url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # 翻页超时错误 已修复 提取正常
|
|
|
|
|
|
-# # url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常 重新提取 #翻页链接不匹配##下一页规则未获取#
|
|
|
|
-# # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢,有时正常
|
|
|
|
|
|
+# url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常 重新提取 #翻页链接不匹配##下一页规则未获取#
|
|
|
|
+# url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢,有时正常
|
|
# # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #列表页规则未获取# 网页打开报错 504
|
|
# # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #列表页规则未获取# 网页打开报错 504
|
|
#
|
|
#
|
|
# # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复 列表页xpath预测错误
|
|
# # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复 列表页xpath预测错误
|
|
@@ -434,9 +434,22 @@ url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有
|
|
url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
|
|
url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
|
|
url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
|
|
url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
|
|
url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
|
|
url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
|
|
-url = 'http://www.lzwhg.com/tongzhigonggao/' #翻页失败
|
|
|
|
|
|
+# url = 'http://www.lzwhg.com/tongzhigonggao/' #翻页失败
|
|
|
|
+# url = 'http://jmxhfy.chinacourt.gov.cn/article/index/id/M8xNNjBINzAwNiACAAA.shtml' # js报错 getpath
|
|
|
|
+# url = 'http://www.heshanshi.gov.cn/xxgk/zdlyxxgk/zdjsxmpzhss/ztbgcgk/' # 只有两页,翻页失败 ,已优化
|
|
|
|
+url = 'http://www.hustp.com/index.php?s=/Index/noticeInfoList/type_id/11.html' # 只有两页,且不规律
|
|
|
|
+# url = 'http://www.scncggzy.com.cn/TPFront/front_zfcg/071009/'
|
|
|
|
+# url = 'http://www.stjs.org.cn/zbtb/zbtb_zhongbiaogg.aspx?page=1'
|
|
|
|
+# url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
|
|
|
|
+# url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
|
|
|
|
+# url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
|
|
|
|
+# url = 'http://www.gztpc.com/category/bidding.html?id=230' # 附件丢失,待优化
|
|
|
|
+# url = 'https://www.nbzjy.cn/list-gsgg.html' # #列表页规则未获取# 已解决
|
|
|
|
+t1 = time.time()
|
|
rs = get_rs(url)
|
|
rs = get_rs(url)
|
|
|
|
+t2 = time.time()
|
|
print(rs)
|
|
print(rs)
|
|
|
|
+print("耗时:", t2-t1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -461,7 +474,7 @@ def get_url(text):
|
|
# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
|
|
# df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
|
|
|
|
|
|
# df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
|
|
# df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
|
|
-df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
|
|
|
|
|
|
+# df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
|
|
# df.drop_duplicates(subset=['首页网址'], inplace=True)
|
|
# df.drop_duplicates(subset=['首页网址'], inplace=True)
|
|
|
|
|
|
#
|
|
#
|
|
@@ -472,34 +485,34 @@ df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
|
|
# df.drop_duplicates(subset=['url_root'], inplace=True)
|
|
# df.drop_duplicates(subset=['url_root'], inplace=True)
|
|
# print(len(df))
|
|
# print(len(df))
|
|
# df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
|
|
# df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
|
|
-# # df = df[100:200]
|
|
|
|
-df.reset_index(drop=True, inplace=True)
|
|
|
|
-print(len(df), df.columns)
|
|
|
|
-t0 = time.time()
|
|
|
|
-for i in df.index:
|
|
|
|
- # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
|
|
|
|
- # continue
|
|
|
|
- t1 = time.time()
|
|
|
|
- # url = df.loc[i, 'url']
|
|
|
|
- url = df.loc[i, '列表页链接']
|
|
|
|
- if not re.match('http', url):
|
|
|
|
- l.append('')
|
|
|
|
- print(url)
|
|
|
|
- continue
|
|
|
|
- print(url)
|
|
|
|
- rs = get_rs(url)
|
|
|
|
- # try:
|
|
|
|
- # url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
|
|
|
|
- # print(url)
|
|
|
|
- # rs = get_rs(url)
|
|
|
|
- # except:
|
|
|
|
- # rs = json.dumps({'err_msg': 'json loads link error'})
|
|
|
|
- print('耗时:', time.time()-t1)
|
|
|
|
- print(rs)
|
|
|
|
- l.append(rs)
|
|
|
|
-df['rs3'] = pd.Series(l)
|
|
|
|
-print('完成,总耗时:', time.time()-t0)
|
|
|
|
-# # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
|
|
|
|
-# df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
|
|
|
|
-print('写入完成,总耗时:', time.time()-t0)
|
|
|
|
-# #
|
|
|
|
|
|
+# # # df = df[100:200]
|
|
|
|
+# df.reset_index(drop=True, inplace=True)
|
|
|
|
+# print(len(df), df.columns)
|
|
|
|
+# t0 = time.time()
|
|
|
|
+# for i in df.index:
|
|
|
|
+# # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
|
|
|
|
+# # continue
|
|
|
|
+# t1 = time.time()
|
|
|
|
+# # url = df.loc[i, 'url']
|
|
|
|
+# url = df.loc[i, '列表页链接']
|
|
|
|
+# if not re.match('http', url):
|
|
|
|
+# l.append('')
|
|
|
|
+# print(url)
|
|
|
|
+# continue
|
|
|
|
+# print(url)
|
|
|
|
+# rs = get_rs(url)
|
|
|
|
+# # try:
|
|
|
|
+# # url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
|
|
|
|
+# # print(url)
|
|
|
|
+# # rs = get_rs(url)
|
|
|
|
+# # except:
|
|
|
|
+# # rs = json.dumps({'err_msg': 'json loads link error'})
|
|
|
|
+# print('耗时:', time.time()-t1)
|
|
|
|
+# print(rs)
|
|
|
|
+# l.append(rs)
|
|
|
|
+# df['rs3'] = pd.Series(l)
|
|
|
|
+# print('完成,总耗时:', time.time()-t0)
|
|
|
|
+# # # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
|
|
|
|
+# # df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
|
|
|
|
+# print('写入完成,总耗时:', time.time()-t0)
|
|
|
|
+# # #
|