# testInterface.py
'''
Created on 2018-12-26

@author: User
'''
  5. import sys
  6. import os
  7. import json
  8. import codecs
  9. import re
  10. sys.path.append(os.path.abspath("../.."))
  11. import requests
  12. import time
  13. list_url = ["http://www.csssyxx.com/xwgk/tzgg",
  14. "http://nyj.yueyang.gov.cn/nyj/7869/7871/default.htm",
  15. "http://www.xp.gov.cn/zwgk/072011/govmore.html",
  16. "http://www.yiyang.gov.cn/sjkzx/4464/4469/default.htm",
  17. "http://lsyshjj.leshan.gov.cn/szwfwzx/tzgg/list.shtml",
  18. "http://szrzyj.ziyang.gov.cn/catalog.aspx?id=10",
  19. "http://sww.chengdu.gov.cn/cdswh/gsgg/gdlist1526618933789.shtml",
  20. "http://cdaudit.chengdu.gov.cn/cdsjj/c113200/list_1.shtml",
  21. "http://nyj.yueyang.gov.cn/nyj/7869/7871/default.htm",
  22. "http://yyjt.yiyang.gov.cn/yyjtj/6270/6279/6288/6303/default.htm",
  23. "http://www.xp.gov.cn/zwgk/072011/govmore.html",
  24. "http://www.yiyang.gov.cn/sjkzx/4464/4469/default.htm",
  25. "http://lsyshjj.leshan.gov.cn/szwfwzx/tzgg/list.shtml",
  26. "http://szrzyj.ziyang.gov.cn/catalog.aspx?id=10",
  27. "http://jtys.chengdu.gov.cn/cdjt/c108497/xw_list.shtml",
  28. "http://sww.chengdu.gov.cn/cdswh/gsgg/gdlist1526618933789.shtml",
  29. "http://sthj.chengdu.gov.cn/cdhbj/c110798/list_1.shtml",
  30. "http://www.djy.gov.cn/djyszfmhwz/c130537/xwzx_list.shtml",
  31. "http://jhj.my.gov.cn//xwzx/tzgg/index.html",
  32. "http://cl.my.gov.cn/tzgg/index.html",
  33. "http://dag.my.gov.cn/tzgg/index.html",
  34. "http://fgw.my.gov.cn/tzgg/index.html",
  35. "http://nyncj.my.gov.cn/tzgg/index.html",
  36. "http://jyfwzx.my.gov.cn/gggs/index.html",
  37. "http://jxj.my.gov.cn/gsgg/index.html",
  38. "http://mymz.my.gov.cn/zwgk/tzgg/index.html",
  39. "http://wgl.my.gov.cn/gggs/index.html",
  40. "http://scjg.my.gov.cn/zwdt/gsgg/index.html",
  41. "http://www.zitong.gov.cn/content/column/4704911?pageIndex=1",
  42. "http://www.hunancatv.com/tender.aspx?class=64&channel=57",
  43. "http://www.yantaibank.net/publish/ytbank/31388/31548/index.html#main",
  44. "http://www.tjbhb.com/bhyhww/jzcg/zjgg/index.html",
  45. "http://jw.guiyang.gov.cn/c8346/",
  46. "http://ggzyjy.lsz.gov.cn/TPFront/jyxx/005001/005001003/",
  47. "http://ggzyjy.lsz.gov.cn/TPFront/jyxx/005001/005001008/",
  48. "http://www.hflyzx.net/Nav_dongtai.shtml?whichpage=1&SS_ID=11",
  49. "http://hwcz.snxjyj.cn/xwdt/tzgg.htm",
  50. "http://www.jtdzpt.com/expsteel/exp/tender/sell/bout/moreTdBoutResult.htm?type=1&status=3",
  51. "http://www.cinda.com.cn/xdjt/xdjtpd/cgxxgs/list.shtml",
  52. "http://www.sdebank.com/cms/S101_21/infoCenter/pmxx/index.html",
  53. "http://bid.9to.com/list.php/catid-236/",
  54. "http://www.bankcomm.com/BankCommSite/shtml/jyjr/cn/7804/2600473/2600509/list_1.shtml?channelId=7804",
  55. "http://www.xtjfjt.com/a/xinwenzhongxin/tongzhigonggao/",
  56. "http://taochonghu.hf168.net/Home/News/67?pageindex=1",
  57. "http://www.hfsjcxx.com/SortHtml/1/List_19.html",
  58. "http://www.cimeec.com/mxshop/web/goods.do",
  59. "https://www.znjjzx.net/col.jsp?id=116",
  60. "http://www.fcd.com.cn/tzgg/",
  61. "http://www.bnds.cn/news/4.html",
  62. "http://www.tjflis.com/webDisplay/ESite/More.jsp?strID=1117&num=3",
  63. "http://www.nyjyedu.cn/content/init.action?channelId=channel_302&page=1",
  64. "http://www.ouswgd.cn/swou/xnzb/list2.shtml",
  65. "http://www.lsgjsyxx.com/gonewlist?columnId=32",
  66. "https://www.hbdx.gov.cn/info_infoCategory.jsp?data_name=tzgg_new",
  67. "http://www.hnswdx.gov.cn/news/tzgg/tzgg/index.html",
  68. "http://sdx.maoming.gov.cn/newslist.aspx?ID=57",
  69. "http://fjclzz.com/new_list.aspx?fid=9",
  70. "http://fd4zh.30edu.com.cn/Article/46a834fb-c5e2-4805-983d-8162e9887e7e/1.shtml",
  71. "http://www.zhgxqrmyy.com/news/93/",
  72. "http://bdzc.sasu.edu.cn/list.php?pageone=0&cid=28",
  73. "http://sccyy.chuzhou.gov.cn/3903378.html",
  74. "http://www.ymgfgs.cn/index.php/news/admin/1/cn/104/104.html",
  75. "http://www.tjbhb.com/bhyhww/jzcg/zbjg/index.html",
  76. "http://www.mdzx.net/index.php/xsxwin/tzgg/",
  77. "http://www.zzsfybjy.com/index.php?m=content&c=index&a=lists&catid=24",
  78. "http://www.cndnce.com/bidding/barginlist/List_1.aspx",
  79. "http://www.cndnce.com/bidding/notice/List_1.aspx",
  80. "http://www.bankcomm.com/BankCommSite/shtml/jyjr/cn/7804/2600473/2600510/list_1.shtml?channelId=7804",
  81. "http://www.hljnkzyy.org.cn/index.php?p=news_list&lanmu=12",
  82. "http://www.hsxrmyy.com/zhaobiao/menu.aspx",
  83. "http://www.zhswdx.cn/portal/rest/menu/ummenuTwoList?menu=zytz&siteType=zytz",
  84. "https://www.bjsx.com.cn/list.jsp?totalpage=19&PAGENUM=2&urltype=tree.TreeTempUrl&wbtreeid=1049",
  85. "http://swdx.np.gov.cn/cms/sitemanage/index.shtml?siteId=100422578395950000&page=1",
  86. "http://www.gdsgznx.com/xwgk/Index.html",
  87. "http://www.nj13zhs.cn/zhxyjz/articlelist.aspx?id=14&page=1",
  88. "http://www.99snsn.com/zwgk.html",
  89. "http://kfyy.changde.gov.cn/col/col32819/index.html",
  90. "http://gaj.yanan.gov.cn/info/iList.jsp?cat_id=10194",
  91. "http://www.yanhe.gov.cn/xwzx/tzgg/index.html",
  92. "http://www.fdfzjt.com/announcement/2",
  93. "http://www.xagj.com.cn/item?pid=712a95df750c40a48854e25c95bfaa65",
  94. "http://www.xasrlgs.com/index/news/index/id/3.html",
  95. "http://www.xazls.com//zbpt/index.htm",
  96. "http://www.xazls.com//gsgg/index.htm",
  97. "http://www.aspd.gov.cn/zwgk/zdlygk/ggzypz/ggzyjy/",
  98. "http://www.aspd.gov.cn/xwzx/tzgg/index.html",
  99. "http://www.anshunjkq.gov.cn/xxgk/xxgkml/zdlyxxgk/zfcg/zbgg/list.html",
  100. "http://www.anshunjkq.gov.cn/xxgk/xxgkml/zdlyxxgk/ggzypz/list.html",
  101. "http://jtj.anshun.gov.cn/jtxw/tzgg/index.html",
  102. "http://nyncj.anshun.gov.cn/gzdt/tzgg/",
  103. "http://wgl.anshun.gov.cn/gzdt/tzgg/index.html",
  104. "http://hgsgwh.anshun.gov.cn/xxgk/xxgkml/zdlyxx/zfcg/zbgg/",
  105. "http://www.anshunjkq.gov.cn/xxgk/xxgkml/zdlyxxgk/zfcg/zbgg/list.html",
  106. "http://www.gzzn.gov.cn/xxgk/zxgk/tzgg_40284/index.html",
  107. "http://www.gzsmzmuseum.cn/list-7.html",
  108. "http://dsjw.guiyang.gov.cn/c9576/index.html",
  109. "http://gaj.guiyang.gov.cn/jszx/tzgg/jtgll/",
  110. "http://jw.guiyang.gov.cn/c8346/",
  111. "http://jyj.guiyang.gov.cn/pd_jydt/node_2948.htm?current=2951",
  112. "http://kjj.guiyang.gov.cn/a/xxgk/tzgg/list_25_1.html",
  113. "http://nync.guiyang.gov.cn/c8121/",
  114. "http://rfb.guiyang.gov.cn/c6686/index.html",
  115. "http://gzw.guiyang.gov.cn/c7905/",
  116. "http://sfj.guiyang.gov.cn/zwpd/list_infor_dtxx.aspx?tid=7&lx=2&pg=0",
  117. "http://tyj.guiyang.gov.cn/c17124/",
  118. "http://wjw.guiyang.gov.cn/c7124/",
  119. "http://gsj.guiyang.gov.cn/tzgg.jsp?syscode=AE01&id=11",
  120. "http://tzcj.guiyang.gov.cn/c8588/",
  121. "http://www.gzspm.com/new.asp?anclassid=4",
  122. "http://snyncj.gzlps.gov.cn/gzdt_42339/gsgg/index.html",
  123. "http://rfb.gzlps.gov.cn/gzdt/tzgg/index.html",
  124. "http://mwr.gzlps.gov.cn/gzdt/tzgg/index.html",
  125. "http://lpstsjyxx.chinayunnet.com/xwgk/tzgg/index.html",
  126. "http://swjj.gzlps.gov.cn/gzdt/tzgg/index.html",
  127. "http://www.bjq.gov.cn/xxgk/zfxxgkml/zdxxgk/ggzypz/zfcg_57661/cggg/",
  128. "http://www.bjq.gov.cn/xwzx/tzgg/index.html",
  129. "http://www.shiqian.gov.cn/zwgk/xxgkml/zdlygk/tzgg/list.html",
  130. "http://www.shiqian.gov.cn/zwgk/xxgkml/zdlygk/ggzypz/list.html",
  131. "http://www.trws.gov.cn/zwgk/xxgkml/zdlygk/ggzypz/zbgg/list.html",
  132. "http://www.trws.gov.cn/xwzx/tzgg/index.html",
  133. "http://www.dalong.gov.cn/html/zhengwugongkai/tongzhigonggao/index.html",
  134. "http://www.trs.gov.cn/xxgk/zdlygk/zfcg/zbgg_59709/index.html",
  135. "http://www.trs.gov.cn/xwzx/tzgg/gsgg/index.html",
  136. "http://www.yanhe.gov.cn/xwzx/tzgg/index.html",
  137. "http://www.yinjiang.gov.cn/xwzx/tzgg/index.html",
  138. "http://www.yinjiang.gov.cn/xxgk/zdxxgk/ggzypz/tdzy/index.html",
  139. "http://www.yinjiang.gov.cn/xxgk/zdxxgk/zdjsxm/zbtb/",
  140. "http://www.yuping.gov.cn/zwgk/xxgkml/zdlygk/zfcg/zbgg/",
  141. "http://www.yuping.gov.cn/zwgk/xxgkml/zdlygk/zdjsxm/ztb/",
  142. "http://www.gzcz.gov.cn/xwzx/tzgg/index.html",
  143. "http://jr.guizhou.gov.cn/tzgg/index.html",
  144. "http://fpb.guizhou.gov.cn/xwzx/tzgg/index.html",
  145. "http://www.gzcoop.gov.cn/xwzx/tzgg/",
  146. "http://www.gzsjyt.gov.cn/xwzx/tzgg/index.html",
  147. "http://kjt.gzst.gov.cn/xwzx/tzgg_73876/index.html",
  148. "http://www.gzaas.org.cn/xxgk/zdgk/tzgg/",
  149. "http://rfb.guizhou.gov.cn/xwzx/tzgg/index.html",
  150. "http://www.suiyang.gov.cn/xwzx/tzgg/index.html",
  151. "http://daj.zunyi.gov.cn/gzdt/tzgg/",
  152. "http://www.zyredcross.cn/news-1-1-0.html",
  153. "http://www.zysjwssc.com/emall/zunyi/jingjiaInfo.aspx",
  154. "http://rsj.zunyi.gov.cn/web/12731/index.html",
  155. "http://zyepb.zunyi.gov.cn/news34/news_more.asp?page=1&word=&lm=&lm2=340&lmname=&open=&n=&hot=&tj=",
  156. "http://scjgj.zunyi.gov.cn/gsgg/qt/",
  157. "http://mts.zmu.edu.cn/tzxw/tzgg.htm",
  158. "http://www.chidi.com.cn/col/col6870/index.html",
  159. "http://www.chidi.com.cn/col/col6872/index.html",
  160. "http://mpnr.chengdu.gov.cn/second/zpgjg.aspx?ClassID=001002002006001",
  161. "http://mpnr.chengdu.gov.cn/List.aspx?ClassID=001002001003",
  162. "http://gzw.chengdu.gov.cn/cdgzw/c107965/list.shtml",
  163. "http://www.cfyy.net/tender_sub/",
  164. "http://www.hxdental.cn/news/bid/",
  165. "http://www.chinawestagr.com/homepage/list_aff.asp",
  166. "http://sctcm.sc.gov.cn/get/class/scszyyglj/gggs/index.html",
  167. "http://www.dzzjcs.com/jyxx/tradeInfo.html",
  168. "http://gzw.deyang.gov.cn/list.asp?id=1&smallid=248&bname=%E7%BB%BC%E5%90%88%E4%BF%A1%E6%81%AF&Sname=%E9%80%9A%E7%9F%A5%E9%80%9A%E5%91%8A",
  169. "http://www.zjxzyyy.cn/plus/list.php?tid=45",
  170. "http://www.scdl.gov.cn/xwzx/tzgg.htm",
  171. "http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm",
  172. "http://www.rs.gov.cn/xxgk/gsgg.htm",
  173. "http://www.ncjttz.com/index.php?m=content&c=index&a=lists&catid=18",
  174. "http://gsj.nanchong.gov.cn/portal/123",
  175. "http://www.xichong.gov.cn/news/notice/department/index.html",
  176. "http://www.xichong.gov.cn/catalog/378/index.html",
  177. "http://www.xichong.gov.cn/catalog/428/index.html",
  178. "http://gzw.panzhihua.gov.cn/zwgk/tzgg/index.shtml",
  179. "http://ctel.invest.com.cn/news/bid/index_1.html",
  180. "http://nync.guiyang.gov.cn/c8121/",
  181. "http://www.scjianke.com/news1.aspx?t=38",
  182. "http://www.sc2zyy.com/list.asp?id=9",
  183. "http://www.cd7yy.com/news/news-18p1.html?tdsourcetag=s_pctim_aiomsg",
  184. "http://jw.guiyang.gov.cn/c8346/",
  185. "http://www.xacbdc.com/index.php?s=zbzs&c=category&id=1",
  186. "http://gaj.yanan.gov.cn/info/iList.jsp?cat_id=10194",
  187. "http://www.gzsjyt.gov.cn/xwzx/tzgg/index.html",
  188. "http://ztbgl.yangtzeu.edu.cn/hwcg.htm?tdsourcetag=s_pctim_aiomsg",
  189. "http://www.ncct.cc/html/list_11.html",
  190. "http://www.zqdh.gov.cn/gzjg/zqsdhqzfhcxjsj/zbtb/",
  191. "http://www.fucai.cn/art/gsgg/",
  192. "http://www.gscq.com.cn/index.php?s=xm&c=category&id=1",
  193. "http://yh.yali.edu.cn/index.php?m=web&c=list&id=20&pid=0",
  194. "http://www.cscjedu.com/News/Module.aspx?id=c5d5fc0b-70d6-412f-a81c-2ed45c1dd3e5",
  195. "http://www.csssyxx.com/gonewlist?columnId=36",
  196. "http://www.cstsjy.cn/tongzhigonggao",
  197. "http://www.hbhtzx.com/view-18.html",
  198. "http://yh.yali.edu.cn/index.php?m=web&c=list&id=20&pid=0",
  199. "http://www.cscjedu.com/News/Module.aspx?id=c5d5fc0b-70d6-412f-a81c-2ed45c1dd3e5",
  200. "http://www.csssyxx.com/gonewlist?columnId=36",
  201. "http://www.cstsjy.cn/tongzhigonggao",
  202. "http://www.hbhtzx.com/view-18.html",
  203. "http://www.yalisy.cn/xxgg",
  204. "http://www.hiec.cn/e/action/ListInfo/?classid=25",
  205. "http://www.hsdlzx.net/a/xiaowugongkai/tongzhigonggao/",
  206. "http://www.zhounanshiyan.com/gongg/xwgk/",
  207. "https://cyyzxxx.30edu.com.cn/Article/f29e0d5e-04f7-68cf-26bd-d61532162df8/",
  208. "http://www.myfls.com.cn/pcweb/article_list/DynamicNotice.htm",
  209. "http://www.wenshang.gov.cn/module/xxgk/search.jsp?divid=div23663&infotypeId=27030301&jdid=104&area=&sortfield=createdatetime:0,orderid:0",
  210. "http://www.liangshan.gov.cn/module/xxgk/search.jsp?divid=div32242&infotypeId=LSA09100403&jdid=112&area=&currpage=1",
  211. "http://www.csx.gov.cn/csxjiaoyuju/xxgk8592/ztb23/",
  212. "http://www.sdde.gov.cn/ywdt/gggs/",
  213. "http://www.ahjzyy.cn/Nav_wangshang.asp?SS_ID=85",
  214. "http://www.ahjzyy.cn/SortHtml/1/3465299358.html",
  215. "http://www.sdde.gov.cn/dexxgk/xzfbm/daxnyj/",
  216. "http://www.sdde.gov.cn/dexxgk/xzfbm/daxzjj/",
  217. "http://www.yutai.gov.cn/module/xxgk/search.jsp?divid=div55&infotypeId=YTA060201&jdid=111&area=&currpage=1",
  218. "http://lxxxgk.bozhou.gov.cn/opennessTarget/?branch_id=53fe9198cbb812e0d509f771&column_code=170302&page=1",
  219. "http://www.eszwdx.com/index.php?m=content&c=index&a=lists&catid=17",
  220. "http://www.hbei.com.cn/news/mess/",
  221. "http://pl.km.gov.cn/qsbmsz/qzfgbm/qwsj/tzgg/",
  222. "http://gd.km.gov.cn/zfxxgkml/zdlyxxgk/czzjxx/zfcg/",
  223. "http://jn.km.gov.cn/jrjnol/gsggol/",
  224. "http://yl.km.gov.cn/tzgg/",
  225. "http://gbdsj.gd.gov.cn/zwgk/zfcg/index.html",
  226. "https://www.glutnn.cn/article.aspx?classid=7&page=1",
  227. "http://safety.lyg.gov.cn/tzgg/tzgg.html",
  228. "http://www.fenxi.gov.cn/channels/2300.html",
  229. "http://www.hatjy.com/index.php?m=content&c=index&a=lists&catid=197",
  230. "http://ggzy.wulanchabu.gov.cn/jyxx/jsgczbhxrgs",
  231. "http://ledong.hainan.gov.cn/ledong/0400/right.shtml",
  232. "http://xdp.shuanghui.net:8010/webportal/index/bidnotice/list/2.do",
  233. "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006002/aboutsub.html",
  234. "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006001/aboutsub.html",
  235. "http://www.icbc.com.cn/ICBC/%E6%B1%9F%E8%8B%8F%E5%88%86%E8%A1%8C/%E6%9C%80%E6%96%B0%E4%B8%9A%E5%8A%A1/%E9%9B%86%E4%B8%AD%E9%87%87%E8%B4%AD%E4%BF%A1%E6%81%AF%E5%85%AC%E5%BC%80/",
  236. "http://tiyuju.tangshan.gov.cn/tiyu/tongzhigonggao_tyj/",
  237. "http://www.jj.gov.cn/col/col1326300/",
  238. "http://www.jj.gov.cn/col/col1375711/",
  239. "http://zrghj.sjz.gov.cn/sjz/gsgg/tdsc/",
  240. "http://tiyuju.tangshan.gov.cn/tiyu/tongzhigonggao_tyj/",
  241. "http://jkq.nanning.gov.cn/html/xxgkml/ggzypz/kyqcr/",
  242. "http://jgswj.jiaxing.gov.cn/col/col1537381/",
  243. "http://www.hbxl.gov.cn/info/index.jsp?id=293&name=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&type=DIMMENSION_A",
  244. "http://zrzyhghj.jiaxing.gov.cn/col/col1541584/",
  245. "http://jtj.lf.gov.cn/main.php?action=displaymore&s=133&p=1",
  246. "http://ztbgl.yangtzeu.edu.cn/hwcg.htm",
  247. "http://ztbgl.yangtzeu.edu.cn/index/yx_bm_cg.htm",
  248. "http://zbb.lzu.edu.cn/lzupage/B20160106020352.html",
  249. "http://zbb.lzu.edu.cn/lzupage/B20160106020426.html",
  250. "http://ggzyjy.yanbian.gov.cn/jyxx/005005/005005002/aboutsub.html",
  251. "http://www.gdzyy.cn/xinxi/zhaobiao/",
  252. "https://www.glutnn.cn/article.aspx?classid=7&page=1",
  253. "http://news.hict.org.cn/Html/xiaonagonggao/",
  254. "http://www.tjbhb.com/bhyhww/jzcg/zjgg/index.html",
  255. "http://www.zzsfybjy.com/index.php?m=content&c=index&a=lists&catid=166",
  256. "http://oa.xjtu.edu.cn/zxgg_index.jsp",
  257. "http://www.sdjxxrmyy.com/list/?3_1.html",
  258. "http://v8033290.72119.30la.com.cn/Article/ShowClass.asp?ClassID=27&page=1",
  259. "http://www.hljzy.org.cn/ten?page",
  260. "http://www.sgsyy.com/notice/",
  261. "http://www.hnsyzxyy.com/news/zhaobiao/",
  262. "http://scjgj.qinghai.gov.cn/channel/gg/index.htm",
  263. "http://www.hljnkzyy.org.cn/index.php?p=news_list&lanmu=12",
  264. "http://www.xjxrmyy.net/Home/InformationNote1",
  265. "http://www.taszlyy.com/findTypeByTypeIdIndex.do?typeid=444",
  266. "https://www.szcp.com/Roam/Announce/List_17.html",
  267. "http://www.lnmu3h.com/plus/list.php?tid=340",
  268. "http://27.24.159.155/gz/list.jsp?a6t=5&a6p=1&a6c=10&urltype=tree.TreeTempUrl&wbtreeid=1019",
  269. "http://www.cqyfyl.cn/list/535",
  270. "http://www.pzhzxy66.com/article/lists/category/zbgg.html",
  271. "http://www.cha.org.cn/plus/list.php?tid=68",
  272. "http://news.hict.org.cn/Html/xiaonagonggao/",
  273. "http://www.cinda.com.cn/xdjt/xdjtpd/syhzjh/list.shtml",
  274. "http://jr.chengdu.gov.cn/jinrongban/c139013/list.shtml",
  275. "http://cocenter.casicloud.com/xcl/searchRelease.ht",
  276. "http://www.hbfxly.org/fenlei/?idh=443",
  277. "http://jyj.changsha.gov.cn/zwgk/czxx/",
  278. "http://sfj.bozhou.gov.cn/content/channel/59278c4aceab064621611981/",
  279. "http://yyyzyy.cn/xin-wen-zhong-xin/xin-xi-gong-gao",
  280. "http://www.motmti.cn/tzgg/index.jhtml",
  281. "http://www.yhwgyxx.cn/html/yhwx/list.html?id=%E9%80%9A%E7%9F%A5%E5%85%AC%E5%91%8A&pageNo=1&pageSize=10",
  282. "http://hunan.chinatax.gov.cn/sy/lists/20190719078813",
  283. "http://scjg.sjz.gov.cn/col/1490159811930/",
  284. "http://www.youyang.gov.cn/html/xxgk/gcjslyxxgk/xmjszb/",
  285. "http://hunan.chinatax.gov.cn/yi/county/20190719078353/lists/20190719078446",
  286. "http://www.bestzx.net/index.php/welcome/article/4/176",
  287. "http://ezszy.hbfy.gov.cn/DocManage/getDocsByFolder?folderNo=0502",
  288. "http://shbj.leshan.gov.cn/SiteHuanbaoju/List.aspx?acID=7",
  289. "http://www.zjsjtysj.gov.cn/Class.asp?ID=99&page=1",
  290. "http://www.gsbtn96333.com.cn/news-41-1.html",
  291. "http://www.wusheng.gov.cn/wsxrmzf/c100460/list.shtml",
  292. "http://tzwjjq.zjtz.gov.cn/col/col25734/index.html",
  293. "http://www.bestzx.net/index.php/welcome/article/4/176",
  294. "http://czj.changde.gov.cn/col/col6503/index.html",
  295. "http://sthjj.changde.gov.cn/col/col7389/index.html",
  296. "http://fgj.changde.gov.cn/col/col27394/index.html",
  297. "http://cdjdw.changde.gov.cn/col/col15666/index.html",
  298. "http://cdgxq.changde.gov.cn/col/col32114/index.html",
  299. "http://swj.jingzhou.gov.cn/z/zhengwugongkai/tongzhigonggao/",
  300. "http://www.lhgtj.gov.cn/article.asp?ClassID=33",
  301. "http://slj.qz.gov.cn/col/col1597457/",
  302. "http://ezszy.hbfy.gov.cn/DocManage/getDocsByFolder?folderNo=0502",
  303. "http://jtj.jiujiang.gov.cn/colB/colB6/",
  304. "http://slj.yq.gov.cn/12693/",
  305. "http://tyj.jiaxing.gov.cn/col/col1591242/",
  306. "http://rfb.jiaxing.gov.cn/col/col1537251/",
  307. "http://www.jxsgxs.cn/zwgk.asp?lmid=11",
  308. "http://gjj.jiaxing.gov.cn/col/col1629879/",
  309. "http://www.qz.gov.cn/col/col1525311/",
  310. "http://www.gsbtn96333.com.cn/news-41-1.html",
  311. "http://ec.gslq.com/portal/list.do?chnlcode=result",
  312. "http://ct.yichun.gov.cn/index.php?s=news&c=category&id=12",
  313. "http://yjglj.beijing.gov.cn/col/col573/index.html#!uid=8268&pageNum=1",
  314. "http://jn.km.gov.cn/jrjnol/gsggol/",
  315. "http://www.hanshou.gov.cn/zwgk/tzgg/",
  316. "http://www.gzdf.gov.cn/14166/14258/14268/index.shtml",
  317. "http://www.jxjaxzf.gov.cn/Category_318/Index.aspx",
  318. "http://www.zhuji.gov.cn/col/col1450965/index.html",
  319. "http://www.zhuji.gov.cn/col/col1453321/index.html",
  320. "http://www.gdd.gov.cn/hp/zfcg/list.shtml",
  321. "http://www.ccx.gov.cn/syscolumn/ztzl/ztbzl/",
  322. "http://www.sjzlq.gov.cn/syscolumn/36/82/index_1.html",
  323. "http://www.longquan.gov.cn/xxgk/bm/758064163/03/zfcg/",
  324. "http://www.shengsi.gov.cn/col/col1354811/",
  325. "http://www.shengsi.gov.cn/col/col1354812/",
  326. "http://www.zj-xd.cn/news/class/?106.html",
  327. "http://www.gdveren.com/index.php?controller=List&action=index&id=50&uid=49",
  328. "http://dct.jiangxi.gov.cn/col/col14522/index.html",
  329. "http://www.jxxgj.gov.cn/xxgk_1/ztbxx/",
  330. "http://www.fjrtvu.edu.cn/xxgk1/zbcg.htm",
  331. "http://www.nj13zhs.cn/zhxyjz/articlelist.aspx?id=14&page=1",
  332. "http://rmyy.maoming.gov.cn/index-24.html",
  333. "http://nfzxy.com/yydt/zbxm/",
  334. "http://www.sqswdx.cn/Category_36/Index.aspx",
  335. "http://zcc.lcu.edu.cn/zbcg/index.htm",
  336. "http://zcc.lcu.edu.cn/zccz/index.htm",
  337. "http://wyglzx.lcu.edu.cn/dtgg/index.htm",
  338. "http://www.jxhg510.com/jxadmin/news_more.asp?page=1&word=&lm=&lm2=98&lmname=&open=_blank&n=&hot=0&tj=0",
  339. "http://www.hbsyxx.cn/Item/list.asp?id=1563&page=1",
  340. "http://cyxxgk.jmu.edu.cn/gkml/cggg.htm"]
  341. _sum = 0
  342. _count = 0
  343. ''' '''
  344. # with codecs.open("errorLink.txt","r",encoding="utf8") as f:
  345. # while(True):
  346. # line = f.readline().strip()
  347. # if not line:
  348. # break
  349. #
  350. # a = time.time()
  351. # # user = {"listpage_url":list_url[0]}
  352. # user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
  353. # #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
  354. # _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
  355. # resp_json = _resp.content.decode("utf-8")
  356. # _resp = json.loads(resp_json)
  357. # print(resp_json)
  358. # _sum += 1
  359. # if "flag" in _resp and _resp["flag"]:
  360. # _count += 1
  361. # print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
  362. # print(_count,_sum)
  363. def get_rs(url):
  364. user = {"listpage_url": url}
  365. _resp = requests.post("http://192.168.2.177:15015/content_extract", json=user, verify=True) #127.0.0.1 177
  366. resp_json = _resp.content.decode("utf-8")
  367. return resp_json
  368. # _resp = json.loads(resp_json)
  369. # print(resp_json)
  370. # print(_resp)
  371. # url = 'http://www.clrmyy.com/Newslist/NewsList.aspx?code=ZPXX'
  372. # url = 'http://ec.chongchi.com.cn:8080/Ec468Web/ysxjcggg.jsp' # 列表页太长 js 溢出 #已设置超时
  373. # url = 'https://tyj.huangshan.gov.cn/content/column/6794951?pageIndex=1'
  374. # url = 'http://www.yangdong.gov.cn/xwzx/gggs/index.html' # 获取详情页报错
  375. # url = 'https://www.guit.edu.cn/xwzx/tzgg.htm ' # 日志报错
  376. # rs = get_rs(url)
  377. # print(rs)
  378. # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # 翻页提取失败
  379. # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
  380. url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
  381. # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复
  382. # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
  383. # url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # 所有要素提取失败, bug 已修复
  384. # url = 'http://hsxzwgk.mas.gov.cn/opennessTarget/?branch_id=57a3df762c262ea9a00aadae&column_code=280200' #主页提取失败 #网页打不开# 404
  385. # url = 'http://www.crra.org.cn/news/tongzhi/o1' # 执行js完毕 getRule_A_Date done 后卡住 已修复
  386. # url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # 翻页超时错误 已修复 提取正常
  387. # # url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常 重新提取 #翻页链接不匹配##下一页规则未获取#
  388. # # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢,有时正常
  389. # # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #列表页规则未获取# 网页打开报错 504
  390. #
  391. # # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复 列表页xpath预测错误
  392. # # url = 'http://sz.nxeduyun.com/index.php?r=space/school/portal/content/index&sid=6ce9765e85694be7838c7f7272199346&cid=50160' #列表页获取失败 已修复
  393. # # url = 'https://www.nbzjy.cn/list-gsgg.html' # #列表页规则未获取# 已解决
  394. # # url = 'http://www.gdhealth.net.cn/index.php?m=content&c=index&a=lists&catid=38' # # #列表页规则未获取# chome浏览器打开异常 换另一个浏览器正常
  395. # # url = 'http://www.kbs.gov.cn/ywdt/tzgg/index.html' #列表页规则未获取# iframe报错 已处理
  396. # # url = 'http://www.xs9z.com/News.asp?PageNo=1&classid=17' #包含iframe 报错 已处理
  397. # # url = 'http://www.tdxbmj.cn/html/qyxw1/index.html' #列表页规则未获取# 已优化处理,详情页时间没日期报错,标签id重复导致只提取到一个链接
  398. # # url = 'http://www.sxsltlyy.com/newslist.php?cid=29' # 列表页获取失败,详情页xpath错误 浏览器打开界面与selenium 的不一样 ua问题已修复
  399. # # url = 'http://view.landtz.com:8092/jj/index' # #列表页规则未获取# 拍卖多个图标纵向列表 content_xpath of listpage is //*[@class="wp"]/div[2]/div[1]/a[1]/div[2] 预测错误
  400. # # url = 'http://www.hbbidcloud.cn/suizhou/jyxx/004003/004003006/about.html' # #翻页链接不匹配##下一页规则未获取# 网页本身无翻页机制
  401. # # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=3' #翻页链接不匹配##下一页规则未获取##详情页列表页区分长度未识别#
  402. # # url = 'https://www.sxeec.com/gpgg/p4.html' ##翻页链接不匹配##下一页规则未获取# 下一页在标签<i>,链接在父节点<a>标签
  403. # # url = 'http://sthjj.liaoyuan.gov.cn/xxgk/tzgg/' #翻页链接不匹配 第二页开始规律 翻页超时导致拿不到翻页规则 无头模式打开网页超时, 正常模式不超时
  404. # # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/index_3.html' #翻页链接不匹配
  405. # # url = 'http://bj.sxggzyjy.cn/jydt/001001/001001004/001001004001/subPage.html' #翻页链接不匹配##下一页规则未获取#
  406. # # url = 'http://www.tlgljs.com/cpzs.html'
  407. # # url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
  408. # # url = 'http://www.zqcyl.cn/zlzx/ggl/' #抛出异常导致返回结果失败,
  409. # # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3'
  410. # # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=1'
  411. # # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
  412. # # url = 'http://www.sxeec.com/gpgg.html'
  413. # url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
  414. # url = 'http://bbkx.bb.ah.cn/kxxw/tzgg/index.html'
  415. # url = 'http://www.lzwhg.com/tongzhigonggao/'
  416. # url = 'http://www.slwr.gov.cn/zfxxgk/gkml/216/240/257/list_640.htm' # 列表页脚本异常
  417. # url = 'http://view.landtz.com:8091/xh/index?resourceStatus=0&useType=&orderBy=0&title='
  418. # url = 'http://ggzy.yueqing.gov.cn/yqwebnew/jyxx/001009/001009010/'
  419. # url = 'http://ggzy.xjbt.gov.cn/TPFront/bt5/083003/083003002/083003002006/'
  420. # url = 'http://www.longmen.gov.cn/xzfbm/xcl/zwgk/bmwj/tzgg/index.html'
  421. # url = 'http://nyncj.yq.gov.cn/tzgg/'
  422. url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
  423. url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
  424. url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
  425. url = 'http://www.lzwhg.com/tongzhigonggao/' #翻页失败
  426. rs = get_rs(url)
  427. print(rs)
  428. import pandas as pd
  429. import time
  430. l = []
  431. def get_url_root(text):
  432. url = re.search('https?:[a-z0-9-./]+\.(cn|com|org|net|gov|edu|biz|cc|mil|top|pub|info)', text)
  433. if url:
  434. return url.group(0)
  435. else:
  436. return ''
  437. def get_url(text):
  438. try:
  439. url = json.loads(text).get('ruleLink', '')
  440. return url
  441. except:
  442. print('CRAWLER_LINK json loads 出错:', text)
  443. return ''
  444. # df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8.csv')[:]
  445. # df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
  446. # df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
  447. df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
  448. # df.drop_duplicates(subset=['首页网址'], inplace=True)
  449. #
  450. # df['url_root'] = df['CRAWLER_LINK '].apply(lambda x:get_url_root(x))
  451. # df['url'] = df['CRAWLER_LINK '].apply(lambda x:get_url(x))
  452. # df = df[df['url']!=""]
  453. # print(len(df))
  454. # df.drop_duplicates(subset=['url_root'], inplace=True)
  455. # print(len(df))
  456. # df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
  457. # # df = df[100:200]
  458. df.reset_index(drop=True, inplace=True)
  459. print(len(df), df.columns)
  460. t0 = time.time()
  461. for i in df.index:
  462. # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
  463. # continue
  464. t1 = time.time()
  465. # url = df.loc[i, 'url']
  466. url = df.loc[i, '列表页链接']
  467. if not re.match('http', url):
  468. l.append('')
  469. print(url)
  470. continue
  471. print(url)
  472. rs = get_rs(url)
  473. # try:
  474. # url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
  475. # print(url)
  476. # rs = get_rs(url)
  477. # except:
  478. # rs = json.dumps({'err_msg': 'json loads link error'})
  479. print('耗时:', time.time()-t1)
  480. print(rs)
  481. l.append(rs)
  482. df['rs3'] = pd.Series(l)
  483. print('完成,总耗时:', time.time()-t0)
  484. # # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
  485. # df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
  486. print('写入完成,总耗时:', time.time()-t0)
  487. # #