# extractor.py
'''
Created on 2019-08-19
@author: User
'''
  5. from module import predictor
  6. from module.listpage.content import featureEngine
  7. from module.listpage.pageTurn import engine
  8. import module.htmlDrawing as hd
  9. from module.Utils import mergeDict,log,add_err_msg,error, debug
  10. import re
# Module-level predictor instance, constructed once at import time and used by
# getRule_listpage() to choose the index of the list-page content candidate.
listpageContentPredictor = predictor.ListpageContentPredictor()
  12. def getRecognize_detail_listpage(list_listpage_url,list_detail_hrefs):
  13. try:
  14. pattern = "([/&=\.\|\?\-_\d]+)"
  15. rule_recog = {}
  16. set_recog_listpage = set()
  17. set_recog_listpage_common = set()
  18. set_recog_detail = set()
  19. set_recog_detail_common = set()
  20. set_length_listpage = set()
  21. set_length_detail = set()
  22. for _url in list_listpage_url:
  23. set_temp = set()
  24. for _split in re.split(pattern, _url):
  25. set_temp.add(_split)
  26. set_recog_listpage = set_recog_listpage | set_temp
  27. if len(set_recog_listpage_common)==0:
  28. set_recog_listpage_common = set_temp
  29. else:
  30. set_recog_listpage_common = set_recog_listpage_common & set_temp
  31. set_length_listpage.add(len(_url))
  32. for _url in list_detail_hrefs:
  33. if _url is None:
  34. continue
  35. set_temp = set()
  36. for _split in re.split(pattern, _url):
  37. set_temp.add(_split)
  38. set_recog_detail = set_recog_detail | set_temp
  39. if len(set_recog_detail_common)==0:
  40. set_recog_detail_common = set_temp
  41. else:
  42. set_recog_detail_common = set_recog_detail_common & set_temp
  43. set_length_detail.add(len(_url))
  44. rule_recog["recog_pattern"] = pattern
  45. set_recog_common = set_recog_listpage & set_recog_detail
  46. list_recog_listpage = list(set_recog_listpage_common-set_recog_common)
  47. if len(list_recog_listpage)>0:
  48. rule_recog["recog_listpage"] = list_recog_listpage
  49. else:
  50. rule_recog["recog_listpage"] = None
  51. list_recog_detail = list(set_recog_detail_common-set_recog_common)
  52. if len(list_recog_detail)>0:
  53. rule_recog["recog_detail"] = list_recog_detail
  54. else:
  55. rule_recog["recog_detail"] = None
  56. _recog1 = True
  57. if rule_recog["recog_listpage"] is None and rule_recog["recog_detail"] is None:
  58. add_err_msg(rule_recog, "#详情页列表页区分字符串未识别#")
  59. _recog1 = False
  60. _recog2 = False
  61. if len(set_length_listpage)==1 and len(set_length_detail)==1 and abs(list(set_length_listpage)[0]-list(set_length_detail)[0])>1:
  62. _recog2 = True
  63. rule_recog["recog_length"] = [list(set_length_detail)[0],list(set_length_listpage)[0]]
  64. else:
  65. rule_recog["recog_length"] = None
  66. add_err_msg(rule_recog, "#详情页列表页区分长度未识别#")
  67. rule_recog["flag"] = _recog1 or _recog2
  68. #rule_recog["flag"] = _recog1
  69. return rule_recog
  70. except Exception as e:
  71. error(str(e))
  72. def getRule_listpage(listpage_url,try_times=3):
  73. for i in range(try_times):
  74. browser = hd.getdriver()
  75. debug("get driver")
  76. loadsuccess = hd.loadPage(browser, listpage_url)
  77. if not loadsuccess:
  78. log('加载列表主页失败, 重新请求网页。')
  79. continue
  80. log('准备执行获取列表页内容标签脚本')
  81. # with open('d:/html/home_page.html', 'w', encoding='utf-8') as f:
  82. # f.write(browser.page_source)
  83. data_listpage = featureEngine.getInput_byJS(browser,listpage_url,"")
  84. log('获取列表页内容标签成功')
  85. #print(browser.page_source)
  86. # hd.adddriver(browser)
  87. # debug("release driver")
  88. if data_listpage is not None:
  89. x,_,list_xpath = data_listpage
  90. _index = listpageContentPredictor.predict(x)
  91. log('模型预测列表页标签完毕')
  92. if len(list_xpath[_index])>0:
  93. content_xpath = list_xpath[_index][0]
  94. #content_xpath = "/html"
  95. log("the content_xpath of listpage is "+str(content_xpath))
  96. data_rule = featureEngine.getRule_A_Date(browser,listpage_url,content_xpath)
  97. log('执行脚本获取列表页链接及日期完毕')
  98. if data_rule is not None:
  99. dict_rule_A_Date,list_hrefs = data_rule
  100. # if dict_rule_A_Date.get('flag', '') == False:
  101. # return None
  102. # browser = hd.getdriver()
  103. # debug("get driver")
  104. log('begin getTurnRule')
  105. turn_data = engine.getTurnRule(browser,listpage_url)
  106. log('获取翻页内容完毕')
  107. # hd.adddriver(browser)
  108. # debug("release driver")
  109. dict_rule_pageTurn,list_listpage_url = turn_data
  110. dict_rule_recog = getRecognize_detail_listpage(list_listpage_url, list_hrefs)
  111. log('解析列表页规则完毕')
  112. hd.adddriver(browser)
  113. debug("release driver")
  114. return mergeDict([dict_rule_A_Date,dict_rule_pageTurn,dict_rule_recog]),list_hrefs
  115. hd.adddriver(browser)
  116. debug("release driver")
  117. return None
  118. if __name__=="__main__":
  119. listpage_url = "http://www.qyggfw.cn/w/bid/qualiInqueryResult/morePageList?filterparam=%7B%22assortment%22%3A%223%22%2C%22areaCode%22%3A%22621000%22%2C%22workNotice%22%3A%7B%22noticeNature%22%3A%221%22%2C%22bulletinType%22%3A%221%22%7D%7D"
  120. data = getRule_listpage(listpage_url)[0]
  121. for item in data.keys():
  122. print(item,data[item])