'''
Created on 2019-08-19

@author: User
'''
from module import predictor
from module.listpage.content import featureEngine
from module.listpage.pageTurn import engine
import module.htmlDrawing as hd
from module.Utils import mergeDict,log,add_err_msg,error, debug
import re

listpageContentPredictor = predictor.ListpageContentPredictor()

# URLs are split on runs of separator characters and digits; because the
# pattern has a capturing group, re.split also returns the separator runs
# themselves, so they take part in the token comparison below.
_URL_SPLIT_PATTERN = r"([/&=\.\|\?\-_\d]+)"
_URL_SPLITTER = re.compile(_URL_SPLIT_PATTERN)


def _collect_url_tokens(urls):
    """Tokenize every non-None URL in *urls*.

    Returns a 3-tuple ``(all_tokens, common_tokens, lengths)``:

    * ``all_tokens``    -- union of the tokens of every URL,
    * ``common_tokens`` -- intersection of the tokens of every URL
      (an empty set when there are no URLs),
    * ``lengths``       -- set of the distinct URL string lengths.
    """
    all_tokens = set()
    # None means "no URL seen yet".  An empty set must NOT be used as that
    # marker: an intersection that has legitimately become empty would be
    # re-seeded by the next URL and stop being an intersection.
    common_tokens = None
    lengths = set()
    for url in urls:
        if url is None:
            continue
        tokens = set(_URL_SPLITTER.split(url))
        all_tokens |= tokens
        if common_tokens is None:
            common_tokens = tokens
        else:
            common_tokens = common_tokens & tokens
        lengths.add(len(url))
    if common_tokens is None:
        common_tokens = set()
    return all_tokens, common_tokens, lengths


def getRecognize_detail_listpage(list_listpage_url,list_detail_hrefs):
    """Build a rule that tells list-page URLs apart from detail-page URLs.

    Two independent criteria are computed:

    * tokens: tokens shared by all list-page URLs (resp. all detail URLs)
      that do not occur in both URL groups;
    * length: usable only when every list-page URL has one single length,
      every detail URL has one single length, and the two differ by more
      than 1.

    Args:
        list_listpage_url: iterable of list-page URL strings (None entries
            are skipped).
        list_detail_hrefs: iterable of detail-page URL strings (None entries
            are skipped).

    Returns:
        A dict with the keys ``recog_pattern``, ``recog_listpage``,
        ``recog_detail``, ``recog_length`` and ``flag`` (True when at least
        one criterion is usable); failures of a criterion are recorded via
        ``add_err_msg``.  Returns None if any exception occurs (it is
        reported through ``error``).
    """
    try:
        rule_recog = {}

        listpage_tokens, listpage_common, listpage_lengths = _collect_url_tokens(list_listpage_url)
        detail_tokens, detail_common, detail_lengths = _collect_url_tokens(list_detail_hrefs)

        rule_recog["recog_pattern"] = _URL_SPLIT_PATTERN

        # Tokens seen in both groups cannot discriminate between them.
        shared_tokens = listpage_tokens & detail_tokens
        # sorted() keeps the emitted rule deterministic across runs.
        listpage_marks = sorted(listpage_common - shared_tokens)
        detail_marks = sorted(detail_common - shared_tokens)
        rule_recog["recog_listpage"] = listpage_marks if listpage_marks else None
        rule_recog["recog_detail"] = detail_marks if detail_marks else None

        recognized_by_token = True
        if rule_recog["recog_listpage"] is None and rule_recog["recog_detail"] is None:
            add_err_msg(rule_recog, "#详情页列表页区分字符串未识别#")
            recognized_by_token = False

        recognized_by_length = False
        if len(listpage_lengths) == 1 and len(detail_lengths) == 1:
            listpage_length = next(iter(listpage_lengths))
            detail_length = next(iter(detail_lengths))
            if abs(listpage_length - detail_length) > 1:
                recognized_by_length = True
                rule_recog["recog_length"] = [detail_length, listpage_length]
        if not recognized_by_length:
            rule_recog["recog_length"] = None
            add_err_msg(rule_recog, "#详情页列表页区分长度未识别#")

        rule_recog["flag"] = recognized_by_token or recognized_by_length
        return rule_recog
    except Exception as e:
        error(str(e))
        return None


def _extract_listpage_rule(browser, listpage_url):
    """Run one extraction attempt on an already acquired *browser*.

    Returns ``(merged_rule_dict, list_hrefs)`` on success, or None when the
    page fails to load or any intermediate step yields no data.  The driver
    is neither acquired nor released here; that is the caller's job.
    """
    loadsuccess = hd.loadPage(browser, listpage_url)
    if not loadsuccess:
        log('加载列表主页失败, 重新请求网页。')
        return None

    log('准备执行获取列表页内容标签脚本')
    data_listpage = featureEngine.getInput_byJS(browser,listpage_url,"")
    log('获取列表页内容标签成功')
    if data_listpage is None:
        return None

    x,_,list_xpath = data_listpage
    _index = listpageContentPredictor.predict(x)
    log('模型预测列表页标签完毕')
    if len(list_xpath[_index]) == 0:
        return None

    content_xpath = list_xpath[_index][0]
    log("the content_xpath of listpage is "+str(content_xpath))
    data_rule = featureEngine.getRule_A_Date(browser,listpage_url,content_xpath)
    log('执行脚本获取列表页链接及日期完毕')
    if data_rule is None:
        return None

    dict_rule_A_Date,list_hrefs = data_rule
    log('begin getTurnRule')
    turn_data = engine.getTurnRule(browser,listpage_url)
    log('获取翻页内容完毕')
    dict_rule_pageTurn,list_listpage_url = turn_data
    # NOTE(review): getRecognize_detail_listpage returns None on internal
    # errors; whether mergeDict tolerates a None entry is not visible from
    # this file -- confirm.
    dict_rule_recog = getRecognize_detail_listpage(list_listpage_url, list_hrefs)
    log('解析列表页规则完毕')
    return mergeDict([dict_rule_A_Date,dict_rule_pageTurn,dict_rule_recog]),list_hrefs


def getRule_listpage(listpage_url,try_times=3):
    """Extract the crawling rules of a list page, retrying on failure.

    Each attempt acquires a driver with ``hd.getdriver()``, runs the
    extraction and always hands the driver back with ``hd.adddriver()``
    (also when the page fails to load or an exception is raised), so no
    driver is left unreleased between attempts.

    NOTE(review): a failed attempt of any kind (load failure or a step
    returning no data) moves on to the next attempt -- confirm this matches
    the intended retry policy.

    Args:
        listpage_url: URL of the list page.
        try_times: maximum number of attempts.

    Returns:
        ``(merged_rule_dict, list_hrefs)`` from the first successful
        attempt, or None when every attempt fails.
    """
    for _attempt in range(try_times):
        browser = hd.getdriver()
        debug("get driver")
        try:
            result = _extract_listpage_rule(browser, listpage_url)
        finally:
            hd.adddriver(browser)
            debug("release driver")
        if result is not None:
            return result
    return None


if __name__=="__main__":
    listpage_url = "http://www.qyggfw.cn/w/bid/qualiInqueryResult/morePageList?filterparam=%7B%22assortment%22%3A%223%22%2C%22areaCode%22%3A%22621000%22%2C%22workNotice%22%3A%7B%22noticeNature%22%3A%221%22%2C%22bulletinType%22%3A%221%22%7D%7D"
    result = getRule_listpage(listpage_url)
    # getRule_listpage returns None when every attempt fails; indexing it
    # unconditionally would raise TypeError.
    if result is None:
        print("no rule extracted for", listpage_url)
    else:
        data = result[0]
        for item in data.keys():
            print(item,data[item])