'''
Created on 2019-08-19
@author: User
'''
- from module import predictor
- from module.detail.content import featureEngine as featureEngine_content
- from module.detail.title import featureEngine as featureEngine_title
- from module.Utils import mergeDict,getCommonXpath
- import numpy as np
- import module.htmlDrawing as hd
- from module.Utils import *
# Module-level predictor singletons shared by getRule_detail(); instantiated
# once at import time (presumably because model loading is expensive — TODO confirm).
detailContentPredictor = predictor.DetailContentPredictor()
detailTitlePredictor = predictor.DetailTitlePredictor()
def getCommonXpath_time(data_time):
    """Pick the most plausible publish-date xpath from candidate text nodes.

    Args:
        data_time: iterable of ``[left, top, xpaths, fontSize, offset_top_title]``
            records, one per candidate date node; ``xpaths`` is a sequence whose
            first element is the node's xpath string. ``fontSize`` may be None.

    Returns:
        The xpath seen most often across candidates; ties are broken by the
        smallest (running-average) vertical distance to the page title.
        None when ``data_time`` is empty.
    """
    # Aggregated entries: [avg_left, avg_top, xpath, min_fontSize, count, avg_distance]
    aggregated = []
    for item in data_time:
        _left, _top = item[0], item[1]
        _xpath = item[2][0]
        # Default font size when the page did not report one.
        _fontSize = 16 if item[3] is None else item[3]
        # Vertical distance between this node and the title.
        _distance = abs(_top - item[4])
        for entry in aggregated:
            if entry[2] == _xpath:
                # FIX: the original computed these running averages into locals
                # and discarded them; write them back into the aggregate entry.
                entry[0] = (entry[0] + _left) / 2
                entry[1] = (entry[1] + _top) / 2
                if _fontSize < entry[3]:
                    entry[3] = _fontSize
                entry[4] += 1
                entry[5] = (entry[5] + _distance) / 2
                break
        else:
            aggregated.append([_left, _top, _xpath, _fontSize, 1, _distance])

    date_xpath = None
    _max_count = 0
    _min_distance = 10000
    for entry in aggregated:
        if entry[4] > _max_count:
            _max_count = entry[4]
            date_xpath = entry[2]
            _min_distance = entry[5]
        elif entry[4] == _max_count and entry[5] < _min_distance:
            # Same occurrence count: prefer the candidate closest to the title.
            date_xpath = entry[2]
            _min_distance = entry[5]
    return date_xpath
def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
    """Learn extraction rules (content/title/date xpaths) for detail pages.

    Visits each URL in ``list_hrefs`` with a pooled selenium driver, runs the
    content and title predictors on page features, then aggregates the
    per-page xpaths into one common rule set.

    Args:
        list_hrefs: detail-page URLs to sample; None entries are skipped.
        try_times: unused in this body — presumably kept for caller
            compatibility (TODO confirm against callers).
        MAX_HREFS: stop sampling once this many pages had both content and
            title extraction succeed.

    Returns:
        dict with keys "detail_content", "detail_title", "detail_date",
        "detail_removeList", "hasDrew" and "flag" (True only when content,
        date and title xpaths were all found); error text may be appended
        via add_err_msg().
    """
    list_xpaths_content = []        # per-page lists of content xpaths
    list_xpath_remove_content = []  # per-page [xpath, removeList] pairs
    list_data_time = []             # date candidates across pages, fed to getCommonXpath_time
    list_xpaths_title = []          # per-page lists of title xpaths
    list_title_top = []             # offset_top of the predicted title per page
    count_hrefs = 0                 # pages where both content and title succeeded
    dict_rule_detail = dict()
    dict_rule_detail["hasDrew"] = False

    for _url in list_hrefs:
        if _url is None:
            continue
        list_legal_time = []
        # _flag starts at -2; +1 on content success, +1 on title success,
        # so 0 means both succeeded for this page.
        _flag = -2
        browser = hd.getdriver()
        debug("get driver")
        loadsucess = hd.loadPage(browser, _url)
        if not loadsucess:
            # NOTE(review): on load failure a fresh driver is fetched but the
            # page is not re-loaded with it — confirm this is intentional.
            browser = hd.getdriver()
        # browser.maximize_window()
        flag,data = featureEngine_content.getInput_byJS(browser,_url)
        hasGotten = True
        if flag:
            x,inner_html,list_xpath,data_time = data
            # Index of the predicted content node among the page candidates.
            _index = detailContentPredictor.predict(x)
            # Anchor tags pointing at attachment files (zip/rar/doc/pdf/...).
            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
            total_annex = len(re.findall(pt, browser.page_source))
            extract_annex = len(re.findall(pt, inner_html[_index]))
            # If the predicted node misses attachments, walk up to four earlier
            # candidates looking for one that contains them all.
            if total_annex > extract_annex and _index>5 and len(list_xpath[_index])>0:
                extract_xpath = list_xpath[_index][0][0]
                for i in range(_index-1, _index-5, -1):
                    if len(re.findall(pt, inner_html[i]))== total_annex:
                        log('规格调整模型正文提取附件不完整')  # log: model's content extraction missed attachments; adjusted
                        _index = i
                        break
                    elif len(list_xpath[i])>0 and list_xpath[i][0][0] not in extract_xpath:
                        # Candidate lies outside the predicted node's xpath — stop.
                        break
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpath_remove_content.append(_xpath)

            for item in data_time:
                list_legal_time.append(item)
            _flag += 1
        else:
            hasGotten = False
            add_err_msg(dict_rule_detail, data)
        flag,data_title = featureEngine_title.getInput_byJS(browser,_url)
        # Return the driver to the pool before post-processing this page.
        hd.adddriver(browser)
        debug("release driver")
        if flag:
            x,_,list_xpath,list_top = data_title
            log('详情标题获取成功')  # log: detail title fetched successfully
            _index = detailTitlePredictor.predict(x)
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpaths_title.append(_xpath)
            list_title_top.append(list_top[_index])
            # Append the title's offset_top to each date candidate so
            # getCommonXpath_time can rank by distance to the title.
            for i in range(len(list_legal_time)):
                list_legal_time[i].append(list_top[_index])
                list_data_time.append(list_legal_time[i])
            _flag += 1
        else:
            add_err_msg(dict_rule_detail, data_title)
        if _flag==0:
            count_hrefs += 1
        if count_hrefs>=MAX_HREFS:
            break

    # Flatten the per-page [xpath, removeList] pairs into xpath-only lists.
    for item in list_xpath_remove_content:
        _xpath = []
        for _xpath_remove in item:
            _xpath.append(_xpath_remove[0])
        list_xpaths_content.append(_xpath)
    dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
    # Intersect removeLists of every page whose content xpath matches the common one.
    set_remove_list = None
    for item in list_xpath_remove_content:
        for _xpath_remove in item:
            if _xpath_remove[0]==dict_rule_detail["detail_content"]:
                if set_remove_list is None:
                    set_remove_list = set(_xpath_remove[1])
                else:
                    set_remove_list = set(_xpath_remove[1])&set_remove_list
    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list!=None else []
    dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
    dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)

    # try:
    browser = hd.getdriver()
    debug("get driver")
    if len(list_hrefs)>0:
        loadsucess = hd.loadPage(browser, list_hrefs[-1],)
        log('logPage: ')
        if loadsucess==False:
            # NOTE(review): as above — replacement driver never re-loads the page.
            browser = hd.getdriver()
        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
                                                                                               {"type":"xpath","rule":dict_rule_detail["detail_date"]},
                                                                                               {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
        # Xpaths longer than 6 segments are simplified in-page via scripts_replaceXpath.
        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
            log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
            # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
            dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_content"])
            log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
            # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
            dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_date"])
        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
            # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
            dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_title"])
    # finally:
    hd.adddriver(browser)
    debug("release driver")
    if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
        dict_rule_detail["flag"] = True
    else:
        dict_rule_detail["flag"] = False
        add_err_msg(dict_rule_detail, "#详情规则不完整#")
    return dict_rule_detail
-
if __name__ == "__main__":
    # Smoke-test: derive detail-page rules for a fixed set of sample URLs.
    sample_hrefs = [
        'http://www.beian.miit.gov.cn/',
        'http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=22240102000130',
        'http://bszs.conac.cn/sitename?method=show&id=072BBE1E1F301B54E053022819AC1765',
        'http://121.43.68.40/exposure/jiucuo.html?site_code=2224000007&url=http%3A%2F%2Fwww.ybzfcg.gov.cn%2Fnet%2Fcgxx.jsp%3Fopstatus%3Dxjgg',
    ]
    print(getRule_detail(sample_hrefs))
|