extractor.py

'''
Created on 2019-08-19
@author: User
'''
import re  # used for the attachment regex in getRule_detail

from module import predictor
from module.detail.content import featureEngine as featureEngine_content
from module.detail.title import featureEngine as featureEngine_title
from module.Utils import mergeDict, getCommonXpath
import numpy as np
import module.htmlDrawing as hd
from module.Utils import *

detailContentPredictor = predictor.DetailContentPredictor()
detailTitlePredictor = predictor.DetailTitlePredictor()
def getCommonXpath_time(data_time):
    # Vote for the date xpath: merge candidates that share an xpath, then pick
    # the one seen most often, breaking ties by distance to the title.
    list_position_list_xpath = []
    for item in data_time:
        _left = item[0]
        _top = item[1]
        _xpath = item[2][0]
        _fontSize = item[3]
        _offset_top_title = item[4]
        _distance = abs(_top - _offset_top_title)
        if _fontSize is None:
            _fontSize = 16
        _find_flag = False
        for _position_list_xpath in list_position_list_xpath:
            if _position_list_xpath[2] == _xpath:
                # merge into the existing candidate: average position and
                # distance, keep the smallest font size, increment the count
                _position_list_xpath[0] = (_position_list_xpath[0] + _left) / 2
                _position_list_xpath[1] = (_position_list_xpath[1] + _top) / 2
                if _fontSize < _position_list_xpath[3]:
                    _position_list_xpath[3] = _fontSize
                _position_list_xpath[4] += 1
                _position_list_xpath[5] = (_position_list_xpath[5] + _distance) / 2
                _find_flag = True
                break
        if not _find_flag:
            list_position_list_xpath.append([_left, _top, _xpath, _fontSize, 1, _distance])
    date_xpath = None
    _max_len = 0
    _max_len_index = None
    _max_distance = 10000
    _score = 0
    for i in range(len(list_position_list_xpath)):
        item = list_position_list_xpath[i]
        # alternative weighted scoring, kept for reference:
        # tmp_score = item[4]*0.8 - item[5]*0.5
        # if tmp_score > _score:
        #     _score = tmp_score
        #     _max_len_index = i
        #     date_xpath = item[2]
        if item[4] > _max_len:
            _max_len = item[4]
            date_xpath = item[2]
            _max_len_index = i
            _max_distance = item[5]
        # if the counts are equal, choose the candidate closest to the title
        elif item[4] == _max_len:
            if item[5] < _max_distance:
                date_xpath = item[2]
                _max_len_index = i
                _max_distance = item[5]
    return date_xpath
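
# A minimal illustration of getCommonXpath_time (hypothetical values): each
# entry of data_time is [left, top, [xpath, ...], fontSize, offset_top_title].
# The xpath that appears most often wins; ties go to the candidate vertically
# closest to the title.
#
#   data_time = [
#       [100, 420, ['/html/body/div[2]/span'], 12, 400],
#       [105, 425, ['/html/body/div[2]/span'], 12, 400],
#       [300, 900, ['/html/body/div[5]/p'], 12, 400],
#   ]
#   getCommonXpath_time(data_time)  # -> '/html/body/div[2]/span'
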
def getRule_detail(list_hrefs, try_times=3, MAX_HREFS=10):
    list_xpaths_content = []
    list_xpath_remove_content = []
    list_data_time = []
    list_xpaths_title = []
    list_title_top = []
    count_hrefs = 0
    dict_rule_detail = dict()
    dict_rule_detail["hasDrew"] = False
    for _url in list_hrefs:
        if _url is None:
            continue
        list_legal_time = []
        _flag = -2
        browser = hd.getdriver()
        debug("get driver")
        loadsucess = hd.loadPage(browser, _url)
        if not loadsucess:
            browser = hd.getdriver()
        # browser.maximize_window()
        flag, data = featureEngine_content.getInput_byJS(browser, _url)
        hasGotten = True
        if flag:
            x, inner_html, list_xpath, data_time = data
            _index = detailContentPredictor.predict(x)
            # check whether the predicted content block contains all attachment links
            pt = r'<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
            total_annex = len(re.findall(pt, browser.page_source))
            extract_annex = len(re.findall(pt, inner_html[_index]))
            if total_annex > extract_annex and _index > 5 and len(list_xpath[_index]) > 0:
                extract_xpath = list_xpath[_index][0][0]
                for i in range(_index - 1, _index - 5, -1):
                    if len(re.findall(pt, inner_html[i])) == total_annex:
                        log('rule adjusted: model-extracted content missed attachments')
                        _index = i
                        break
                    elif len(list_xpath[i]) > 0 and list_xpath[i][0][0] not in extract_xpath:
                        break
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpath_remove_content.append(_xpath)
            for item in data_time:
                list_legal_time.append(item)
            _flag += 1
        else:
            hasGotten = False
            add_err_msg(dict_rule_detail, data)
        flag, data_title = featureEngine_title.getInput_byJS(browser, _url)
        hd.adddriver(browser)
        debug("release driver")
        if flag:
            x, _, list_xpath, list_top = data_title
            log('detail title extracted successfully')
            _index = detailTitlePredictor.predict(x)
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpaths_title.append(_xpath)
            list_title_top.append(list_top[_index])
            # append the title's offset_top to each date candidate
            for i in range(len(list_legal_time)):
                list_legal_time[i].append(list_top[_index])
                list_data_time.append(list_legal_time[i])
            _flag += 1
        else:
            add_err_msg(dict_rule_detail, data_title)
        # _flag == 0 means both content and title were extracted for this url
        if _flag == 0:
            count_hrefs += 1
            if count_hrefs >= MAX_HREFS:
                break
    for item in list_xpath_remove_content:
        _xpath = []
        for _xpath_remove in item:
            _xpath.append(_xpath_remove[0])
        list_xpaths_content.append(_xpath)
    dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
    # intersect the remove-lists of every page whose content xpath matches the common one
    set_remove_list = None
    for item in list_xpath_remove_content:
        for _xpath_remove in item:
            if _xpath_remove[0] == dict_rule_detail["detail_content"]:
                if set_remove_list is None:
                    set_remove_list = set(_xpath_remove[1])
                else:
                    set_remove_list = set(_xpath_remove[1]) & set_remove_list
    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list is not None else []
    dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
    dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)
    # try:
    browser = hd.getdriver()
    debug("get driver")
    if len(list_hrefs) > 0:
        loadsucess = hd.loadPage(browser, list_hrefs[-1])
        log('loadPage: ')
        if not loadsucess:
            browser = hd.getdriver()
        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type": "xpath", "rule": dict_rule_detail["detail_content"]},
                                                                                               {"type": "xpath", "rule": dict_rule_detail["detail_date"]},
                                                                                               {"type": "xpath", "rule": dict_rule_detail["detail_title"]}])
        # shorten over-long xpaths (more than six segments) with the replacement script
        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/")) > 6:
            log("before being replaced xpath of detail_content" + dict_rule_detail["detail_content"])
            # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath, dict_rule_detail["detail_content"])
            dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_content"])
            log("after being replaced xpath of detail_content" + dict_rule_detail["detail_content"])
        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/")) > 6:
            # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath, dict_rule_detail["detail_date"])
            dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_date"])
        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/")) > 6:
            # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath, dict_rule_detail["detail_title"])
            dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_title"])
    # finally:
    hd.adddriver(browser)
    debug("release driver")
    if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
        dict_rule_detail["flag"] = True
    else:
        dict_rule_detail["flag"] = False
        add_err_msg(dict_rule_detail, "#detail rule incomplete#")
    return dict_rule_detail
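
# The returned rule dict has this shape (the values below are illustrative
# placeholders, not real output):
#
#   {
#       "hasDrew": False,
#       "detail_content": "/html/body/div[3]/div[1]",
#       "detail_removeList": [],
#       "detail_date": "/html/body/div[3]/span[2]",
#       "detail_title": "/html/body/div[3]/h1",
#       "flag": True,
#   }
#
# add_err_msg may also attach an error-message field when extraction fails.
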
if __name__ == "__main__":
    list_hrefs = ['http://www.beian.miit.gov.cn/', 'http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=22240102000130', 'http://bszs.conac.cn/sitename?method=show&id=072BBE1E1F301B54E053022819AC1765', 'http://121.43.68.40/exposure/jiucuo.html?site_code=2224000007&url=http%3A%2F%2Fwww.ybzfcg.gov.cn%2Fnet%2Fcgxx.jsp%3Fopstatus%3Dxjgg']
    print(getRule_detail(list_hrefs))