Preprocessing1.py 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853
  1. import re
  2. from bs4 import BeautifulSoup, Comment
  3. import copy
  4. import sys
  5. import os
  6. import time
  7. sys.path.append(os.path.abspath("../.."))
  8. import fool
  9. from BiddingKG.dl.interface.Connection import *
  10. from BiddingKG.dl.common.Utils import *
  11. import BiddingKG.dl.interface.settings as settings
  12. from BiddingKG.dl.interface.Connection import getConnection
  13. from BiddingKG.dl.interface.Entitys import *
  14. from BiddingKG.dl.form.feature import encoding
  15. from BiddingKG.dl.interface.predictor import *
# Module-level singleton: table-header classifier (loads its model once at
# import time); used by tableToText/setHead to score candidate header rows.
formPredictor = FormPredictor()
def tableToText(soup):
    '''
    Convert every <table>/<tbody> in the page into linear text sentences.

    @param:
        soup: BeautifulSoup tree of the page html
    @return: (soup, tables) - the soup with each table's content replaced by
        generated text, plus the list of intermediate cell matrices
        (temporarily returned for debugging, per the "临时修改" notes)
    '''
    # NOTE(review): this body was re-indented from a whitespace-mangled paste;
    # nesting follows the apparent control flow - confirm against VCS history.
    def getTrs(tbody):
        # Collect all <tr> rows, including rows wrapped in a nested <tbody>.
        trs = []
        objs = tbody.find_all(recursive=False)
        for obj in objs:
            if obj.name=="tr":
                trs.append(obj)
            if obj.name=="tbody":
                for tr in obj.find_all("tr",recursive=False):
                    trs.append(tr)
        return trs

    def getTable(tbody):
        # Build a cell matrix plus per-cell colspan/rowspan counts from a table.
        #trs = tbody.findChildren('tr', recursive=False)
        def add_punc(soup):
            # Insert punctuation between block-level nodes inside a cell so the
            # flattened text keeps sentence boundaries.
            commaList = ["p","div","h1", "h2", "h3", "h4", "h5", "h6", "header", "dl", "ul", "label"]
            # Walk every descendant node and insert separators.
            for child in soup.find_all(recursive=True):
                if child.name == 'br':
                    child.insert_before(',')
                child_text = re.sub('\s', '', child.get_text())
                if child_text == '' or child_text[-1] in ['。',',',':',';']:
                    continue
                if child.name in commaList:
                    # Short blocks (<50 chars) get a comma, longer ones a full stop.
                    if len(child_text)>3 and len(child_text) <50:
                        child.insert_after(",")
                    elif len(child_text) >=50:
                        child.insert_after("。")
            return soup
        trs = getTrs(tbody)
        inner_table = []   # matrix of [cell_text, head_flag] pairs (head_flag set later)
        colspan = []       # per-row list of each <td>'s colspan
        rowspan = []       # per-row list of each <td>'s rowspan
        for tr in trs:
            tr_line = []
            tr_col = []
            tr_row = []
            tds = tr.findChildren(['td','th'], recursive=False)
            for td in tds:
                # Cells containing several block elements need punctuation inserted.
                if len(td.find_all(['p','div','br','dl','ul'])) > 2:
                    add_punc(td)
                if 'colspan' in td.attrs and td['colspan'].isdigit():
                    tr_col.append(int(td['colspan']))
                else:
                    tr_col.append(1)
                if 'rowspan' in td.attrs and td['rowspan'].isdigit():
                    tr_row.append(int(td['rowspan']))
                else:
                    tr_row.append(1)
                tr_line.append([re.sub('\s*','',td.get_text()),0])
            if tr_row == []:
                # Row without any <td>/<th>: fall back to the whole <tr> text.
                tr_row.append(1)
                tr_col.append(1)
                tr_line.append([re.sub('\s*','',tr.get_text()),0])
            inner_table.append(tr_line)
            colspan.append(tr_col)
            rowspan.append(tr_row)
        return inner_table, colspan, rowspan

    def fix_rowspan(inner_table, colspan, rowspan):
        # Idea 2 (original note): per row i, if a cell has colspan>1 (and either
        # colspan<3 or colspan<=row width) duplicate it to fill the spanned
        # columns; then for rowspan>1 cells, copy the cell down into row i+1
        # only when the resulting column totals would match.
        def is_same_item(list):
            # True when every element of the list equals the first one.
            flag = True
            item0 = list[0]
            for i in range(1,len(list)):
                if item0 != list[i]:
                    flag = False
                    break
            return flag
        for i in range(len(inner_table)):
            curent_row_total_colspan = 0   # how many cells have been inserted so far in this row
            if len(colspan[i]) >= 2:
                for w in range(len(colspan[i])):
                    if colspan[i][w+curent_row_total_colspan] > 1 and (colspan[i][w+curent_row_total_colspan] < 3 or colspan[i][w+curent_row_total_colspan] <= len(colspan[i])):
                        #if colspan[i][w+curent_row_total_colspan] > 1 and colspan[i][w+curent_row_total_colspan] < len(colspan[i]): # extra condition would block filling 2 columns in a 2-column table
                        for num in range(1, colspan[i][w+curent_row_total_colspan]):
                            colspan[i].insert(w+num+curent_row_total_colspan, 1)
                            rowspan[i].insert(w+num+curent_row_total_colspan, rowspan[i][w+curent_row_total_colspan])
                            inner_table[i].insert(w+num+curent_row_total_colspan, copy.deepcopy(inner_table[i][w+curent_row_total_colspan]))
                        colspan[i][w+curent_row_total_colspan] = 1
                        # NOTE(review): relies on `num` keeping its last loop value
                        # (colspan>1 guarantees at least one iteration).
                        curent_row_total_colspan += num
        for i in range(len(inner_table) -1):  # rowspan compares with the next row, hence -1
            if is_same_item(rowspan[i]):
                continue
            for j in range(len(rowspan[i])):
                if rowspan[i][j] > 1 and rowspan[i][j]<= len(inner_table) :
                    # Only copy down when column totals would line up afterwards.
                    if sum(colspan[i]) == sum(colspan[i+1]) + colspan[i][j] or sum(colspan[i]) == sum(colspan[i+1]) + len([span for span in rowspan[i] if span > 1]):
                        rowspan[i+1].insert(j, rowspan[i][j]-1)
                        colspan[i+1].insert(j, 1)
                        inner_table[i+1].insert(j, copy.deepcopy(inner_table[i][j]))
                        rowspan[i][j] = 1
        return inner_table, colspan, rowspan

    # Mark header cells: head_flag 1 = row header, 2 = column header.
    def setHead(inner_table, prob_min=0.5):
        # Idea (original note): first split the table into segments of equal
        # column count, find row headers inside each segment, then look for
        # column headers between consecutive row headers / segment ends.
        def del_continuous_value(l):
            # Collapse consecutive duplicates before scoring as a header.
            new_list = []
            new_list.append(l[0])
            for item in l:
                if item != new_list[-1]:
                    new_list.append(item)
            return new_list
        def find_diflen(l):
            # Return [begin0, end0, begin1, end1, ...] segment bounds where the
            # column count stays constant.
            area = [0]
            temp = l[0]
            for i in range(len(l)):
                if temp != l[i]:
                    area.append(i-1)
                    area.append(i)
                    temp = l[i]
            area.append(len(l)-1)
            return area
        l = [len(tr) for tr in inner_table]  # column count of each row
        diff = find_diflen(l)                # segment boundaries
        height = len(inner_table)
        rowHeader = []
        head_list = []
        area_end_index = []
        for index in range(0,len(diff),2):
            area_begin = diff[index]
            area_end = diff[index+1]
            head_list.append(area_begin)
            # row headers
            has_row_head = False
            for i in range(area_begin, area_end+1):
                if i == area_end:  # last row of a segment is never a header
                    continue
                if [item[0] for item in inner_table[i] if len(item[0]) > 20] != []:
                    # any cell longer than 20 chars disqualifies the row as a header
                    continue
                width = len(inner_table[i])
                is_row_head = False
                #item_set = set([item[0] for item in inner_table[i] if item[0] != ''])
                item_set = [item[0] for item in inner_table[i]]
                item_set = del_continuous_value(item_set)
                form_prob = formPredictor.predict(encoding('|'.join(item_set),expand=True),type="line")
                if form_prob is not None:
                    if form_prob[0][1]>prob_min:
                        is_row_head = True
                    else:
                        is_row_head = False
                #if fool.ner('|'.join(item_set)) != [[]]:
                #    is_row_head = False
                if is_row_head:
                    head_list.append(i)
                    rowHeader.append(i)
                    has_row_head = True
                    for j in range(width):
                        inner_table[i][j][1] = 1
            head_list.append(area_end)
            if has_row_head:
                # NOTE(review): reconstructed nesting - area_end is recorded both
                # as a search terminator and as a segment end; confirm both
                # appends belong inside this `if`.
                area_end_index.append(area_end)
                rowHeader.append(area_end)
        for index in range(len(rowHeader)-1):  # search column headers between row headers
            if rowHeader[index] in area_end_index:
                continue
            width = len(inner_table[rowHeader[index]])
            for i in range(width-1):  # last column is never a column header
                is_head = False
                # predict is head or not with model
                temp_item = []
                for j in range(rowHeader[index],rowHeader[index+1]+1):
                    temp_item.append(inner_table[j][i][0])
                #item_set = set([item for item in temp_item if item != ''])
                item_set = [item for item in temp_item]
                item_set = del_continuous_value(item_set)
                form_prob = formPredictor.predict(encoding('|'.join(item_set),expand=True),type="line")
                if form_prob is not None:
                    if form_prob[0][1]>prob_min:
                        is_head = True
                    else:
                        is_head = False
                if is_head:
                    for j in range(rowHeader[index]+1,rowHeader[index+1]+1):
                        inner_table[j][i][1] = 2
        return inner_table,diff,area_end_index

    # Decide whether a table segment should be read row-wise or column-wise.
    def getDirect(inner_table,begin,end):
        column_head = set()
        row_head = set()
        widths = len(inner_table[begin])
        for height in range(begin,end):
            for width in range(widths):
                if inner_table[height][width][1] ==1:
                    row_head.add(height)
                if inner_table[height][width][1] ==2:
                    column_head.add(width)
        company_pattern = re.compile("公司")
        #if 0 in column_head and begin not in row_head:
        if widths == 1 and begin != end:
            return "column"
        if 0 in column_head and begin in row_head:
            for height in range(begin,end):
                count = 0
                count_flag = True
                # NOTE(review): `width` here is the leftover value from the loop
                # above (widths-1), so the last column is skipped - looks like a
                # latent bug; confirm whether `widths` was intended.
                for width_index in range(width):
                    if inner_table[height][width_index][1]==0:
                        if re.search(company_pattern,inner_table[height][width_index][0]) is not None:
                            count += 1
                        else:
                            count_flag = False
                if count_flag and count>=2:
                    return "column"
        return "row"

    # Generate sentences following the table's reading direction.
    def getTableText(inner_table,head_list,area_end_index):
        # For each value cell, walk back towards the headers and prefix them;
        # rank-related and entity-related cells are emitted first.
        rankPattern = "(排名|排序|名次|评标结果|评审结果)"
        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
        height = len(inner_table)
        text = ""
        for index in range(0,len(head_list),2):
            head_begin = head_list[index]
            head_end = head_list[index+1]
            direct = getDirect(inner_table, head_begin, head_end)
            if direct=="row":
                # row-wise reading
                has_row_head = False
                for i in range(head_begin,head_end+1):
                    width = len(inner_table[i])
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    # duplicates inside the same sentence are dropped
                    text_set = set()
                    for j in range(width):
                        cell = inner_table[i][j]
                        # value cell (not a header)
                        if cell[1]==0:
                            head = ""
                            find_flag = False
                            temp_head = ""
                            # walk left collecting column headers (flag 2)
                            for loop_j in range(1,j+1):
                                if inner_table[i][j-loop_j][1]==2:
                                    if find_flag:
                                        if inner_table[i][j-loop_j][0]!=temp_head:
                                            head = inner_table[i][j-loop_j][0]+":"+head
                                    else:
                                        head = inner_table[i][j-loop_j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i][j-loop_j][0]
                                else:
                                    if find_flag:
                                        break
                            find_flag = False
                            temp_head = ""
                            if i > 0:
                                # walk upward from the current row to the segment start
                                for loop_i in range(i-1, head_begin-1, -1):
                                    if inner_table[loop_i][j][1]==1:
                                        if find_flag:
                                            if inner_table[loop_i][j][0]!=temp_head:
                                                head = inner_table[loop_i][j][0]+":"+head
                                        else:
                                            head = inner_table[loop_i][j][0]+":"+head
                                        find_flag = True
                                        temp_head = inner_table[loop_i][j][0]
                                    else:
                                        # stop once a value cell follows a found header
                                        if find_flag:
                                            break
                            if str(head+inner_table[i][j][0]) in text_set:
                                continue
                            if re.search(rankPattern,head) is not None:
                                rank_text += head+inner_table[i][j][0]+","
                            elif re.search(entityPattern,head) is not None:
                                entity_text += head+inner_table[i][j][0]+","
                            else:
                                text_line += head+inner_table[i][j][0]+","
                            text_set.add(str(head+inner_table[i][j][0]))
                    text += rank_text+entity_text+text_line
                    # short lines end with a comma, long ones with a full stop
                    text = text[:-1]+"," if len(re.sub('\s', '', rank_text+entity_text+text_line)) < 20 else text[:-1]+"。"
            else:
                # column-wise reading (same walk with i/j roles swapped)
                has_row_head = False
                width = len(inner_table[head_begin])
                for j in range(width):
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    # duplicates inside the same sentence are dropped
                    text_set = set()
                    for i in range(head_begin,head_end+1):
                        cell = inner_table[i][j]
                        # value cell (not a header)
                        if cell[1]==0:
                            head = ""
                            find_flag = False
                            temp_head = ""
                            if i > 0:
                                # walk upward collecting row headers (flag 1)
                                for loop_i in range(i-1, head_begin-1, -1):
                                    if inner_table[loop_i][j][1]==1:
                                        if find_flag:
                                            if inner_table[loop_i][j][0]!=temp_head:
                                                head = inner_table[loop_i][j][0]+":"+head
                                        else:
                                            head = inner_table[loop_i][j][0]+":"+head
                                        find_flag = True
                                        temp_head = inner_table[loop_i][j][0]
                                    else:
                                        # stop once a value cell follows a found header
                                        if find_flag:
                                            break
                            find_flag = False
                            temp_head = ""
                            for loop_j in range(1,j+1):  # walk left for column headers
                                if inner_table[i][j-loop_j][1]==2:
                                    if find_flag:
                                        if inner_table[i][j-loop_j][0]!=temp_head:
                                            head = inner_table[i][j-loop_j][0]+":"+head
                                    else:
                                        head = inner_table[i][j-loop_j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i][j-loop_j][0]
                                else:
                                    if find_flag:
                                        break
                            if str(head+inner_table[i][j][0]) in text_set:
                                continue
                            if re.search(rankPattern,head) is not None:
                                rank_text += head+inner_table[i][j][0]+","
                            elif re.search(entityPattern,head) is not None:
                                entity_text += head+inner_table[i][j][0]+","
                            else:
                                text_line += head+inner_table[i][j][0]+","
                            text_set.add(str(head+inner_table[i][j][0]))
                    text += rank_text+entity_text+text_line
                    # short lines end with a comma, long ones with a full stop
                    text = text[:-1]+"," if len(re.sub('\s', '', rank_text+entity_text+text_line)) < 20 else text[:-1]+"。"
        return text

    def trunTable(tbody):
        # Full pipeline for one table: matrix -> span fixup -> header tagging ->
        # text generation; the node's content is replaced by the generated text.
        inner_table, colspan, rowspan = getTable(tbody)
        inner_table, colspan, rowspan = fix_rowspan(inner_table, colspan, rowspan)
        if len(inner_table)>0 and len(inner_table[0])>0:
            inner_table,head_list,area_end_index = setHead(inner_table)
            tbody.string = getTableText(inner_table,head_list,area_end_index)
            #print(tbody.string)
            tbody.name = "table"
        return inner_table  # temporary: expose the intermediate matrix

    pat_head = re.compile('(名称|序号|项目|标项|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理|制造)')
    #pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
    pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
    tbodies = soup.find_all('table')
    # process every table; nested tables are handled innermost-first (reverse order)
    tables = []  # temporary: collect intermediate matrices
    for tbody_index in range(1,len(tbodies)+1):
        tbody = tbodies[len(tbodies)-tbody_index]
        #trunTable(tbody)
        inner_table = trunTable(tbody)
        tables.append(inner_table)
    tbodies = soup.find_all('tbody')
    # same pass for bare <tbody> elements, again in reverse order
    for tbody_index in range(1,len(tbodies)+1):
        tbody = tbodies[len(tbodies)-tbody_index]
        #trunTable(tbody)
        inner_table = trunTable(tbody)
        tables.append(inner_table)
    #return soup
    return soup, tables  # temporary: also return the matrices
  395. #数据清洗
  396. def segment(soup):
  397. segList = ["title"]
  398. commaList = ["p","div","h1", "h2", "h3", "h4", "h5", "h6", "header", "dl", "ul", "label"]
  399. spaceList = ["span"]
  400. tbodies = soup.find_all('tbody')
  401. if len(tbodies) == 0:
  402. tbodies = soup.find_all('table')
  403. # 递归遍历所有节点,插入符号
  404. for child in soup.find_all(recursive=True):
  405. if child.name == 'br':
  406. child.insert_before(',')
  407. child_text = re.sub('\s', '', child.get_text())
  408. if child_text == '' or child_text[-1] in ['。',',',':',';']:
  409. continue
  410. if child.name in segList:
  411. child.insert_after("。")
  412. if child.name in commaList:
  413. if len(child_text)>3 and len(child_text) <50: # 先判断是否字数少于50,成立加逗号,否则加句号
  414. child.insert_after(",")
  415. elif len(child_text) >=50:
  416. child.insert_after("。")
  417. #if child.name in spaceList:
  418. #child.insert_after(" ")
  419. text = str(soup.get_text())
  420. #替换"""为"“",否则导入deepdive出错
  421. text = text.replace('"',"“")
  422. #text = text.replace('"',"“").replace("\r","").replace("\n","")
  423. #删除所有空格
  424. text = re.sub("\s+","#nbsp#",text)
  425. text_list = text.split('#nbsp#')
  426. new_text = ''
  427. for i in range(len(text_list)-1):
  428. if text_list[i] == '' or text_list[i][-1] in [',','。',';',':']:
  429. new_text += text_list[i]
  430. elif re.findall('([一二三四五六七八九]、)', text_list[i+1][:4]) != []:
  431. new_text += text_list[i] + '。'
  432. elif re.findall('([0-9]、)', text_list[i+1][:4]) != []:
  433. new_text += text_list[i] + ';'
  434. elif text_list[i].isdigit() and text_list[i+1].isdigit():
  435. new_text += text_list[i] + ' '
  436. elif text_list[i][-1] in ['-',':','(',')','/','(',')','——','年','月','日','时','分','¥'] or text_list[i+1][0] in ['-',':','(',')','/','(',')','——','年','月','日','时','分','元','万元']:
  437. new_text += text_list[i]
  438. elif len(text_list[i]) >= 3 and len(text_list[i+1]) >= 3:
  439. new_text += text_list[i] + ','
  440. else:
  441. new_text += text_list[i]
  442. new_text += text_list[-1]
  443. text = new_text
  444. #替换英文冒号为中文冒号
  445. text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
  446. #替换为中文逗号
  447. text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
  448. #替换为中文分号
  449. text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
  450. #替换标点
  451. while(True):
  452. #替换连续的标点
  453. punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
  454. if punc is not None:
  455. text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
  456. punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
  457. if punc is not None:
  458. text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
  459. else:
  460. #替换标点之后的空格
  461. punc = re.search("(?P<punc>:|。|,|;)\s+",text)
  462. if punc is not None:
  463. text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
  464. else:
  465. break
  466. #将连续的中文句号替换为一个
  467. text_split = text.split("。")
  468. text_split = [x for x in text_split if len(x)>0]
  469. text = "。".join(text_split)
  470. return text
  471. def union_ner(list_ner):
  472. result_list = []
  473. union_index = []
  474. union_index_set = set()
  475. for i in range(len(list_ner)-1):
  476. if len(set([str(list_ner[i][2]),str(list_ner[i+1][2])])&set(["org","company"]))==2:
  477. if list_ner[i][1]-list_ner[i+1][0]==1:
  478. union_index_set.add(i)
  479. union_index_set.add(i+1)
  480. union_index.append((i,i+1))
  481. for i in range(len(list_ner)):
  482. if i not in union_index_set:
  483. result_list.append(list_ner[i])
  484. for item in union_index:
  485. #print(str(list_ner[item[0]][3])+str(list_ner[item[1]][3]))
  486. result_list.append((list_ner[item[0]][0],list_ner[item[1]][1],'company',str(list_ner[item[0]][3])+str(list_ner[item[1]][3])))
  487. return result_list
  488. def getTokensAndNers(sentences,MAXAREA = 100000):
  489. '''
  490. @param: sentences:句子数
  491. @return 限流执行后的分词和实体识别list
  492. '''
  493. def getData(tokens,ners,process_data):
  494. process_sentences = [item[1] for item in process_data]
  495. token_ = fool.cut(process_sentences)
  496. ner_ = fool.ner(process_sentences)
  497. for i in range(len(token_)):
  498. the_index = process_data[i][0]
  499. tokens[the_index] = token_[i]
  500. ners[the_index] = ner_[i]
  501. sents = []
  502. for i in range(len(sentences)):
  503. sents.append([i,sentences[i]])
  504. sents.sort(key=lambda x:len(x[1]),reverse=True)
  505. index_ = 0
  506. tokens = [[]for i in range(len(sentences))]
  507. ners = [[]for i in range(len(sentences))]
  508. while(True):
  509. width = len(sents[index_][1])
  510. height = MAXAREA//width+1
  511. if height>len(sents)-index_:
  512. height = len(sents)-index_
  513. process_data = sents[index_:index_+height]
  514. getData(tokens, ners, process_data)
  515. index_ += height
  516. if index_>=len(sents):
  517. break
  518. return tokens,ners
def get_articles_processed(articles):
    '''
    @summary: preprocessing pipeline - table linearization, text cleanup,
        NLP tokenization and entity recognition (incl. regex money extraction)
    @param:
        articles: list of [doc_id, html_content] pairs to process
    @return: (list_articles, list_sentences, list_entitys) - one Article per
        input, plus per-article lists of Sentences and Entity objects
    '''
    # NOTE(review): body re-indented from a whitespace-mangled paste; confirm
    # nesting against VCS history.
    list_articles = []
    list_sentences = []
    list_entitys = []
    for article in articles:
        list_sentences_temp = []
        list_entitys_temp = []
        doc_id = article[0]
        # table processing: linearize tables, then clean the remaining text
        #article_processed = segment(tableToText(BeautifulSoup(article[1],"lxml")))
        soup, tables = tableToText(BeautifulSoup(article[1],"lxml"))
        article_processed = segment(soup)
        #list_articles.append([doc_id,article_processed, article[1],tables]) # temporary: keep before/after for comparison
        #return list_articles # temporary: keep before/after for comparison
        list_articles.append(Article(doc_id,article_processed))
        # NLP processing
        if article_processed is not None and len(article_processed)!=0:
            split_patten = "。"
            sentences = re.split(split_patten,article_processed)
            sentences = [x for x in sentences if len(x)!=0]
            lemmas = []
            doc_offsets = []
            dep_types = []
            dep_tokens = []
            time1 = time.time()
            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # throttled execution instead of one big fool call
            tokens_all,ner_entitys_all = getTokensAndNers(sentences)
            print("nlp:",time.time()-time1)
            for sentence_index in range(len(sentences)):
                list_sentence_entitys = []
                sentence_text = sentences[sentence_index]
                tokens = tokens_all[sentence_index]
                # character offset of each token, used to map char spans -> token spans
                list_tokenbegin = []
                begin = 0
                for i in range(0,len(tokens)):
                    list_tokenbegin.append(begin)
                    begin += len(str(tokens[i]))
                list_tokenbegin.append(begin+1)
                #pos_tag = pos_all[sentence_index]
                pos_tag = ""
                ner_entitys = ner_entitys_all[sentence_index]
                list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys))
                # map each NER hit (char offsets) to token indexes and store it
                for ner_entity in ner_entitys:
                    begin_index_temp = ner_entity[0]
                    entity_type = ner_entity[2]
                    entity_text = ner_entity[3]
                    for j in range(len(list_tokenbegin)):
                        if list_tokenbegin[j]==begin_index_temp:
                            begin_index = j
                            break
                        elif list_tokenbegin[j]>begin_index_temp:
                            begin_index = j-1
                            break
                    begin_index_temp += len(str(entity_text))
                    for j in range(begin_index,len(list_tokenbegin)):
                        if list_tokenbegin[j]>=begin_index_temp:
                            end_index = j-1
                            break
                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                    list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index))
                # money amounts are found by regex, one pattern family per style
                entity_type = "money"
                #money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[\(\)()元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[¥¥]+,?|报价|标价)[(\(]?([万])?元?[)\)]?[::]?.{,7}?([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)|([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)[\((]?([万元]{1,2}))*"
                list_money_pattern = {"cn":"(()()([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})())*",
                                      "key_word":"((?:[¥¥]+,?|报价|标价)(?:[(\(]?\s*([万元]*)\s*[)\)]?)\s*[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分]{,7}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())*",
                                      "front_m":"((?:[(\(]?\s*([万元]+)\s*[)\)])\s*[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分]{,7}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())*",
                                      "behind_m":"(()()([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?([万元]+)[\))]?)*"}
                set_begin = set()
                for pattern_key in list_money_pattern.keys():
                    pattern = re.compile(list_money_pattern[pattern_key])
                    all_match = re.findall(pattern, sentence_text)
                    index = 0
                    for i in range(len(all_match)):
                        if len(all_match[i][0])>0:
                            #print(all_match[i][0])
                            unit = ""
                            entity_text = all_match[i][3]
                            if pattern_key in ["key_word","front_m"]:
                                unit = all_match[i][1]
                            else:
                                unit = all_match[i][4]
                            if entity_text.find("元")>=0:
                                unit = ""
                            # advance to the numeric part only: keeping the digits
                            # as the entity avoids losing features downstream
                            index += len(all_match[i][0])-len(entity_text)-len(all_match[i][4])#-len(all_match[i][1])-len(all_match[i][2])
                            for j in range(len(list_tokenbegin)):
                                if list_tokenbegin[j]==index:
                                    begin_index = j
                                    break
                                elif list_tokenbegin[j]>index:
                                    begin_index = j-1
                                    break
                            index += len(str(entity_text))+len(all_match[i][4])#+len(all_match[i][2])+len(all_match[i][1])
                            #index += len(str(all_match[i][0]))
                            for j in range(begin_index,len(list_tokenbegin)):
                                if list_tokenbegin[j]>=index:
                                    end_index = j-1
                                    break
                            entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                            # strip everything that is not a digit / CN numeral / unit
                            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]","",entity_text)
                            if len(unit)>0:
                                entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
                            else:
                                entity_text = str(getUnifyMoney(entity_text))
                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index))
                        else:
                            index += 1
                list_sentence_entitys.sort(key=lambda x:x.begin_index)
                list_entitys_temp = list_entitys_temp+list_sentence_entitys
        list_sentences.append(list_sentences_temp)
        list_entitys.append(list_entitys_temp)
    return list_articles,list_sentences,list_entitys
  643. def union_result(codeName,prem):
  644. '''
  645. @summary:模型的结果拼成字典
  646. @param:
  647. codeName:编号名称模型的结果字典
  648. prem:拿到属性的角色的字典
  649. @return:拼接起来的字典
  650. '''
  651. result = []
  652. assert len(codeName)==len(prem)
  653. for item_code,item_prem in zip(codeName,prem):
  654. if item_code[0]==item_prem[0]:
  655. result.append([item_code[0],dict(item_code[1],**item_prem[1])])
  656. return result
  657. def persistenceData(data):
  658. '''
  659. @summary:将中间结果保存到数据库-线上生产的时候不需要执行
  660. '''
  661. import psycopg2
  662. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  663. cursor = conn.cursor()
  664. for item_index in range(len(data)):
  665. item = data[item_index]
  666. doc_id = item[0]
  667. dic = item[1]
  668. code = dic['code']
  669. name = dic['name']
  670. prem = dic['prem']
  671. if len(code)==0:
  672. code_insert = ""
  673. else:
  674. code_insert = ";".join(code)
  675. prem_insert = ""
  676. for item in prem:
  677. for x in item:
  678. if isinstance(x, list):
  679. if len(x)>0:
  680. for x1 in x:
  681. prem_insert+="/".join(x1)+","
  682. prem_insert+="$"
  683. else:
  684. prem_insert+=str(x)+"$"
  685. prem_insert+=";"
  686. sql = " insert into predict_validation(doc_id,code,name,prem) values('"+doc_id+"','"+code_insert+"','"+name+"','"+prem_insert+"')"
  687. cursor.execute(sql)
  688. conn.commit()
  689. conn.close()
def persistenceData1(list_entitys,list_sentences):
    '''
    @summary: persist intermediate entity/sentence results to the database -
        not needed in the online production pipeline
    @param:
        list_entitys: per-article lists of Entity objects
        list_sentences: per-article lists of Sentences objects
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    # NOTE(review): all statements below are built by string concatenation -
    # vulnerable to SQL injection / broken quoting on scraped text; consider
    # parameterized queries.
    for list_entity in list_entitys:
        for entity in list_entity:
            if entity.values is not None:
                sql = " insert into predict_entity(entity_id,entity_text,entity_type,doc_id,sentence_index,begin_index,end_index,label,values) values('"+str(entity.entity_id)+"','"+str(entity.entity_text)+"','"+str(entity.entity_type)+"','"+str(entity.doc_id)+"',"+str(entity.sentence_index)+","+str(entity.begin_index)+","+str(entity.end_index)+","+str(entity.label)+",array"+str(entity.values)+")"
            else:
                sql = " insert into predict_entity(entity_id,entity_text,entity_type,doc_id,sentence_index,begin_index,end_index) values('"+str(entity.entity_id)+"','"+str(entity.entity_text)+"','"+str(entity.entity_type)+"','"+str(entity.doc_id)+"',"+str(entity.sentence_index)+","+str(entity.begin_index)+","+str(entity.end_index)+")"
            cursor.execute(sql)
    for list_sentence in list_sentences:
        for sentence in list_sentence:
            # Build a postgres array literal from the tokens, doubling any
            # single-quote token to escape it.
            str_tokens = "["
            for item in sentence.tokens:
                str_tokens += "'"
                if item=="'":
                    str_tokens += "''"
                else:
                    str_tokens += item
                str_tokens += "',"
            str_tokens = str_tokens[:-1]+"]"
            sql = " insert into predict_sentences(doc_id,sentence_index,tokens) values('"+sentence.doc_id+"',"+str(sentence.sentence_index)+",array"+str_tokens+")"
            cursor.execute(sql)
    conn.commit()
    conn.close()
  719. if __name__=="__main__":
  720. import glob
  721. import re
  722. #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190320/*.html')
  723. #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190306/*.html')
  724. #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190513/*.html')
  725. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58466066.html')
  726. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58447523.html')
  727. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58511386.html')
  728. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58521609.html')
  729. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58502967.html') # 内容缺失
  730. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58445908.html') # 把采购人、单位名称识别为表头
  731. #files = glob.glob('F:/工作文档/实体识别实体对其/20190416要素\\比地_101_61320687.html') #行表头识别不到
  732. #files = glob.glob('F:/工作文档/实体识别实体对其/20190306\\比地_52_57131306.html') # span 空格区分不了
  733. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58522893.html') # 某行tr没有td
  734. files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58447523.html')
  735. #files = glob.glob('F:/工作文档/实体识别实体对其/test/*.html')
  736. #files = ['F:/工作文档/实体识别实体对其/1.html']
  737. print(len(files))
  738. i = 0
  739. filePaths =[]
  740. for file in files:
  741. with open(file, 'r', encoding='utf-8') as f:
  742. content = f.read()
  743. filePaths.append([file, content])
  744. #tables = re.findall('<table[^<].*?</table>', re.sub('\s','',content))
  745. ##if len(tables) == 0 and re.search('采购人', content) != None and re.search('代理机构',content) != None and re.search('&nbsp;',content) != None:
  746. ##filePaths.append([file, content])
  747. #for table in tables:
  748. #if re.search('排序', table) != None or re.search('名次',table) != None\
  749. #or re.search('排名', table) != None:
  750. ##if re.search('colspan', table) != None and re.search('rowspan',table) != None and re.search('第一中标', table) != None:
  751. #filePaths.append([file, content])
  752. #break
  753. list_articles = get_articles_processed(filePaths)
  754. #list_articles,list_sentences,list_entitys = get_articles_processed(filePaths)
  755. with open('F:/工作文档/实体识别实体对其/20190416要素/list_articles_20190306.pkl', 'wb') as f:
  756. pickle.dump(list_articles, f)
  757. print(len(list_articles))
  758. #doc_id = "09067598-7076-11e8-9dae-52540087e52f"
  759. #import psycopg2
  760. #conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  761. #cursor = conn.cursor()
  762. #sql = " select id,content from articles where id='"+doc_id+"' "
  763. #cursor.execute(sql)
  764. #ContentIDs = cursor.fetchall()
  765. #list_articles,list_sentences,list_entitys = get_articles_processed(ContentIDs)
  766. #for i in range(len(list_entitys)):
  767. #for j in range(len(list_entitys[i])):
  768. #entity = list_entitys[i][j]
  769. #sentence = list_sentences[i][entity.sentence_index]
  770. #tokens = sentence.tokens
  771. #begin_index = entity.begin_index
  772. #end_index = entity.end_index
  773. #if entity.entity_type in ['org','company']:
  774. #item_x = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1])
  775. ##print(item_x)