# Source repository: luojiehua / BIDI_ML_INFO_EXTRACTION
'''
Created on 2018-12-29

@author: User
'''

from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import time
sys.path.append(os.path.abspath("../.."))
import fool
from BiddingKG.dl.common.Connection import getConnection

def tableToText(soup):
    """Rewrite every table in ``soup`` as a flat run of text, in place.

    All ``tbody`` tags are collected (or all ``table`` tags when the document
    has no ``tbody``) and processed in reverse order so that nested tables are
    handled before the tables that contain them.  For each one the
    colspan/rowspan cells are expanded, the cell grid is extracted and padded
    to a rectangle, header cells are detected, and the tag's content is
    replaced by text of the form ``head:value，head:value。``.  The tag is then
    renamed to ``table``.

    Args:
        soup: a parsed BeautifulSoup document (or tag); it is modified in place.

    Returns:
        The same ``soup`` object that was passed in.
    """
    def fixSpan(tbody):
        """Expand colspan/rowspan attributes so each spanned slot has its own cell.

        Columns are completed for every row first and rows afterwards; doing it
        the other way round can scramble the parsed table.
        """
        def spanCount(value):
            # Keep only the digits of the attribute; None when nothing usable
            # is left (empty attribute or a value such as "abc").
            digits = re.sub("[^0-9]", "", str(value))
            return int(digits) if digits != "" else None

        trs = tbody.findChildren('tr', recursive=False)
        # Pass 1: colspan -> duplicate the cell inside its own row.
        for tr in trs:
            # Rows that contain a nested table are left untouched.
            if len(tr.findChildren('table')) > 0:
                continue
            for td in tr.findChildren(recursive=False):
                if 'colspan' not in td.attrs:
                    continue
                col = spanCount(td['colspan'])
                if col is None:
                    continue
                # Reset the span before copying so the copies do not span again.
                td['colspan'] = 1
                for _ in range(1, col):
                    td.insert_after(copy.copy(td))
        # Pass 2: rowspan -> copy the cell into the same position of the
        # following rows.
        for indtr, tr in enumerate(trs):
            if len(tr.findChildren('table')) > 0:
                continue
            for indtd, td in enumerate(tr.findChildren(recursive=False)):
                if 'rowspan' not in td.attrs:
                    continue
                row = spanCount(td['rowspan'])
                if row is None:
                    continue
                td['rowspan'] = 1
                for i in range(1, row):
                    # The span may claim more rows than the table really has.
                    if indtr + i >= len(trs):
                        break
                    tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                    if len(tds1) >= indtd and len(tds1) > 0:
                        if indtd > 0:
                            tds1[indtd - 1].insert_after(copy.copy(td))
                        else:
                            tds1[0].insert_before(copy.copy(td))

    def getTable(tbody):
        """Return the table as a list of rows of ``[text, flag]`` cells.

        ``text`` is the cell text with all whitespace removed and ``flag``
        starts at 0 (value cell); ``setHead`` later marks header cells.
        """
        inner_table = []
        for tr in tbody.findChildren('tr', recursive=False):
            tds = tr.findChildren(['td', 'th'], recursive=False)
            inner_table.append([[re.sub(r'\s+', '', td.get_text()), 0] for td in tds])
        return inner_table

    def fixTable(inner_table):
        """Pad short rows with empty value cells so that all rows have equal width."""
        maxWidth = max((len(row) for row in inner_table), default=0)
        for row in inner_table:
            for _ in range(maxWidth - len(row)):
                row.append(["", 0])
        return inner_table

    def setHead(inner_table, pattern, pat_value, count):
        """Mark header cells in ``inner_table`` and split it into sections.

        A row is marked as a header row (flag 1) when, scanning its cells from
        the left, ``count`` distinct cell texts match ``pattern`` and no cell
        matches ``pat_value``.  Header rows split the table into sections;
        inside each section every column except the last is marked as a column
        header (flag 2) by the same rule applied vertically.

        Returns:
            ``(inner_table, head_list)`` where ``head_list`` holds the section
            boundaries: 0, the index of every header row, and the table height.
        """
        height = len(inner_table)
        width = len(inner_table[0])
        head_list = [0]
        # Row headers.
        for i in range(height):
            set_match = set()
            is_head = False
            for j in range(width):
                # Anything that looks like a value vetoes the whole row.
                if re.search(pat_value, inner_table[i][j][0]) is not None:
                    is_head = False
                    break
                if len(re.findall(pattern, inner_table[i][j][0])) > 0:
                    set_match.add(inner_table[i][j][0])
                if len(set_match) >= count:
                    is_head = True
            if is_head:
                head_list.append(i)
                for j in range(width):
                    inner_table[i][j][1] = 1
        head_list.append(height)
        # Column headers, evaluated separately inside each section.
        for sec in range(len(head_list) - 1):
            head_begin = head_list[sec]
            head_end = head_list[sec + 1]
            # The last column is never treated as a column header.
            for col in range(width - 1):
                set_match = set()
                is_head = False
                for r in range(head_begin, head_end):
                    if re.search(pat_value, inner_table[r][col][0]) is not None:
                        is_head = False
                        break
                    if len(re.findall(pattern, inner_table[r][col][0])) > 0:
                        set_match.add(inner_table[r][col][0])
                    if len(set_match) >= count:
                        is_head = True
                if is_head:
                    for r in range(head_begin, head_end):
                        inner_table[r][col][1] = 2
        return inner_table, head_list

    def getDirect(inner_table, begin, end):
        """Decide whether rows ``begin``..``end-1`` are read by "row" or by "column".

        Returns "column" when column 0 is a column header and row ``begin`` is
        not a header row.  When both are headers, "column" is returned only if
        some row of the section has at least two value cells and every value
        cell of that row contains "公司".  Otherwise "row" is returned.
        """
        column_head = set()
        row_head = set()
        widths = len(inner_table[0])
        for height in range(begin, end):
            for width in range(widths):
                if inner_table[height][width][1] == 1:
                    row_head.add(height)
                if inner_table[height][width][1] == 2:
                    column_head.add(width)
        company_pattern = re.compile("公司")
        if 0 in column_head and begin not in row_head:
            return "column"
        if 0 in column_head and begin in row_head:
            for height in range(begin, end):
                count = 0
                count_flag = True
                # Scan every column of the row.  The previous code looped over
                # range(width) with the leftover loop variable, which skipped
                # the last column and shrank the range on every following row.
                for width in range(widths):
                    if inner_table[height][width][1] == 0:
                        if re.search(company_pattern, inner_table[height][width][0]) is not None:
                            count += 1
                        else:
                            count_flag = False
                if count_flag and count >= 2:
                    return "column"
        return "row"

    def getTableText(inner_table, head_list):
        """Serialise the flagged grid into ``head:value，...。`` text.

        Each section between two entries of ``head_list`` is emitted line by
        line, where a line is a table row or a table column depending on
        ``getDirect``.  Within a line, items whose header matches the ranking
        pattern come first, then items whose header matches the entity
        pattern, then everything else.  An item already emitted in the same
        section is skipped.
        """
        rankPattern = "(排名|排序|名次|评标结果)"
        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
        width = len(inner_table[0])

        def prependHeads(head, cells, flag):
            # Walk ``cells`` (ordered nearest first), skip cells until the
            # first one carrying ``flag``, then prepend each header of that
            # contiguous run to ``head``; consecutive identical headers (from
            # expanded spans) are added only once.
            found = False
            last = ""
            for cell in cells:
                if cell[1] == flag:
                    if not found or cell[0] != last:
                        head = cell[0] + ":" + head
                    found = True
                    last = cell[0]
                elif found:
                    break
            return head

        def headOf(i, j):
            # Column headers to the left in the same row first, then row
            # headers above in the same column.
            head = prependHeads("", (inner_table[i][j - k] for k in range(1, j + 1)), 2)
            return prependHeads(head, (inner_table[i - k][j] for k in range(1, i + 1)), 1)

        def lineText(positions, text_set):
            # Build the text for one line (one row or one column of the section).
            rank_text = ""
            entity_text = ""
            text_line = ""
            for i, j in positions:
                cell = inner_table[i][j]
                # Only value cells produce output.
                if cell[1] != 0:
                    continue
                head = headOf(i, j)
                item = str(head + cell[0])
                if item in text_set:
                    continue
                if re.search(rankPattern, head) is not None:
                    rank_text += item + "，"
                elif re.search(entityPattern, head) is not None:
                    entity_text += item + "，"
                else:
                    text_line += item + "，"
                text_set.add(item)
            return rank_text + entity_text + text_line

        text = ""
        for head_i in range(len(head_list) - 1):
            text_set = set()
            head_begin = head_list[head_i]
            head_end = head_list[head_i + 1]
            if getDirect(inner_table, head_begin, head_end) == "row":
                lines = ([(i, j) for j in range(width)]
                         for i in range(head_begin, head_end))
            else:
                lines = ([(i, j) for i in range(head_begin, head_end)]
                         for j in range(width))
            for positions in lines:
                text += lineText(positions, text_set)
                # Replace the trailing separator with a full stop.
                text = text[:-1] + "。"
        return text

    pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
    # NOTE(review): the "." in the first alternative is unescaped, so it matches
    # any character - confirm whether a literal decimal point was intended.
    pat_value = re.compile(r"(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    # Walk the tables in reverse order so nested tables are processed first.
    for tbody in reversed(tbodies):
        fixSpan(tbody)
        inner_table = fixTable(getTable(tbody))
        if len(inner_table) > 0:
            inner_table, head_list = setHead(inner_table, pat_head, pat_value, 3)
            tbody.string = getTableText(inner_table, head_list)
            tbody.name = "table"
    return soup

def segment(soup):
    """Convert a parsed document into one normalised line of text.

    Sentence separators and temporary ``#subs<tag>#``/``#sube<tag>#`` markers
    are inserted around selected tags, the text is extracted, ASCII
    punctuation next to Chinese characters is replaced by its full-width
    form, whitespace inside the marked tags is removed, runs of punctuation
    are collapsed and finally all remaining whitespace is dropped.

    Args:
        soup: a parsed BeautifulSoup document; text nodes are inserted into it.

    Returns:
        The cleaned text as a ``str`` with sentences separated by "。".
    """
    # Tags followed by a full stop / a comma / a space, and tags whose inner
    # whitespace is stripped afterwards.
    segList = ["tr"]
    commaList = []
    spaceList = ["span"]
    subspaceList = ["td", 'a', "span"]

    # A document without <body> used to crash with AttributeError here;
    # fall back to walking the whole soup in that case.
    root = soup.body if soup.body is not None else soup
    # Walk every node and insert the separators/markers.
    for child in root.descendants:
        if child.name in segList:
            child.insert_after("。")
        if child.name in commaList:
            child.insert_after("，")
        if child.name in subspaceList:
            child.insert_before("#subs" + str(child.name) + "#")
            child.insert_after("#sube" + str(child.name) + "#")
        if child.name in spaceList:
            child.insert_after(" ")
    text = str(soup.get_text())

    # Replace '"' with '“' (otherwise the import into deepdive fails) and
    # drop line breaks.
    text = text.replace('"', "“").replace("\r", "").replace("\n", "")

    # ASCII colon, comma and semicolon adjacent to a Chinese character become
    # their full-width counterparts.
    text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", "：", text)
    text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])", "，", text)
    text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])", "；", text)

    # Remove all whitespace inside the marked tags and drop the markers.
    for subs in subspaceList:
        begin_mark = "#subs" + str(subs) + "#"
        end_mark = "#sube" + str(subs) + "#"
        marked = re.compile(begin_mark + "(.*?)" + end_mark)
        while True:
            oneMatch = marked.search(text)
            if oneMatch is None:
                break
            inner = oneMatch.group(1)
            text = text.replace(begin_mark + inner + end_mark, re.sub(r"\s", "", inner))

    # Collapse punctuation.  Every substitution shortens the text, so the loop
    # terminates; it runs until no rule matches.  The previous version could
    # stop right after a substitution by the first rule without re-checking
    # it, leaving e.g. "，。" in the output.
    while True:
        changed = False
        # A comma directly before another punctuation mark is dropped.
        punc = re.search(r"，(?P<punc>：|。|，|；)\s*", text)
        if punc is not None:
            text = re.sub("，" + punc.group("punc") + r"\s*", punc.group("punc"), text)
            changed = True
        # A comma directly after another punctuation mark is dropped.
        punc = re.search(r"(?P<punc>：|。|，|；)\s*，", text)
        if punc is not None:
            text = re.sub(punc.group("punc") + r"\s*，", punc.group("punc"), text)
            changed = True
        else:
            # Whitespace following a punctuation mark is dropped.
            punc = re.search(r"(?P<punc>：|。|，|；)\s+", text)
            if punc is not None:
                text = re.sub(punc.group("punc") + r"\s+", punc.group("punc"), text)
                changed = True
        if not changed:
            break

    # Collapse runs of "。" into one (this also strips leading/trailing ones).
    text = "。".join([x for x in text.split("。") if len(x) > 0])
    # Remove all remaining whitespace.
    text = re.sub(r"\s+", "", text)
    return text

if __name__=="__main__":
    # Ad-hoc timing run: load 70 validation articles, turn them into
    # sentences and run the tokenizer / NER tagger over them.
    sql = " select content,id from articles where id in(select doc_id from articles_validation where exists(select 1 from articles_processed where id=doc_id)) order by id limit 70"

    # Assumes getConnection() returns a DB-API 2.0 connection (cursor()/close())
    # - TODO confirm.  The cursor and the connection are released as soon as
    # the rows are fetched, even when the query fails.
    conn = getConnection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

    sentences = []
    a = time.time()
    for row in rows:
        content = row[0]
        # extend() avoids rebuilding the whole list for every article.
        sentences.extend(re.split("。",segment(tableToText(BeautifulSoup(content,"lxml")))))
    tokens_all = fool.cut(sentences)
    #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
    ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
    print("takes:",time.time()-a)