#coding:utf8
# Source repository: luojiehua/BIDI_ML_INFO_EXTRACTION

from bs4 import BeautifulSoup, Comment
import copy
import re
import os 
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
import cx_Oracle as oracle
import pandas as pd


# Keywords that mark a row of plain <td> cells as a header row.
_HEADER_KEYWORDS = re.compile('(序号|项目|产品|货物|单位|数量|价格|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|科室){1}')


def _span_count(cell, attr):
    '''Return the integer rowspan/colspan of a cell, or 1 if the value is not an integer.'''
    try:
        return int(str(cell[attr]).strip())
    except (TypeError, ValueError):
        return 1


def _expand_spans(trs):
    '''
    Duplicate cells that carry rowspan/colspan so every row has one cell per column.

    The rows are modified in place.

    Args:
        trs: the direct <tr> children of one table body, in document order
    Returns:
        (ths, header_rows): every <th> found as a direct child of a row, and
        the rows that contained at least one <th>, both in document order
    '''
    ths = []
    # A list, not a set: bs4 tags compare by markup, so a set could merge
    # distinct but identical-looking rows and would lose the document order.
    header_rows = []
    for row_idx, tr in enumerate(trs):
        row_ths = tr.findChildren('th', recursive=False)
        if row_ths:
            ths.extend(row_ths)
            header_rows.append(tr)
        cells = tr.findChildren(recursive=False)
        if len(cells) <= 1:
            continue
        for col_idx, cell in enumerate(cells):
            # rowspan: copy the cell into the same position of the rows below
            if 'rowspan' in cell.attrs:
                span = _span_count(cell, 'rowspan')
                cell['rowspan'] = 1
                for offset in range(1, span):
                    if row_idx + offset >= len(trs):
                        break
                    target = trs[row_idx + offset]
                    target_tds = target.findChildren('td', recursive=False)
                    if len(target_tds) < col_idx:
                        continue
                    if col_idx > 0:
                        target_tds[col_idx - 1].insert_after(copy.copy(cell))
                    elif target_tds:
                        target_tds[0].insert_before(copy.copy(cell))
                    else:
                        # The row below has no <td> at all; put the copy first
                        # instead of indexing into an empty list.
                        target.insert(0, copy.copy(cell))
            # colspan: repeat the cell right after itself in the same row
            if 'colspan' in cell.attrs:
                span = _span_count(cell, 'colspan')
                cell['colspan'] = 1
                for _ in range(1, span):
                    cell.insert_after(copy.copy(cell))
    return ths, header_rows


def _horizontal_text(tbody, ths):
    '''Render a table whose single header row holds the <th> cells: "header：value;" per cell.'''
    rows = tbody.findChildren('tr', recursive=False)
    parts = []
    # The first row is skipped; it is taken to be the header row.
    for tr in rows[1:]:
        tds = tr.findChildren('td', recursive=False)
        line = ''.join(th.get_text() + "：" + td.get_text() + ";" for th, td in zip(ths, tds))
        parts.append(line + "；")
    return ''.join(parts)


def _vertical_text(header_rows):
    '''Render a table in which each header row starts with its own <th>, column by column.'''
    labelled = [
        (row.findChildren('th', recursive=False)[0], row.findChildren('td', recursive=False))
        for row in header_rows
    ]
    column_count = len(labelled[0][1])
    parts = []
    for col in range(column_count):
        for th, tds in labelled:
            if col < len(tds):
                parts.append(th.get_text() + tds[col].get_text() + "；")
    return ''.join(parts)


def _convert_keyword_header_table(tbody):
    '''
    Handle a table without usable <th> cells.

    Rows are scanned for one with more than two <td> cells and more than two
    header keywords. Its cells are renamed to <th>, and on the following
    iteration rows 1..end are rendered and the body is replaced by a <p>.
    A table whose header is only found on its last row is left as a table.
    '''
    trs = tbody.findChildren('tr', recursive=False)
    ths = []
    for tr in trs:
        if ths:
            parts = []
            for row in trs[1:]:
                tds = row.findChildren('td', recursive=False)
                if len(tds) == len(ths):
                    line = ''.join(th.get_text() + "：" + td.get_text() + "；" for th, td in zip(ths, tds))
                    # swap the trailing "；" for a sentence end
                    line = line[:-1] + "。"
                else:
                    # column count differs from the header: emit the raw cell text
                    line = ''.join(td.get_text() for td in tds[:len(ths)]) + "。"
                parts.append(line)
            tbody.string = ''.join(parts)
            tbody.name = 'p'
            break
        tds0 = tr.findChildren('td', recursive=False)
        if len(tds0) > 2:
            joined = ";".join(td.get_text() for td in tds0)
            if _HEADER_KEYWORDS.subn('', joined)[1] > 2:
                for td in tds0:
                    td.name = 'th'
                    ths.append(td)


def table2text(soup):
    '''
    Convert the tables of an announcement into plain text.

    Every <tbody> (or every <table> when the document has no <tbody>) is
    processed in place: spanned cells are expanded, then the body is replaced
    by a text node pairing header text with cell text where a header can be
    identified.

    Args:
        soup: BeautifulSoup instance
    Returns:
        the same BeautifulSoup instance after processing
    '''
    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    for tbody in tbodies:
        trs = tbody.findChildren('tr', recursive=False)
        ths, header_rows = _expand_spans(trs)
        if len(ths) > 1:  # table with <th> headers
            if len(header_rows) == 1:  # horizontal: one header row
                tbody.string = _horizontal_text(tbody, ths)
                tbody.name = 'div'
            else:  # vertical: each header row carries its own <th>
                tbody.string = _vertical_text(header_rows)
                tbody.name = 'p'
        else:  # header, if any, is written with plain <td> cells
            _convert_keyword_header_table(tbody)
    return soup

def segment(soup):
    '''
    Flatten a parsed announcement into a single line of punctuated text.

    Separator strings are inserted around selected tags, the text is
    extracted, ASCII punctuation next to Chinese characters is replaced by
    its full-width form, whitespace inside <td>/<a> content is removed and
    runs of punctuation are collapsed.

    Args:
        soup: BeautifulSoup instance (modified in place by the inserted separators)
    Returns:
        the normalised text as a str
    '''
    sentence_tags = ["tr"]
    comma_tags = ["p", "div", "br"]
    compact_tags = ["td", 'a']

    # Fall back to the whole document when the parser produced no <body>.
    root = soup.body if soup.body is not None else soup
    # Snapshot the nodes first so inserting separators cannot disturb the walk.
    for node in list(root.descendants):
        if node.name in sentence_tags:
            node.insert_after("。")
        if node.name in comma_tags:
            node.insert_after("，")
        if node.name in compact_tags:
            node.insert_before("#subs#")
            node.insert_after("#sube#")
    text = str(soup.get_text())

    # Replace ASCII double quotes (they break the deepdive import) and drop line breaks.
    text = text.replace('"',"“").replace("\r","").replace("\n","")

    # ASCII colon / comma / semicolon adjacent to a Chinese character -> full-width form
    text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])","：",text)
    text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])","，",text)
    text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])","；",text)

    # Remove all whitespace inside marked spans. Spans can nest (an <a> inside
    # a <td>), so resolve the innermost pair first and repeat until none is left;
    # a single non-greedy pass would leave stray markers in the output.
    innermost = re.compile(r"#subs#((?:(?!#subs#|#sube#).)*)#sube#")
    replaced = 1
    while replaced:
        text, replaced = innermost.subn(lambda m: re.sub(r"\s", "", m.group(1)), text)

    # After any of ：。，； drop the whitespace and full-width commas that follow.
    text = re.sub(r"([：。，；])[\s，]+", r"\1", text)

    # Collapse consecutive full stops (this also drops a leading or trailing one).
    text = "。".join(piece for piece in text.split("。") if len(piece) > 0)
    return text


if __name__=="__main__":
    # Read announcement HTML from Oracle, flatten each document with
    # table2text + segment, and write the results to articles.csv.

    # NOTE(review): the default below keeps the credentials that were
    # hard-coded in this script; set BIDI_ORACLE_DSN to override them and
    # consider removing the literal from source control.
    dsn = os.environ.get('BIDI_ORACLE_DSN', 'bxkc/bxkc@192.168.2.54:1521/orcl')

    # connect oracle database
    db = oracle.connect(dsn)
    htmls = []
    try:
        # create cursor
        cursor = db.cursor()
        try:
            # execute sql
            cursor.execute("select dochtmlcon from sys_document where docchannel='101' and dochtmlcon is not NULL  and rownum<10000")

            rows = cursor.fetchall()
            # Rows are processed while the connection is still open.
            for row in rows:
                content = row[0]
                htmls.append(segment(table2text(BeautifulSoup(content,"lxml"))))
        finally:
            # release the cursor even when parsing a document fails
            cursor.close()
    finally:
        db.close()

    dataframe = pd.DataFrame({'content':htmls})
    columns = ['content']
    dataframe.to_csv("articles.csv",index=True,header=False,sep=",",encoding="utf8",columns=columns)