#coding:utf8
"""Turn announcement HTML into punctuated plain text.

``table2text`` rewrites HTML tables as "header:value;" style text and
``segment`` flattens the whole document into one string with normalised
punctuation.  Run as a script, the module reads HTML documents from an
Oracle table and writes the resulting texts to ``articles.csv``.
"""
from bs4 import BeautifulSoup, Comment
import copy
import re
import os

# Kept ahead of the cx_Oracle import, as in the original ordering --
# presumably so the Oracle client picks the setting up when it initialises.
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
import cx_Oracle as oracle
import pandas as pd

# Words that typically appear in the header row of a table that uses plain
# <td> cells (no <th>).  Compiled once instead of once per table row.
_HEADER_KEYWORD_RE = re.compile('(序号|项目|产品|货物|单位|数量|价格|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|科室){1}')

# Full-width punctuation is written as escapes so that Unicode normalisation
# of this source file cannot silently turn it back into ASCII.
_FULLWIDTH_COLON = "\uff1a"
_FULLWIDTH_COMMA = "\uff0c"
_FULLWIDTH_SEMICOLON = "\uff1b"

# ASCII punctuation directly before or after a CJK character.
_ASCII_COLON_RE = re.compile("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])")
_ASCII_COMMA_RE = re.compile("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])")
_ASCII_SEMICOLON_RE = re.compile("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])")

# A punctuation mark followed by any run of whitespace and commas.
_PUNCT_RUN_RE = re.compile("([:\uff1a。,\uff0c;\uff1b])[\\s,\uff0c]+")

# Temporary markers wrapped around elements whose inner whitespace is removed.
_OPEN_MARK = "#subs#"
_CLOSE_MARK = "#sube#"
_MARK_RE = re.compile("(#subs#|#sube#)")

_DEFAULT_DSN = 'bxkc/bxkc@192.168.2.54:1521/orcl'
_QUERY = ("select dochtmlcon from sys_document where docchannel='101' "
          "and dochtmlcon is not NULL and rownum<10000")


def _parse_span(raw):
    """Return a rowspan/colspan value as an int >= 1 (1 if it is malformed)."""
    try:
        return max(int(raw), 1)
    except (TypeError, ValueError):
        return 1


def _expand_spans(rows, row_index):
    """Duplicate the cells of one row so rowspan/colspan become explicit.

    A cell with ``rowspan=n`` is copied into the same position of the next
    ``n - 1`` rows; a cell with ``colspan=n`` is followed by ``n - 1`` copies
    of itself.  Rows with at most one child cell are left untouched.
    """
    cells = rows[row_index].find_all(recursive=False)
    if len(cells) <= 1:
        return
    for col_index, cell in enumerate(cells):
        if 'rowspan' in cell.attrs:
            span = _parse_span(cell['rowspan'])
            # Reset before copying so the copies do not span again.
            cell['rowspan'] = 1
            for below in rows[row_index + 1:row_index + span]:
                siblings = below.find_all(recursive=False)
                if len(siblings) < col_index:
                    # The row is too short to have this column position.
                    continue
                clone = copy.copy(cell)
                if col_index > 0:
                    siblings[col_index - 1].insert_after(clone)
                elif siblings:
                    siblings[0].insert_before(clone)
                else:
                    # Empty row: there is no sibling to anchor on.
                    below.append(clone)
        if 'colspan' in cell.attrs:
            span = _parse_span(cell['colspan'])
            cell['colspan'] = 1
            for _ in range(1, span):
                cell.insert_after(copy.copy(cell))


def _render_horizontal(table, rows, header_cells):
    """Replace a table with one <th> header row by "header:value;" text."""
    chunks = []
    # The first row is taken to be the header row, as in the original code.
    for row in rows[1:]:
        cells = row.find_all('td', recursive=False)
        for header, cell in zip(header_cells, cells):
            chunks.append(header.get_text() + ":" + cell.get_text() + ";")
        chunks.append(";")
    table.string = "".join(chunks)
    table.name = 'div'


def _render_vertical(table, header_rows):
    """Replace a table whose rows each start with a <th> by "headervalue;" text.

    ``header_rows`` is a list of ``(row, th_cells)`` pairs in document order.
    Output is grouped by column: for every <td> position of the first header
    row, each row contributes its own header text followed by that cell.
    """
    column_count = len(header_rows[0][0].find_all('td', recursive=False))
    labelled = [(ths[0].get_text(), row.find_all('td', recursive=False))
                for row, ths in header_rows]
    chunks = []
    for col_index in range(column_count):
        for label, cells in labelled:
            if col_index < len(cells):
                chunks.append(label + cells[col_index].get_text() + ";")
    table.string = "".join(chunks)
    table.name = 'p'


def _render_keyword_header_table(table):
    """Handle a table that has a header row made of plain <td> cells.

    The first row with more than two cells whose text contains more than two
    header keywords is promoted to a header row (its cells are renamed to
    <th>).  If at least one row follows it, the table is replaced by text.
    """
    rows = table.find_all('tr', recursive=False)
    header_cells = []
    header_index = None
    for index, row in enumerate(rows):
        cells = row.find_all('td', recursive=False)
        if len(cells) <= 2:
            continue
        joined = ";".join(cell.get_text() for cell in cells)
        if len(_HEADER_KEYWORD_RE.findall(joined)) > 2:
            for cell in cells:
                cell.name = 'th'
                header_cells.append(cell)
            header_index = index
            break
    # No header found, or the header is the last row: leave the table as is
    # (replacing it would throw the header text away).
    if header_index is None or header_index == len(rows) - 1:
        return

    sentences = []
    # NOTE(review): data rows start at row 1 even when the header was found
    # further down, so row 0 is dropped in that case -- kept as in the
    # original; confirm whether that is intended.
    for row in rows[1:]:
        cells = row.find_all('td', recursive=False)
        if len(cells) == len(header_cells):
            line = "".join(header.get_text() + ":" + cell.get_text() + ";"
                           for header, cell in zip(header_cells, cells))
            # Swap the trailing ";" for a full stop.
            sentences.append(line[:-1] + "。")
        else:
            # Cell count does not match the header: emit the bare cell texts.
            usable = cells[:len(header_cells)]
            sentences.append("".join(cell.get_text() for cell in usable) + "。")
    table.string = "".join(sentences)
    table.name = 'p'


def table2text(soup):
    '''
    Convert the tables of an announcement into plain text.

    Every <tbody> (or every <table> when the document has no <tbody>) is
    processed in place: rowspan/colspan cells are duplicated so that rows
    line up, then the element is replaced by a <div>/<p> holding the text.

    Args:
        soup: BeautifulSoup instance; it is modified in place.
    Returns:
        The same BeautifulSoup instance after processing.
    '''
    tables = soup.find_all('tbody')
    if not tables:
        tables = soup.find_all('table')

    for table in tables:
        rows = table.find_all('tr', recursive=False)
        header_count = 0
        header_cells = []
        # Ordered (row, th_cells) pairs.  A list is used rather than a set
        # because bs4 tags hash by their markup: identical rows would merge
        # and the iteration order would be arbitrary.
        header_rows = []
        for row_index, row in enumerate(rows):
            # NOTE(review): headers are collected before this row's own
            # colspan expansion, as in the original; a <th colspan=n> is
            # therefore counted once.
            row_headers = row.find_all('th', recursive=False)
            if row_headers:
                header_count += len(row_headers)
                header_cells.extend(row_headers)
                header_rows.append((row, row_headers))
            _expand_spans(rows, row_index)

        if header_count > 1:
            if len(header_rows) == 1:
                # Horizontal table: a single header row on top.
                _render_horizontal(table, rows, header_cells)
            else:
                # Vertical table: a header cell at the start of each row.
                _render_vertical(table, header_rows)
        else:
            _render_keyword_header_table(table)
    return soup


def _strip_marked_whitespace(text):
    """Remove whitespace inside #subs#...#sube# regions and drop the markers.

    Nesting (for example a link inside a table cell) is handled by tracking
    the depth, so no marker is left behind in the result.
    """
    pieces = []
    depth = 0
    for part in _MARK_RE.split(text):
        if part == _OPEN_MARK:
            depth += 1
        elif part == _CLOSE_MARK:
            depth = max(depth - 1, 0)
        elif depth > 0:
            pieces.append(re.sub(r"\s", "", part))
        else:
            pieces.append(part)
    return "".join(pieces)


def segment(soup):
    '''
    Flatten a document into a single line of punctuated text.

    A "。" is inserted after every <tr> and a "," after every <p>, <div> and
    <br>; whitespace inside <td> and <a> is removed.  ASCII colons, commas
    and semicolons next to a Chinese character are replaced by their
    full-width forms, and whitespace or commas that follow a punctuation
    mark are dropped.

    Args:
        soup: BeautifulSoup instance; marker strings are inserted into it.
    Returns:
        The processed text as a str.
    '''
    sentence_tags = ["tr"]
    comma_tags = ["p", "div", "br"]
    compact_tags = ["td", "a"]

    # Fall back to the whole document when there is no <body>, and walk a
    # snapshot because the loop inserts new nodes into the tree.
    root = soup.body if soup.body is not None else soup
    for node in list(root.descendants):
        tag_name = getattr(node, "name", None)
        if tag_name in sentence_tags:
            node.insert_after("。")
        if tag_name in comma_tags:
            node.insert_after(",")
        if tag_name in compact_tags:
            node.insert_before(_OPEN_MARK)
            node.insert_after(_CLOSE_MARK)

    text = str(soup.get_text())

    # Replace '"' with "“" (the original notes that the downstream deepdive
    # import fails otherwise) and drop line breaks.
    text = text.replace('"', "“").replace("\r", "").replace("\n", "")

    # ASCII -> full-width punctuation next to a Chinese character.
    text = _ASCII_COLON_RE.sub(_FULLWIDTH_COLON, text)
    text = _ASCII_COMMA_RE.sub(_FULLWIDTH_COMMA, text)
    text = _ASCII_SEMICOLON_RE.sub(_FULLWIDTH_SEMICOLON, text)

    # Remove all whitespace inside the marked elements.
    text = _strip_marked_whitespace(text)

    # After a punctuation mark, drop any following whitespace and commas.
    # One pass is enough: what follows each kept mark is neither of those.
    text = _PUNCT_RUN_RE.sub(r"\1", text)

    # Collapse consecutive "。" into one (also trims leading/trailing ones).
    text = "。".join(part for part in text.split("。") if len(part) > 0)
    return text


def main():
    """Export the processed text of the stored documents to articles.csv."""
    # NOTE(review): the default DSN embeds credentials in source; set
    # BXKC_ORACLE_DSN to override it without editing this file.
    dsn = os.environ.get("BXKC_ORACLE_DSN", _DEFAULT_DSN)
    db = oracle.connect(dsn)
    try:
        cursor = db.cursor()
        try:
            cursor.execute(_QUERY)
            rows = cursor.fetchall()
            # Converted while the connection is still open, in case the
            # column values are lazily-read objects.
            htmls = [segment(table2text(BeautifulSoup(row[0], "lxml")))
                     for row in rows]
        finally:
            cursor.close()
    finally:
        db.close()

    dataframe = pd.DataFrame({'content': htmls})
    columns = ['content']
    dataframe.to_csv("articles.csv", index=True, header=False, sep=",",
                     encoding="utf8", columns=columns)


if __name__ == "__main__":
    main()