# coding: utf8
import copy
import os
import re
from contextlib import closing

# Exported ahead of the cx_Oracle import (same ordering as before) so the
# Oracle client library sees the UTF-8 client character set.
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'

import cx_Oracle as oracle
import pandas as pd
from bs4 import BeautifulSoup, Comment
# Keywords used to recognise a header row that is built from plain <td> cells.
_HEADER_KEYWORD_RE = re.compile('(序号|项目|产品|货物|单位|数量|价格|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|科室){1}')


def _span_count(cell, attr):
    """Return ``cell[attr]`` as an int, or 1 when the value is not a number."""
    try:
        return int(cell[attr])
    except (TypeError, ValueError):
        # Malformed markup such as rowspan="" or colspan="2px": treat the
        # cell as not spanning instead of aborting the whole document.
        return 1


def _expand_spans(rows):
    """Expand rowspan/colspan cells in place and collect the header cells.

    Args:
        rows: the direct ``<tr>`` children of one table body.
    Returns:
        ``(header_cells, header_rows)``: every direct ``<th>`` cell in
        document order, and the rows that contain at least one of them
        (also in document order).
    """
    header_cells = []
    header_rows = []
    for row_index, row in enumerate(rows):
        row_headers = row.findChildren('th', recursive=False)
        if row_headers:
            header_cells.extend(row_headers)
            header_rows.append(row)

        cells = row.findChildren(recursive=False)
        if len(cells) <= 1:
            continue
        for cell_index, cell in enumerate(cells):
            # rowspan: copy the cell into the same position of the rows below.
            if 'rowspan' in cell.attrs:
                span = _span_count(cell, 'rowspan')
                cell['rowspan'] = 1
                for offset in range(1, span):
                    target_index = row_index + offset
                    if target_index >= len(rows):
                        break
                    target_row = rows[target_index]
                    target_cells = target_row.findChildren('td', recursive=False)
                    if cell_index > len(target_cells):
                        # The row below is too short to hold this position.
                        continue
                    clone = copy.copy(cell)
                    if cell_index > 0:
                        target_cells[cell_index - 1].insert_after(clone)
                    elif target_cells:
                        target_cells[0].insert_before(clone)
                    else:
                        # The row below has no <td> at all; indexing [0]
                        # would raise IndexError, so put the copy first.
                        target_row.insert(0, clone)
            # colspan: repeat the cell right after itself in the same row.
            if 'colspan' in cell.attrs:
                span = _span_count(cell, 'colspan')
                cell['colspan'] = 1
                for _ in range(1, span):
                    cell.insert_after(copy.copy(cell))
    return header_cells, header_rows


def _horizontal_text(tbody, header_row, header_cells):
    """Render a table with a single ``<th>`` row as ``header:value;`` text."""
    parts = []
    for row in tbody.findChildren('tr', recursive=False):
        if row is header_row:
            continue
        cells = row.findChildren('td', recursive=False)
        # zip() stops at the shorter sequence, so surplus headers or cells
        # are ignored.
        pairs = [head.get_text() + ":" + cell.get_text() + ";"
                 for head, cell in zip(header_cells, cells)]
        parts.append("".join(pairs) + ";")
    return "".join(parts)


def _vertical_text(header_rows):
    """Render a table whose rows each start with a ``<th>`` label.

    The text is produced column by column: for every ``<td>`` position of
    the first header row, each row contributes ``label + value + ";"``.
    """
    labelled = []
    for row in header_rows:
        label = row.findChildren('th', recursive=False)[0].get_text()
        labelled.append((label, row.findChildren('td', recursive=False)))

    parts = []
    for column in range(len(labelled[0][1])):
        for label, cells in labelled:
            if column < len(cells):
                parts.append(label + cells[column].get_text() + ";")
    return "".join(parts)


def _plain_rows_text(rows, header_row, header_cells):
    """Render the rows around a keyword-detected header row as sentences."""
    sentences = []
    for row in rows:
        if row is header_row:
            continue
        cells = row.findChildren('td', recursive=False)
        if len(cells) == len(header_cells):
            pairs = [head.get_text() + ":" + cell.get_text()
                     for head, cell in zip(header_cells, cells)]
            sentences.append(";".join(pairs) + "。")
        else:
            # Cell count does not match the header: emit the raw cell text.
            texts = [cell.get_text() for cell in cells[:len(header_cells)]]
            sentences.append("".join(texts) + "。")
    return "".join(sentences)


def _convert_plain_table(tbody):
    """Convert a table without ``<th>`` cells whose header is found by keywords.

    A row counts as the header when it has more than two ``<td>`` cells and
    their joined text contains more than two header keywords. Its cells are
    renamed to ``<th>``. The table is converted only if at least one row
    follows the header; otherwise the table body is left in place.
    """
    rows = tbody.findChildren('tr', recursive=False)
    header_cells = []
    header_row = None
    for row in rows:
        if header_cells:
            tbody.string = _plain_rows_text(rows, header_row, header_cells)
            tbody.name = 'p'
            break
        cells = row.findChildren('td', recursive=False)
        if len(cells) > 2:
            joined = ";".join(cell.get_text() for cell in cells)
            if len(_HEADER_KEYWORD_RE.findall(joined)) > 2:
                for cell in cells:
                    cell.name = 'th'
                header_cells = cells
                header_row = row


def table2text(soup):
    """Convert the tables of an announcement into plain text.

    Every ``<tbody>`` (or every ``<table>`` when the document has no
    ``<tbody>``) is processed in place: rowspan/colspan cells are expanded,
    then the table body is replaced by a text rendering and renamed to a
    ``<div>`` or ``<p>`` element.

    Args:
        soup: BeautifulSoup instance of the announcement.
    Returns:
        The same BeautifulSoup instance after processing.
    """
    tbodies = soup.find_all('tbody')
    if not tbodies:
        tbodies = soup.find_all('table')

    for tbody in tbodies:
        rows = tbody.findChildren('tr', recursive=False)
        # Header rows are kept in an ordered list: a set has no defined
        # order, which would pair labels with the wrong rows.
        header_cells, header_rows = _expand_spans(rows)

        if len(header_cells) > 1:
            if len(header_rows) == 1:
                # Horizontal table: one header row, data rows around it.
                tbody.string = _horizontal_text(tbody, header_rows[0], header_cells)
                tbody.name = 'div'
            else:
                # Vertical table: every header row carries its own label.
                tbody.string = _vertical_text(header_rows)
                tbody.name = 'p'
        else:
            # Header written with <td> cells (or no header at all).
            _convert_plain_table(tbody)
    return soup
# Tags after which a sentence end / a comma is inserted, and tags whose text
# is wrapped in markers so that its inner whitespace can be removed.
_SENTENCE_TAGS = ("tr",)
_COMMA_TAGS = ("p", "div", "br")
_COMPACT_TAGS = ("td", "a")

# Half-width ":", "," and ";" next to a CJK character become full-width.
_TO_FULLWIDTH = (
    (re.compile("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])"), ":"),
    (re.compile("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])"), ","),
    (re.compile("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])"), ";"),
)
# A marker pair that contains no other marker, i.e. the innermost pair.
_INNERMOST_SPAN_RE = re.compile(r"#subs#((?:(?!#subs#|#sube#).)*)#sube#")
_STRAY_MARKER_RE = re.compile(r"#sub[se]#")
_WHITESPACE_RE = re.compile(r"\s")
# A punctuation mark followed by any run of whitespace and commas.
_PUNCT_TRAILER_RE = re.compile(r"([::。,,;;])[\s,,]+")


def _strip_span_whitespace(match):
    """Return the text between a marker pair with all whitespace removed."""
    return _WHITESPACE_RE.sub("", match.group(1))


def segment(soup):
    """Flatten a parsed announcement into one punctuated line of text.

    The tree is modified in place: "。" is inserted after every ``<tr>``,
    "," after every ``<p>``/``<div>``/``<br>``, and the text of ``<td>``
    and ``<a>`` elements is wrapped in temporary markers. The extracted
    text is then normalised:

    * double quotes become "“" and line breaks are removed;
    * half-width colon/comma/semicolon adjacent to a CJK character become
      their full-width counterparts;
    * whitespace inside ``<td>``/``<a>`` text is removed, including nested
      elements, and no marker is left behind;
    * commas and whitespace directly after a punctuation mark are removed;
    * runs of "。" collapse to one, and leading/trailing "。" are dropped.

    Args:
        soup: BeautifulSoup instance, normally the result of table2text().
    Returns:
        The normalised text as a str.
    """
    # Parsers such as html.parser do not synthesise <body>; fall back to
    # the whole document instead of failing on None.
    root = soup.body if soup.body is not None else soup

    # Snapshot the nodes first: the loop inserts new strings into the tree
    # it is walking.
    for node in list(root.descendants):
        tag_name = getattr(node, "name", None)
        if tag_name in _SENTENCE_TAGS:
            node.insert_after("。")
        if tag_name in _COMMA_TAGS:
            node.insert_after(",")
        if tag_name in _COMPACT_TAGS:
            node.insert_before("#subs#")
            node.insert_after("#sube#")

    text = str(soup.get_text())

    # A plain double quote breaks the downstream deepdive import.
    text = text.replace('"', "“").replace("\r", "").replace("\n", "")

    for pattern, fullwidth in _TO_FULLWIDTH:
        text = pattern.sub(fullwidth, text)

    # Resolve marker pairs from the inside out, so that an <a> nested in a
    # <td> neither leaks markers nor leaves the outer text untouched.
    replaced = 1
    while replaced:
        text, replaced = _INNERMOST_SPAN_RE.subn(_strip_span_whitespace, text)
    text = _STRAY_MARKER_RE.sub("", text)

    # Drop commas and whitespace that directly follow a punctuation mark.
    text = _PUNCT_TRAILER_RE.sub(r"\1", text)

    # Collapse consecutive "。" into a single one.
    return "。".join(piece for piece in text.split("。") if piece)
if __name__ == "__main__":
    # Export the text of the announcements in channel '101' to articles.csv.
    # NOTE(review): the connection string embeds the user name and password;
    # consider reading it from the environment or a config file.
    # closing() releases the cursor and the connection even when a document
    # fails to parse.
    with closing(oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')) as db, \
            closing(db.cursor()) as cursor:
        cursor.execute("select dochtmlcon from sys_document where docchannel='101' and dochtmlcon is not NULL and rownum<10000")

        # Iterate the cursor instead of fetchall() so that the rows are not
        # all held in memory before processing starts.
        htmls = [segment(table2text(BeautifulSoup(row[0], "lxml")))
                 for row in cursor]

    dataframe = pd.DataFrame({'content': htmls})
    columns = ['content']
    # One line per document: row index followed by the text, no header row.
    dataframe.to_csv("articles.csv", index=True, header=False, sep=",", encoding="utf8", columns=columns)
|