# generate_articles.py
  1. #coding:utf8
  2. from bs4 import BeautifulSoup, Comment
  3. import copy
  4. import re
  5. import os
  6. os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
  7. import cx_Oracle as oracle
  8. import pandas as pd
  9. def table2text(soup):
  10. '''
  11. 把公告中的表格转化为纯文本
  12. Args:
  13. soup: beautifulsoup实例
  14. Returns:
  15. 处理过后的beautifulsoup实例
  16. '''
  17. tbodies = soup.find_all('tbody')
  18. if len(tbodies) == 0:
  19. tbodies = soup.find_all('table')
  20. # 遍历表格中的每个tbody
  21. for tbody in tbodies:
  22. # 处理colspan, rowspan信息补全问题
  23. trs = tbody.findChildren('tr', recursive=False)
  24. ths_len = 0
  25. ths = list()
  26. trs_set = set()
  27. # 遍历每一个tr
  28. for indtr, tr in enumerate(trs):
  29. ths_tmp = tr.findChildren('th', recursive=False)
  30. if len(ths_tmp) > 0:
  31. ths_len = ths_len + len(ths_tmp)
  32. for th in ths_tmp:
  33. ths.append(th)
  34. trs_set.add(tr)
  35. # 遍历每行中的element
  36. tds = tr.findChildren(recursive=False)
  37. if len(tds) > 1:
  38. for indtd, td in enumerate(tds):
  39. # 若有rowspan 则补全下一行同样位置
  40. if 'rowspan' in td.attrs:
  41. row = int(td['rowspan'])
  42. td['rowspan'] = 1
  43. for i in range(1, row, 1):
  44. # 获取下一行的所有td, 在对应的位置插入
  45. if (indtr+i)<len(trs):
  46. tds1 = trs[indtr + i].findChildren('td', recursive=False)
  47. if len(tds1) >= (indtd):
  48. if indtd > 0:
  49. tds1[indtd - 1].insert_after(copy.copy(td))
  50. else:
  51. tds1[0].insert_before(copy.copy(td))
  52. # 若有colspan 则补全同一行下一个位置
  53. if 'colspan' in td.attrs:
  54. col = int(td['colspan'])
  55. td['colspan'] = 1
  56. for i in range(1, col, 1):
  57. td.insert_after(copy.copy(td))
  58. # 表格转化成文字
  59. if ths_len > 1: # 有表头的表格
  60. if len(trs_set) == 1: # 横状表格
  61. ps = ''
  62. trs_set = tbody.findChildren('tr', recursive=False)
  63. for i in range(1, len(trs_set), 1):
  64. tr = trs_set[i]
  65. tds = tr.findChildren('td', recursive=False)
  66. p = ''
  67. for ind, th in enumerate(ths):
  68. if ind < len(tds):
  69. p = p + th.get_text() + ":" + tds[ind].get_text() + ";"
  70. p = p + ";"
  71. ps = ps + p
  72. tbody.string = ps
  73. tbody.name = 'div'
  74. else: # 竖状表格
  75. ps = ''
  76. tds = list(trs_set)[0].findChildren('td', recursive=False)
  77. for ind, td in enumerate(tds):
  78. p = ''
  79. for i in range(0, len(trs_set), 1):
  80. tds_temp = list(trs_set)[i].findChildren('td', recursive=False)
  81. if ind < len(tds_temp):
  82. if ind < len(tds_temp):
  83. p = p + ths[i].get_text() + tds_temp[ind].get_text() + ";"
  84. ps = ps + p
  85. tbody.string = ps
  86. tbody.name = 'p'
  87. else: # 有表头但是非th标签的横状表格
  88. trs = tbody.findChildren('tr', recursive=False)
  89. if len(trs) > 0:
  90. ths = []
  91. for i in range(len(trs)):
  92. if len(ths)>0:
  93. ps = ''
  94. for i in range(1, len(trs), 1):
  95. tr = trs[i]
  96. tds = tr.findChildren('td', recursive=False)
  97. p = ''
  98. if len(tds)==len(ths):
  99. for ind, th in enumerate(ths):
  100. if (len(tds)-1) >= ind:
  101. p = p + th.get_text() + ":" + tds[ind].get_text() + ";"
  102. p = p[:-1] + "。"
  103. ps = ps + p
  104. else:
  105. for ind, th in enumerate(ths):
  106. if (len(tds)-1) >= ind:
  107. p = p + tds[ind].get_text()
  108. p = p + "。"
  109. ps = ps + p
  110. tbody.string = ps
  111. tbody.name = 'p'
  112. break
  113. tds0 = trs[i].findChildren('td', recursive=False)
  114. if len(tds0) > 2:
  115. tds_str = [td.get_text() for td in tds0]
  116. pat = re.compile('(序号|项目|产品|货物|单位|数量|价格|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|科室){1}')
  117. match_counts = re.subn(pat, '', ";".join(tds_str))[1]
  118. if match_counts > 2:
  119. for td in trs[i].findChildren('td', recursive=False):
  120. td.name = 'th'
  121. ths.append(td)
  122. return soup
  123. def segment(soup):
  124. #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
  125. segList = ["tr"]
  126. commaList = ["p","div","br"]
  127. subspaceList = ["td",'a']
  128. tbodies = soup.find_all('tbody')
  129. if len(tbodies) == 0:
  130. tbodies = soup.find_all('table')
  131. # 递归遍历所有节点,插入符号
  132. for child in soup.body.descendants:
  133. if child.name in segList:
  134. child.insert_after("。")
  135. if child.name in commaList:
  136. child.insert_after(",")
  137. if child.name in subspaceList:
  138. child.insert_before("#subs#")
  139. child.insert_after("#sube#")
  140. text = str(soup.get_text())
  141. #替换"""为"“",否则导入deepdive出错
  142. text = text.replace('"',"“").replace("\r","").replace("\n","")
  143. #替换英文冒号为中文冒号
  144. text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
  145. #替换为中文逗号
  146. text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
  147. #替换为中文分号
  148. text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
  149. #删除标签中的所有空格
  150. allMatch = re.findall(re.compile("#subs#(.*?)#sube#"),text)
  151. for item in allMatch:
  152. text = text.replace("#subs#"+item+"#sube#",re.sub("\s","",item))
  153. #替换标点
  154. while(True):
  155. #替换连续的标点
  156. punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
  157. if punc is not None:
  158. text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
  159. else:
  160. #替换标点之后的空格
  161. punc = re.search("(?P<punc>:|。|,|;)\s+",text)
  162. if punc is not None:
  163. #print(punc.group("punc"))
  164. text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
  165. else:
  166. break
  167. #将连续的中文句号替换为一个
  168. text_split = text.split("。")
  169. text_split = [x for x in text_split if len(x)>0]
  170. text = "。".join(text_split)
  171. return text
  172. if __name__=="__main__":
  173. # connect oracle database
  174. db = oracle.connect('bxkc/bxkc@192.168.2.54:1521/orcl')
  175. # create cursor
  176. cursor = db.cursor()
  177. # execute sql
  178. cursor.execute("select dochtmlcon from sys_document where docchannel='101' and dochtmlcon is not NULL and rownum<10000")
  179. rows = cursor.fetchall()
  180. htmls = []
  181. for row in rows:
  182. content = row[0]
  183. #print("===")
  184. #print(content)
  185. htmls.append(segment(table2text(BeautifulSoup(content,"lxml"))))
  186. dataframe = pd.DataFrame({'content':htmls})
  187. columns = ['content']
  188. dataframe.to_csv("articles.csv",index=True,header=False,sep=",",encoding="utf8",columns=columns)