# Source file: testArticle_processed2.py (8.6 KB)
  1. #coding:utf8
  2. from bs4 import BeautifulSoup, Comment
  3. import copy
  4. import re
  5. import psycopg2
  6. def table2text(soup):
  7. '''
  8. 把公告中的表格转化为纯文本
  9. Args:
  10. soup: beautifulsoup实例
  11. Returns:
  12. 处理过后的beautifulsoup实例
  13. '''
  14. tbodies = soup.find_all('tbody')
  15. if len(tbodies) == 0:
  16. tbodies = soup.find_all('table')
  17. # 遍历表格中的每个tbody
  18. for tbody in tbodies:
  19. # 处理colspan, rowspan信息补全问题
  20. trs = tbody.findChildren('tr', recursive=False)
  21. ths_len = 0
  22. ths = list()
  23. trs_set = set()
  24. # 遍历每一个tr
  25. for indtr, tr in enumerate(trs):
  26. ths_tmp = tr.findChildren('th', recursive=False)
  27. if len(ths_tmp) > 0:
  28. ths_len = ths_len + len(ths_tmp)
  29. for th in ths_tmp:
  30. ths.append(th)
  31. trs_set.add(tr)
  32. # 遍历每行中的element
  33. tds = tr.findChildren(recursive=False)
  34. if len(tds) > 1:
  35. for indtd, td in enumerate(tds):
  36. # 若有rowspan 则补全下一行同样位置
  37. if 'rowspan' in td.attrs:
  38. row = int(td['rowspan'])
  39. td['rowspan'] = 1
  40. for i in range(1, row, 1):
  41. # 获取下一行的所有td, 在对应的位置插入
  42. if indtr+i<len(trs):
  43. tds1 = trs[indtr + i].findChildren('td', recursive=False)
  44. if len(tds1)==0:
  45. tds1 = trs[indtr + i].findChildren('th', recursive=False)
  46. if len(tds1) >= (indtd) and len(tds1)>0:
  47. if indtd > 0:
  48. tds1[indtd - 1].insert_after(copy.copy(td))
  49. else:
  50. tds1[0].insert_before(copy.copy(td))
  51. # 若有colspan 则补全同一行下一个位置
  52. if 'colspan' in td.attrs:
  53. col = int(td['colspan'])
  54. td['colspan'] = 1
  55. for i in range(1, col, 1):
  56. td.insert_after(copy.copy(td))
  57. # 表格转化成文字
  58. if ths_len > 1: # 有表头的表格
  59. if len(trs_set) == 1: # 横状表格
  60. ps = ''
  61. trs_set = tbody.findChildren('tr', recursive=False)
  62. for i in range(1, len(trs_set), 1):
  63. tr = trs_set[i]
  64. tds = tr.findChildren('td', recursive=False)
  65. p = ''
  66. for ind, th in enumerate(ths):
  67. if ind < len(tds):
  68. p = p + th.get_text() + ":" + tds[ind].get_text() + ";"
  69. p = p + ";"
  70. ps = ps + p
  71. tbody.string = ps
  72. tbody.name = 'div'
  73. else: # 竖状表格
  74. ps = ''
  75. tds = list(trs_set)[0].findChildren('td', recursive=False)
  76. for ind, td in enumerate(tds):
  77. p = ''
  78. for i in range(0, len(trs_set), 1):
  79. tds_temp = list(trs_set)[i].findChildren('td', recursive=False)
  80. if ind < len(tds_temp):
  81. if ind < len(tds_temp):
  82. p = p + ths[i].get_text() + tds_temp[ind].get_text() + ";"
  83. ps = ps + p
  84. tbody.string = ps
  85. tbody.name = 'p'
  86. else: # 有表头但是非th标签的横状表格
  87. trs = tbody.findChildren('tr', recursive=False)
  88. if len(trs) > 0:
  89. ths = []
  90. for i in range(len(trs)):
  91. if len(ths)>0:
  92. #print(ths)
  93. ps = ''
  94. for i in range(1, len(trs), 1):
  95. tr = trs[i]
  96. tds = tr.findChildren('td', recursive=False)
  97. p = ''
  98. if len(tds)==len(ths):
  99. for ind, th in enumerate(ths):
  100. if (len(tds)-1) >= ind:
  101. p = p + th.get_text() + ":" + tds[ind].get_text() + ";"
  102. p = p[:-1] + "。"
  103. ps = ps + p
  104. else:
  105. for ind, th in enumerate(ths):
  106. if (len(tds)-1) >= ind:
  107. p = p + tds[ind].get_text()
  108. p = p + "。"
  109. ps = ps + p
  110. tbody.string = ps
  111. tbody.name = 'p'
  112. break
  113. tds0 = trs[i].findChildren('td', recursive=False)
  114. if len(tds0) > 2:
  115. tds_str = [td.get_text() for td in tds0]
  116. pat = re.compile('(名称|序号|项目|品目一|品目二|品目三|品目四|标段|产品|货物|单位|数量|价格|报价|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|第一名|第二名|第三名|科室|方式|时间|日期|面积){1}')
  117. #match_counts = re.subn(pat, '', ";".join(tds_str))[1]
  118. match_counts = len(set(re.findall(pat, ";".join(tds_str))))
  119. if match_counts > 2:
  120. #print(set(re.findall(pat, ";".join(tds_str))))
  121. #print(";".join(tds_str))
  122. for td in trs[i].findChildren('td', recursive=False):
  123. td.name = 'th'
  124. ths.append(td)
  125. return soup
  126. def segment(soup):
  127. #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
  128. segList = ["tr"]
  129. commaList = ["p","div","br"]
  130. subspaceList = ["td",'a']
  131. tbodies = soup.find_all('tbody')
  132. if len(tbodies) == 0:
  133. tbodies = soup.find_all('table')
  134. # 递归遍历所有节点,插入符号
  135. for child in soup.body.descendants:
  136. if child.name in segList:
  137. child.insert_after("。")
  138. if child.name in commaList:
  139. child.insert_after(",")
  140. if child.name in subspaceList:
  141. child.insert_before("#subs#")
  142. child.insert_after("#sube#")
  143. text = str(soup.get_text())
  144. #替换"""为"“",否则导入deepdive出错
  145. text = text.replace('"',"“").replace("\r","").replace("\n","")
  146. #替换英文冒号为中文冒号
  147. text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
  148. #替换为中文逗号
  149. text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
  150. #替换为中文分号
  151. text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
  152. #删除标签中的所有空格
  153. allMatch = re.findall(re.compile("#subs#(.*?)#sube#"),text)
  154. for item in allMatch:
  155. text = text.replace("#subs#"+item+"#sube#",re.sub("\s","",item))
  156. #替换标点
  157. while(True):
  158. #替换连续的标点
  159. punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
  160. if punc is not None:
  161. text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
  162. else:
  163. #替换标点之后的空格
  164. punc = re.search("(?P<punc>:|。|,|;)\s+",text)
  165. if punc is not None:
  166. #print(punc.group("punc"))
  167. text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
  168. else:
  169. break
  170. #将连续的中文句号替换为一个
  171. text_split = text.split("。")
  172. text_split = [x for x in text_split if len(x)>0]
  173. text = "。".join(text_split)
  174. return text
  175. '''
  176. conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
  177. cursor = conn.cursor()
  178. cursor.execute(" select * from articles ")
  179. rows = cursor.fetchall()
  180. for row in rows:
  181. print(row[1])
  182. segment(table2text(BeautifulSoup(row[1],"lxml")))
  183. '''
  184. with open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8") as f:
  185. html = f.read()
  186. soup = BeautifulSoup(html,"lxml")
  187. with open("C:\\Users\\User\\Desktop\\b.html","w",encoding="utf8") as f:
  188. f.write(segment(table2text(soup)))
  189. f.flush()