# testArticle_processed.py
#coding:utf8
from bs4 import BeautifulSoup, Comment, ResultSet
import copy
import re
import psycopg2
import codecs


def table2text(soup):
    '''
    Convert the tables in an announcement into plain text.
    Args:
        soup: a BeautifulSoup instance
    Returns:
        the processed BeautifulSoup instance
    '''
    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    # iterate over every tbody of the tables
    for tbody in tbodies:
        # fill in the cells implied by colspan/rowspan attributes
        trs = tbody.findChildren('tr', recursive=False)
        ths_len = 0
        ths = list()
        trs_set = set()
        # iterate over each tr
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # iterate over the elements of each row
            tds = tr.findChildren(recursive=False)
            if len(tds) > 1:
                for indtd, td in enumerate(tds):
                    # if rowspan is set, copy the cell into the same position of the following rows
                    if 'rowspan' in td.attrs:
                        #print(td)
                        row = int(re.sub("[^0-9]","",str(td['rowspan'])))
                        td['rowspan'] = 1
                        for i in range(1, row, 1):
                            # get all tds of the next row and insert the cell at the corresponding position
                            if indtr+i<len(trs):
                                tds1 = trs[indtr + i].findChildren('td', recursive=False)
                                if len(tds1)==0:
                                    tds1 = trs[indtr + i].findChildren('th', recursive=False)
                                if len(tds1) >= (indtd) and len(tds1)>0:
                                    if indtd > 0:
                                        tds1[indtd - 1].insert_after(copy.copy(td))
                                    else:
                                        tds1[0].insert_before(copy.copy(td))
                    # if colspan is set, copy the cell into the next positions of the same row
                    if 'colspan' in td.attrs:
                        if str(td['colspan'])!="":
                            #print(re.sub("[^0-9]","",td['colspan']))
                            col = int(re.sub("[^0-9]","",str(td['colspan'])))
                            td['colspan'] = 1
                            for i in range(1, col, 1):
                                td.insert_after(copy.copy(td))
        # convert the table into text
        if ths_len > 1: # table with th headers
            if len(trs_set) == 1: # horizontal table (single header row on top)
                ps = ''
                trs_set = tbody.findChildren('tr', recursive=False)
                for i in range(1, len(trs_set), 1):
                    tr = trs_set[i]
                    tds = tr.findChildren(['td','th'], recursive=False)
                    p = ''
                    for ind, th in enumerate(ths):
                        if ind < len(tds):
                            p = p + th.get_text() + ":" + tds[ind].get_text() + ";"
                    p = p + ";"
                    ps = ps + p
                tbody.string = ps
                tbody.name = 'div'
            else: # vertical table (header column on the left)
                ps = ''
                tds = list(trs_set)[0].findChildren('td', recursive=False)
                for ind, td in enumerate(tds):
                    p = ''
                    for i in range(0, len(trs_set), 1):
                        tds_temp = list(trs_set)[i].findChildren('td', recursive=False)
                        if ind < len(tds_temp):
                            p = p + ths[i].get_text() + tds_temp[ind].get_text() + ";"
                    ps = ps + p
                tbody.string = ps
                tbody.name = 'p'
        else: # horizontal table whose header row is not made of th tags
            trs = tbody.findChildren('tr', recursive=False)
            if len(trs) > 0:
                tds0 = trs[0].findChildren('td', recursive=False)
                if len(tds0) > 2:
                    tds_str = [re.sub('\s','',td.get_text()) for td in tds0]
                    pat = re.compile('(名称|序号|项目|品目[一二三四]|包号|标段|产品|货物|单位|数量|价格|报价|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|第一名|第二名|第三名|科室|方式|时间|日期|面积){1}')
                    #match_counts = re.subn(pat, '', ";".join(tds_str))[1]
                    match_counts = len(set(re.findall(pat, ";".join(tds_str))))
                    if match_counts > 2:
                        ths = []
                        for td in trs[0].findChildren('td', recursive=False):
                            td.name = 'th'
                            ths.append(td)
                        ps = ''
                        trs = tbody.findChildren('tr', recursive=False)
                        for i in range(1, len(trs), 1):
                            tr = trs[i]
                            tds = tr.findChildren('td', recursive=False)
                            p = ''
                            for ind, th in enumerate(ths):
                                if (len(tds)-1) >= ind:
                                    p = p + th.get_text() + ":" + tds[ind].get_text() + ";"
                            p = p + ";"
                            ps = ps + p
                        tbody.string = ps
                        tbody.name = 'p'
    return soup
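
# A minimal, commented-out usage sketch for table2text; the sample HTML below is assumed
# for illustration. The table is rewritten in place, so get_text() afterwards yields the
# flattened "表头:值;" pairs.
#demo_html = "<table><tbody><tr><th>名称</th><th>金额</th></tr><tr><td>项目A</td><td>100万</td></tr></tbody></table>"
#print(table2text(BeautifulSoup(demo_html,"lxml")).get_text())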


def tableToText(soup):

    def fixSpan(tbody):
        # fill in the cells implied by colspan/rowspan attributes
        trs = tbody.findChildren('tr', recursive=False)
        ths_len = 0
        ths = list()
        trs_set = set()
        # fill colspans first and rowspans second, otherwise the table may be parsed in a scrambled order
        # iterate over each tr
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # do not expand rows that contain a nested table
            if len(tr.findChildren('table'))>0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # iterate over the elements of each row
            tds = tr.findChildren(recursive=False)
            for indtd, td in enumerate(tds):
                # if colspan is set, copy the cell into the next positions of the same row
                if 'colspan' in td.attrs:
                    if str(td['colspan'])!="":
                        col = int(re.sub("[^0-9]","",str(td['colspan'])))
                        td['colspan'] = 1
                        for i in range(1, col, 1):
                            td.insert_after(copy.copy(td))
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # do not expand rows that contain a nested table
            if len(tr.findChildren('table'))>0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # iterate over the elements of each row
            tds = tr.findChildren(recursive=False)
            for indtd, td in enumerate(tds):
                # if rowspan is set, copy the cell into the same position of the following rows
                if 'rowspan' in td.attrs:
                    if str(td['rowspan'])!="":
                        row = int(re.sub("[^0-9]","",str(td['rowspan'])))
                        td['rowspan'] = 1
                        for i in range(1, row, 1):
                            # get all tds of the next row and insert the cell at the corresponding position
                            if indtr+i<len(trs):
                                tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
                                if len(tds1) >= (indtd) and len(tds1)>0:
                                    if indtd > 0:
                                        tds1[indtd - 1].insert_after(copy.copy(td))
                                    else:
                                        tds1[0].insert_before(copy.copy(td))
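
    # getTable flattens a tbody into a 2D list; every cell is stored as [text, flag],
    # where the flag is later set by setHead to 1 (row header) or 2 (column header),
    # or left at 0 (value cell).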
    def getTable(tbody):
        trs = tbody.findChildren('tr', recursive=False)
        inner_table = []
        for tr in trs:
            tr_line = []
            tds = tr.findChildren(['td','th'], recursive=False)
            for td in tds:
                tr_line.append([re.sub('\s*','',td.get_text()),0])
            inner_table.append(tr_line)
        return inner_table

    # pad short rows so that the table becomes rectangular
    def fixTable(inner_table):
        maxWidth = 0
        for item in inner_table:
            if len(item)>maxWidth:
                maxWidth = len(item)
        for i in range(len(inner_table)):
            if len(inner_table[i])<maxWidth:
                for j in range(maxWidth-len(inner_table[i])):
                    inner_table[i].append(["",0])
        return inner_table
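
    # e.g. fixTable pads [[['甲', 0], ['乙', 0]], [['丙', 0]]] to
    #      [[['甲', 0], ['乙', 0]], [['丙', 0], ['', 0]]]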

    # mark the header cells
    def setHead(inner_table,pattern,pat_value,count):
        height = len(inner_table)
        width = len(inner_table[0])
        head_list = []
        head_list.append(0)
        # row headers
        for i in range(height):
            set_match = set()
            is_head = False
            for j in range(width):
                if re.search(pat_value,inner_table[i][j][0]) is not None:
                    is_head = False
                    break
                str_find = re.findall(pattern,inner_table[i][j][0])
                if len(str_find)>0:
                    set_match.add(inner_table[i][j][0])
            if len(set_match)>=count:
                is_head = True
            if is_head:
                head_list.append(i)
                for j in range(width):
                    inner_table[i][j][1] = 1
        head_list.append(height)
        # column headers
        for i in range(len(head_list)-1):
            head_begin = head_list[i]
            head_end = head_list[i+1]
            # the last column is never marked as a column header
            for i in range(width-1):
                set_match = set()
                is_head = False
                for j in range(head_begin,head_end):
                    if re.search(pat_value,inner_table[j][i][0]) is not None:
                        is_head = False
                        break
                    str_find = re.findall(pattern,inner_table[j][i][0])
                    if len(str_find)>0:
                        set_match.add(inner_table[j][i][0])
                if len(set_match)>=count:
                    is_head = True
                if is_head:
                    for j in range(head_begin,head_end):
                        inner_table[j][i][1] = 2
        return inner_table,head_list
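
    # head_list holds 0, the indices of the detected header rows and finally the row
    # count, so each pair of consecutive entries delimits one block that getTableText
    # processes on its own.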

    def getDirect(inner_table,begin,end):
        column_head = set()
        row_head = set()
        widths = len(inner_table[0])
        for height in range(begin,end):
            for width in range(widths):
                if inner_table[height][width][1] ==1:
                    row_head.add(height)
                if inner_table[height][width][1] ==2:
                    column_head.add(width)
        company_pattern = re.compile("公司")
        if 0 in column_head and begin not in row_head:
            return "column"
        if 0 in column_head and begin in row_head:
            for height in range(begin,end):
                count = 0
                count_flag = True
                for width in range(widths):
                    if inner_table[height][width][1]==0:
                        if re.search(company_pattern,inner_table[height][width][0]) is not None:
                            count += 1
                        else:
                            count_flag = False
                if count_flag and count>=2:
                    return "column"
        return "row"
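
    # getDirect decides whether a block should be read row by row ("row") or column by
    # column ("column"), based on whether the first column is a column header, whether
    # the block's first row is a row header, and how many value cells in a row mention
    # "公司".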

    def getTableText(inner_table,head_list):
        rankPattern = "(排名|排序|名次|评标结果)"
        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
        height = len(inner_table)
        width = len(inner_table[0])
        text = ""
        for head_i in range(len(head_list)-1):
            text_set = set()
            head_begin = head_list[head_i]
            head_end = head_list[head_i+1]
            direct = getDirect(inner_table, head_begin, head_end)
            if head_begin==head_end:
                continue
            print(direct)
            if direct=="row":
                for i in range(head_begin,head_end):
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    for j in range(width):
                        cell = inner_table[i][j]
                        # this cell holds a value
                        if cell[1]==0:
                            find_flag = False
                            head = ""
                            temp_head = ""
                            # walk left to collect the column headers of this cell
                            for loop_j in range(1,j+1):
                                if inner_table[i][j-loop_j][1]==2:
                                    if find_flag:
                                        if inner_table[i][j-loop_j][0]!=temp_head:
                                            head = inner_table[i][j-loop_j][0]+":"+head
                                    else:
                                        head = inner_table[i][j-loop_j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i][j-loop_j][0]
                                else:
                                    if find_flag:
                                        break
                            find_flag = False
                            temp_head = ""
                            # walk up to collect the row headers of this cell
                            for loop_i in range(1,i+1):
                                if inner_table[i-loop_i][j][1]==1:
                                    if find_flag:
                                        if inner_table[i-loop_i][j][0]!=temp_head:
                                            head = inner_table[i-loop_i][j][0]+":"+head
                                    else:
                                        head = inner_table[i-loop_i][j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i-loop_i][j][0]
                                else:
                                    if find_flag:
                                        break
                            if str(head+inner_table[i][j][0]) in text_set:
                                continue
                            if re.search(rankPattern,head) is not None:
                                rank_text += head+inner_table[i][j][0]+","
                                #print(rank_text)
                            elif re.search(entityPattern,head) is not None:
                                entity_text += head+inner_table[i][j][0]+","
                                #print(entity_text)
                            else:
                                text_line += head+inner_table[i][j][0]+","
                            text_set.add(str(head+inner_table[i][j][0]))
                    text += rank_text+entity_text+text_line
            else:
                for j in range(width):
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    for i in range(head_begin,head_end):
                        cell = inner_table[i][j]
                        # this cell holds a value
                        if cell[1]==0:
                            find_flag = False
                            head = ""
                            temp_head = ""
                            # walk left to collect the column headers of this cell
                            for loop_j in range(1,j+1):
                                if inner_table[i][j-loop_j][1]==2:
                                    if find_flag:
                                        if inner_table[i][j-loop_j][0]!=temp_head:
                                            head = inner_table[i][j-loop_j][0]+":"+head
                                    else:
                                        head = inner_table[i][j-loop_j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i][j-loop_j][0]
                                else:
                                    if find_flag:
                                        break
                            find_flag = False
                            temp_head = ""
                            # walk up to collect the row headers of this cell
                            for loop_i in range(1,i+1):
                                if inner_table[i-loop_i][j][1]==1:
                                    if find_flag:
                                        if inner_table[i-loop_i][j][0]!=temp_head:
                                            head = inner_table[i-loop_i][j][0]+":"+head
                                    else:
                                        head = inner_table[i-loop_i][j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i-loop_i][j][0]
                                else:
                                    if find_flag:
                                        break
                            if str(head+inner_table[i][j][0]) in text_set:
                                continue
                            if re.search(rankPattern,head) is not None:
                                rank_text += head+inner_table[i][j][0]+","
                                #print(rank_text)
                            elif re.search(entityPattern,head) is not None:
                                entity_text += head+inner_table[i][j][0]+","
                                #print(entity_text)
                            else:
                                text_line += head+inner_table[i][j][0]+","
                            text_set.add(str(head+inner_table[i][j][0]))
                    text += rank_text+entity_text+text_line
        text = text[:-1]+"。"
        return text
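
    # For every value cell, getTableText prefixes the nearest row/column headers as
    # "表头:值," pairs; cells whose header matches rankPattern or entityPattern are
    # emitted first in each line, and the whole table text ends with "。".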

    pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
    pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    # iterate over every tbody of the tables
    # process nested tables in reverse order (innermost first)
    for tbody_index in range(1,len(tbodies)+1):
        tbody = tbodies[len(tbodies)-tbody_index]
        fixSpan(tbody)
        inner_table = getTable(tbody)
        inner_table = fixTable(inner_table)
        if len(inner_table)>0:
            inner_table,head_list = setHead(inner_table,pat_head,pat_value,3)
            print(head_list)
            for item in inner_table:
                print(item)
            print()
            tbody.string = getTableText(inner_table,head_list)
            #print(tbody.string)
            tbody.name = "table"
    return soup
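
# A minimal, commented-out sketch of calling tableToText on an HTML fragment; the sample
# markup below is assumed for illustration only.
#demo_html = "<table><tr><td>序号</td><td>供应商名称</td><td>报价</td></tr><tr><td>1</td><td>某某有限公司</td><td>100万元</td></tr></table>"
#demo_soup = BeautifulSoup(demo_html,"lxml")
#print(tableToText(demo_soup).get_text())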


def segment(soup):
    #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
    segList = ["tr"]
    commaList = ["p","div","br","td","span"]
    spaceList = ["span"]
    subspaceList = ["td",'a',"span"]
    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    # walk every node of the document and insert separator marks
    for child in soup.body.descendants:
        if child.name in segList:
            child.insert_after("。")
        if child.name in commaList:
            child.insert_after(",")
        if child.name in subspaceList:
            child.insert_before("#subs"+str(child.name)+"#")
            child.insert_after("#sube"+str(child.name)+"#")
        if child.name in spaceList:
            child.insert_after(" ")
    text = str(soup.get_text())
    # replace '"' with '“', otherwise the import into deepdive fails
    text = text.replace('"',"“").replace("\r","").replace("\n","")
    # replace ASCII colons next to Chinese characters with the full-width colon
    text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
    # replace with the full-width comma
    text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
    # replace with the full-width semicolon
    text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
    # remove all whitespace inside the marked tags
    for subs in subspaceList:
        pattern = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
        while(True):
            oneMatch = re.search(re.compile(pattern),text)
            if oneMatch is not None:
                text = text.replace("#subs"+str(subs)+"#"+oneMatch.group(1)+"#sube"+str(subs)+"#",re.sub("\s","",oneMatch.group(1)))
            else:
                break
    # normalize punctuation
    while(True):
        # collapse consecutive punctuation marks
        punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
        if punc is not None:
            text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
        punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
        if punc is not None:
            text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
        else:
            # remove the whitespace after punctuation marks
            punc = re.search("(?P<punc>:|。|,|;)\s+",text)
            if punc is not None:
                text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
            else:
                break
    # collapse consecutive full-width periods into one
    text_split = text.split("。")
    text_split = [x for x in text_split if len(x)>0]
    text = "。".join(text_split)
    return text
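
# A commented-out sanity check for segment; the sample markup is assumed for illustration.
# segment inserts "。" after each <tr> and "," after tags such as <p>, <div> and <td>,
# then normalizes whitespace and repeated punctuation in the extracted text.
#print(segment(BeautifulSoup("<div><p>项目名称:测试项目</p><p>中标金额:100万元</p></div>","lxml")))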

doc_id = '33918780-7424-11e8-a3f8-44a84246dbba'
conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()
cursor.execute(" select content from articles where id='"+doc_id+"' ")
row = cursor.fetchall()[0]
content = segment(table2text(BeautifulSoup(row[0],"lxml")))
print(content)
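
# A safer variant of the query above would bind doc_id as a parameter instead of
# concatenating it into the SQL string (sketch using the standard psycopg2 placeholder):
#cursor.execute("select content from articles where id=%s",(doc_id,))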

'''
conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()
cursor.execute(" select * from articles ")
rows = cursor.fetchall()
for row in rows:
    content = segment(table2text(BeautifulSoup(row[1],"lxml")))
    with codecs.open("export_article/"+str(row[0])+".txt","w",encoding="utf8") as f:
        f.write(content.replace("。","。\n"))
        f.flush()
        f.close()
'''
'''
with open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8") as f:
    html = f.read()
soup = BeautifulSoup(html,"lxml")
with open("C:\\Users\\User\\Desktop\\b.html","w",encoding="utf8") as f:
    #f.write(segment(table2text(soup)))
    f.write(segment(tableToText(soup)))
    f.flush()
id="2b7f8b45-7b4c-11e8-abe2-109836a68148"
with codecs.open("export_article/"+id+"_a.html","r",encoding="utf8") as f:
    html = f.read()
soup = BeautifulSoup(html,"lxml")
with codecs.open("export_article/"+id+"_b.html","w",encoding="utf8") as f:
    #f.write(segment(table2text(soup)))
    f.write(segment(tableToText(soup)))
    f.flush()
'''