articles_processed.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. from commonutil import *
  5. from bs4 import BeautifulSoup, Comment
  6. import copy
  7. import re
  8. import os
  9. os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
  10. def table2text(soup):
  11. '''
  12. 把公告中的表格转化为纯文本
  13. Args:
  14. soup: beautifulsoup实例
  15. Returns:
  16. 处理过后的beautifulsoup实例
  17. '''
  18. tbodies = soup.find_all('tbody')
  19. if len(tbodies) == 0:
  20. tbodies = soup.find_all('table')
  21. # 遍历表格中的每个tbody
  22. for tbody in tbodies:
  23. # 处理colspan, rowspan信息补全问题
  24. trs = tbody.findChildren('tr', recursive=False)
  25. ths_len = 0
  26. ths = list()
  27. trs_set = set()
  28. # 遍历每一个tr
  29. for indtr, tr in enumerate(trs):
  30. ths_tmp = tr.findChildren('th', recursive=False)
  31. if len(ths_tmp) > 0:
  32. ths_len = ths_len + len(ths_tmp)
  33. for th in ths_tmp:
  34. ths.append(th)
  35. trs_set.add(tr)
  36. # 遍历每行中的element
  37. tds = tr.findChildren(recursive=False)
  38. if len(tds) > 1:
  39. for indtd, td in enumerate(tds):
  40. # 若有rowspan 则补全下一行同样位置
  41. if 'rowspan' in td.attrs:
  42. if str(td['rowspan'])!="":
  43. #print(td)
  44. row = int(re.sub("[^0-9]","",str(td['rowspan'])))
  45. td['rowspan'] = 1
  46. for i in range(1, row, 1):
  47. # 获取下一行的所有td, 在对应的位置插入
  48. if indtr+i<len(trs):
  49. tds1 = trs[indtr + i].findChildren('td', recursive=False)
  50. if len(tds1)==0:
  51. tds1 = trs[indtr + i].findChildren('th', recursive=False)
  52. if len(tds1) >= (indtd) and len(tds1)>0:
  53. if indtd > 0:
  54. tds1[indtd - 1].insert_after(copy.copy(td))
  55. else:
  56. tds1[0].insert_before(copy.copy(td))
  57. # 若有colspan 则补全同一行下一个位置
  58. if 'colspan' in td.attrs:
  59. if str(td['colspan'])!="":
  60. #print(re.sub("[^0-9]","",td['colspan']))
  61. col = int(re.sub("[^0-9]","",str(td['colspan'])))
  62. td['colspan'] = 1
  63. for i in range(1, col, 1):
  64. td.insert_after(copy.copy(td))
  65. # 表格转化成文字
  66. if ths_len > 1: # 有表头的表格
  67. if len(trs_set) == 1: # 横状表格
  68. ps = ''
  69. trs_set = tbody.findChildren('tr', recursive=False)
  70. for i in range(1, len(trs_set), 1):
  71. tr = trs_set[i]
  72. tds = tr.findChildren('td', recursive=False)
  73. p = ''
  74. for ind, th in enumerate(ths):
  75. if ind < len(tds):
  76. p = p + th.get_text() + ":" + tds[ind].get_text() + ";"
  77. p = p + ";"
  78. ps = ps + p
  79. tbody.string = ps
  80. tbody.name = 'div'
  81. else: # 竖状表格
  82. ps = ''
  83. tds = list(trs_set)[0].findChildren('td', recursive=False)
  84. for ind, td in enumerate(tds):
  85. p = ''
  86. for i in range(0, len(trs_set), 1):
  87. tds_temp = list(trs_set)[i].findChildren('td', recursive=False)
  88. if ind < len(tds_temp):
  89. if ind < len(tds_temp):
  90. p = p + ths[i].get_text() + tds_temp[ind].get_text() + ";"
  91. ps = ps + p
  92. tbody.string = ps
  93. tbody.name = 'p'
  94. else: # 有表头但是非th标签的横状表格
  95. trs = tbody.findChildren('tr', recursive=False)
  96. if len(trs) > 0:
  97. tds0 = trs[0].findChildren('td', recursive=False)
  98. if len(tds0) > 2:
  99. tds_str = [td.get_text() for td in tds0]
  100. pat = re.compile('(序号|项目|产品|货物|单位|数量|价格|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|科室){1}')
  101. match_counts = re.subn(pat, '', ";".join(tds_str))[1]
  102. if match_counts > 2:
  103. ths = []
  104. for td in trs[0].findChildren('td', recursive=False):
  105. td.name = 'th'
  106. ths.append(td)
  107. ps = ''
  108. trs = tbody.findChildren('tr', recursive=False)
  109. for i in range(1, len(trs), 1):
  110. tr = trs[i]
  111. tds = tr.findChildren('td', recursive=False)
  112. p = ''
  113. for ind, th in enumerate(ths):
  114. if (len(tds)-1) >= ind:
  115. p = p + th.get_text() + ":" + tds[ind].get_text() + ";"
  116. p = p + ";"
  117. ps = ps + p
  118. tbody.string = ps
  119. tbody.name = 'p'
  120. return soup
  121. def tableToText(soup):
  122. def fixSpan(tbody):
  123. # 处理colspan, rowspan信息补全问题
  124. trs = tbody.findChildren('tr', recursive=False)
  125. ths_len = 0
  126. ths = list()
  127. trs_set = set()
  128. #修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱
  129. # 遍历每一个tr
  130. for indtr, tr in enumerate(trs):
  131. ths_tmp = tr.findChildren('th', recursive=False)
  132. #不补全含有表格的tr
  133. if len(tr.findChildren('table'))>0:
  134. continue
  135. if len(ths_tmp) > 0:
  136. ths_len = ths_len + len(ths_tmp)
  137. for th in ths_tmp:
  138. ths.append(th)
  139. trs_set.add(tr)
  140. # 遍历每行中的element
  141. tds = tr.findChildren(recursive=False)
  142. for indtd, td in enumerate(tds):
  143. # 若有colspan 则补全同一行下一个位置
  144. if 'colspan' in td.attrs:
  145. if str(td['colspan'])!="":
  146. col = int(re.sub("[^0-9]","",str(td['colspan'])))
  147. td['colspan'] = 1
  148. for i in range(1, col, 1):
  149. td.insert_after(copy.copy(td))
  150. for indtr, tr in enumerate(trs):
  151. ths_tmp = tr.findChildren('th', recursive=False)
  152. #不补全含有表格的tr
  153. if len(tr.findChildren('table'))>0:
  154. continue
  155. if len(ths_tmp) > 0:
  156. ths_len = ths_len + len(ths_tmp)
  157. for th in ths_tmp:
  158. ths.append(th)
  159. trs_set.add(tr)
  160. # 遍历每行中的element
  161. tds = tr.findChildren(recursive=False)
  162. for indtd, td in enumerate(tds):
  163. # 若有rowspan 则补全下一行同样位置
  164. if 'rowspan' in td.attrs:
  165. if str(td['rowspan'])!="":
  166. row = int(re.sub("[^0-9]","",str(td['rowspan'])))
  167. td['rowspan'] = 1
  168. for i in range(1, row, 1):
  169. # 获取下一行的所有td, 在对应的位置插入
  170. if indtr+i<len(trs):
  171. tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
  172. if len(tds1) >= (indtd) and len(tds1)>0:
  173. if indtd > 0:
  174. tds1[indtd - 1].insert_after(copy.copy(td))
  175. else:
  176. tds1[0].insert_before(copy.copy(td))
  177. def getTable(tbody):
  178. trs = tbody.findChildren('tr', recursive=False)
  179. inner_table = []
  180. for tr in trs:
  181. tr_line = []
  182. tds = tr.findChildren(['td','th'], recursive=False)
  183. for td in tds:
  184. tr_line.append([re.sub('\s*','',td.get_text()),0])
  185. inner_table.append(tr_line)
  186. return inner_table
  187. #处理表格不对齐的问题
  188. def fixTable(inner_table):
  189. maxWidth = 0
  190. for item in inner_table:
  191. if len(item)>maxWidth:
  192. maxWidth = len(item)
  193. for i in range(len(inner_table)):
  194. if len(inner_table[i])<maxWidth:
  195. for j in range(maxWidth-len(inner_table[i])):
  196. inner_table[i].append(["",0])
  197. return inner_table
  198. #设置表头
  199. def setHead(inner_table,pattern,pat_value,count):
  200. height = len(inner_table)
  201. width = len(inner_table[0])
  202. head_list = []
  203. head_list.append(0)
  204. #行表头
  205. for i in range(height):
  206. set_match = set()
  207. is_head = False
  208. for j in range(width):
  209. if re.search(pat_value,inner_table[i][j][0]) is not None:
  210. is_head = False
  211. break
  212. str_find = re.findall(pattern,inner_table[i][j][0])
  213. if len(str_find)>0:
  214. set_match.add(inner_table[i][j][0])
  215. if len(set_match)>=count:
  216. is_head = True
  217. if is_head:
  218. head_list.append(i)
  219. for j in range(width):
  220. inner_table[i][j][1] = 1
  221. head_list.append(height)
  222. #列表头
  223. for i in range(len(head_list)-1):
  224. head_begin = head_list[i]
  225. head_end = head_list[i+1]
  226. #最后一列不设置为列表头
  227. for i in range(width-1):
  228. set_match = set()
  229. is_head = False
  230. for j in range(head_begin,head_end):
  231. if re.search(pat_value,inner_table[j][i][0]) is not None:
  232. is_head = False
  233. break
  234. str_find = re.findall(pattern,inner_table[j][i][0])
  235. if len(str_find)>0:
  236. set_match.add(inner_table[j][i][0])
  237. if len(set_match)>=count:
  238. is_head = True
  239. if is_head:
  240. for j in range(head_begin,head_end):
  241. inner_table[j][i][1] = 2
  242. return inner_table,head_list
  243. def getDirect(inner_table,begin,end):
  244. column_head = set()
  245. row_head = set()
  246. widths = len(inner_table[0])
  247. for height in range(begin,end):
  248. for width in range(widths):
  249. if inner_table[height][width][1] ==1:
  250. row_head.add(height)
  251. if inner_table[height][width][1] ==2:
  252. column_head.add(width)
  253. company_pattern = re.compile("公司")
  254. if 0 in column_head and begin not in row_head:
  255. return "column"
  256. if 0 in column_head and begin in row_head:
  257. for height in range(begin,end):
  258. count = 0
  259. count_flag = True
  260. for width in range(width):
  261. if inner_table[height][width][1]==0:
  262. if re.search(company_pattern,inner_table[height][width][0]) is not None:
  263. count += 1
  264. else:
  265. count_flag = False
  266. if count_flag and count>=2:
  267. return "column"
  268. return "row"
  269. def getTableText(inner_table,head_list):
  270. rankPattern = "(排名|排序|名次|评标结果)"
  271. entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
  272. height = len(inner_table)
  273. width = len(inner_table[0])
  274. text = ""
  275. for head_i in range(len(head_list)-1):
  276. text_set = set()
  277. head_begin = head_list[head_i]
  278. head_end = head_list[head_i+1]
  279. direct = getDirect(inner_table, head_begin, head_end)
  280. if direct=="row":
  281. for i in range(head_begin,head_end):
  282. rank_text = ""
  283. entity_text = ""
  284. text_line = ""
  285. for j in range(width):
  286. cell = inner_table[i][j]
  287. #是属性值
  288. if cell[1]==0:
  289. find_flag = False
  290. head = ""
  291. temp_head = ""
  292. for loop_j in range(1,j+1):
  293. if inner_table[i][j-loop_j][1]==2:
  294. if find_flag:
  295. if inner_table[i][j-loop_j][0]!=temp_head:
  296. head = inner_table[i][j-loop_j][0]+":"+head
  297. else:
  298. head = inner_table[i][j-loop_j][0]+":"+head
  299. find_flag = True
  300. temp_head = inner_table[i][j-loop_j][0]
  301. else:
  302. if find_flag:
  303. break
  304. find_flag = False
  305. temp_head = ""
  306. for loop_i in range(1,i+1):
  307. if inner_table[i-loop_i][j][1]==1:
  308. if find_flag:
  309. if inner_table[i-loop_i][j][0]!=temp_head:
  310. head = inner_table[i-loop_i][j][0]+":"+head
  311. else:
  312. head = inner_table[i-loop_i][j][0]+":"+head
  313. find_flag = True
  314. temp_head = inner_table[i-loop_i][j][0]
  315. else:
  316. if find_flag:
  317. break
  318. if str(head+inner_table[i][j][0]) in text_set:
  319. continue
  320. if re.search(rankPattern,head) is not None:
  321. rank_text += head+inner_table[i][j][0]+","
  322. #print(rank_text)
  323. elif re.search(entityPattern,head) is not None:
  324. entity_text += head+inner_table[i][j][0]+","
  325. #print(entity_text)
  326. else:
  327. text_line += head+inner_table[i][j][0]+","
  328. text_set.add(str(head+inner_table[i][j][0]))
  329. text += rank_text+entity_text+text_line
  330. text = text[:-1]+"。"
  331. else:
  332. for j in range(width):
  333. rank_text = ""
  334. entity_text = ""
  335. text_line = ""
  336. for i in range(head_begin,head_end):
  337. cell = inner_table[i][j]
  338. #是属性值
  339. if cell[1]==0:
  340. find_flag = False
  341. head = ""
  342. temp_head = ""
  343. for loop_j in range(1,j+1):
  344. if inner_table[i][j-loop_j][1]==2:
  345. if find_flag:
  346. if inner_table[i][j-loop_j][0]!=temp_head:
  347. head = inner_table[i][j-loop_j][0]+":"+head
  348. else:
  349. head = inner_table[i][j-loop_j][0]+":"+head
  350. find_flag = True
  351. temp_head = inner_table[i][j-loop_j][0]
  352. else:
  353. if find_flag:
  354. break
  355. find_flag = False
  356. temp_head = ""
  357. for loop_i in range(1,i+1):
  358. if inner_table[i-loop_i][j][1]==1:
  359. if find_flag:
  360. if inner_table[i-loop_i][j][0]!=temp_head:
  361. head = inner_table[i-loop_i][j][0]+":"+head
  362. else:
  363. head = inner_table[i-loop_i][j][0]+":"+head
  364. find_flag = True
  365. temp_head = inner_table[i-loop_i][j][0]
  366. else:
  367. if find_flag:
  368. break
  369. if str(head+inner_table[i][j][0]) in text_set:
  370. continue
  371. if re.search(rankPattern,head) is not None:
  372. rank_text += head+inner_table[i][j][0]+","
  373. #print(rank_text)
  374. elif re.search(entityPattern,head) is not None:
  375. entity_text += head+inner_table[i][j][0]+","
  376. #print(entity_text)
  377. else:
  378. text_line += head+inner_table[i][j][0]+","
  379. text_set.add(str(head+inner_table[i][j][0]))
  380. text += rank_text+entity_text+text_line
  381. text = text[:-1]+"。"
  382. return text
  383. pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
  384. pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
  385. tbodies = soup.find_all('tbody')
  386. if len(tbodies) == 0:
  387. tbodies = soup.find_all('table')
  388. # 遍历表格中的每个tbody
  389. #逆序处理嵌套表格
  390. for tbody_index in range(1,len(tbodies)+1):
  391. tbody = tbodies[len(tbodies)-tbody_index]
  392. fixSpan(tbody)
  393. inner_table = getTable(tbody)
  394. inner_table = fixTable(inner_table)
  395. if len(inner_table)>0:
  396. inner_table,head_list = setHead(inner_table,pat_head,pat_value,3)
  397. tbody.string = getTableText(inner_table,head_list)
  398. #print(tbody.string)
  399. tbody.name = "table"
  400. return soup
  401. def segment(soup):
  402. #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
  403. segList = ["tr"]
  404. #commaList = ["p","div","br","td","span"]
  405. commaList = []
  406. spaceList = ["span"]
  407. subspaceList = ["td",'a',"span"]
  408. tbodies = soup.find_all('tbody')
  409. if len(tbodies) == 0:
  410. tbodies = soup.find_all('table')
  411. # 递归遍历所有节点,插入符号
  412. for child in soup.body.descendants:
  413. if child.name in segList:
  414. child.insert_after("。")
  415. if child.name in commaList:
  416. child.insert_after(",")
  417. if child.name in subspaceList:
  418. child.insert_before("#subs"+str(child.name)+"#")
  419. child.insert_after("#sube"+str(child.name)+"#")
  420. if child.name in spaceList:
  421. child.insert_after(" ")
  422. text = str(soup.get_text())
  423. #替换"""为"“",否则导入deepdive出错
  424. text = text.replace('"',"“").replace("\r","").replace("\n","")
  425. #替换英文冒号为中文冒号
  426. text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
  427. #替换为中文逗号
  428. text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
  429. #替换为中文分号
  430. text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
  431. #删除标签中的所有空格
  432. for subs in subspaceList:
  433. patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
  434. while(True):
  435. oneMatch = re.search(re.compile(patten),text)
  436. if oneMatch is not None:
  437. text = text.replace("#subs"+str(subs)+"#"+oneMatch.group(1)+"#sube"+str(subs)+"#",re.sub("\s","",oneMatch.group(1)))
  438. else:
  439. break
  440. #替换标点
  441. while(True):
  442. #替换连续的标点
  443. punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
  444. if punc is not None:
  445. text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
  446. punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
  447. if punc is not None:
  448. text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
  449. else:
  450. #替换标点之后的空格
  451. punc = re.search("(?P<punc>:|。|,|;)\s+",text)
  452. if punc is not None:
  453. text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
  454. else:
  455. break
  456. #将连续的中文句号替换为一个
  457. text_split = text.split("。")
  458. text_split = [x for x in text_split if len(x)>0]
  459. text = "。".join(text_split)
  460. #删除所有空格
  461. text = re.sub("\s*","",text)
  462. return text
  463. @tsv_extractor
  464. @returns(lambda
  465. doc_id = "text",
  466. content ="text",
  467. :[])
  468. def extract(
  469. doc_id = "text",
  470. content ="text",
  471. ):
  472. log("doc_id="+str(doc_id))
  473. #content_new = segment(table2text(BeautifulSoup(content,"lxml")))
  474. content_new = segment(tableToText(BeautifulSoup(content,"lxml")))
  475. if len(content_new)<=20000:
  476. yield[
  477. doc_id,
  478. content_new,]