2.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
'''
Created on 2018-12-29
@author: User
'''
  5. from bs4 import BeautifulSoup, Comment
  6. import copy
  7. import re
  8. import sys
  9. import os
  10. import time
  11. sys.path.append(os.path.abspath("../.."))
  12. import fool
  13. from BiddingKG.dl.common.Connection import getConnection
  14. def tableToText(soup):
  15. def fixSpan(tbody):
  16. # 处理colspan, rowspan信息补全问题
  17. trs = tbody.findChildren('tr', recursive=False)
  18. ths_len = 0
  19. ths = list()
  20. trs_set = set()
  21. #修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱
  22. # 遍历每一个tr
  23. for indtr, tr in enumerate(trs):
  24. ths_tmp = tr.findChildren('th', recursive=False)
  25. #不补全含有表格的tr
  26. if len(tr.findChildren('table'))>0:
  27. continue
  28. if len(ths_tmp) > 0:
  29. ths_len = ths_len + len(ths_tmp)
  30. for th in ths_tmp:
  31. ths.append(th)
  32. trs_set.add(tr)
  33. # 遍历每行中的element
  34. tds = tr.findChildren(recursive=False)
  35. for indtd, td in enumerate(tds):
  36. # 若有colspan 则补全同一行下一个位置
  37. if 'colspan' in td.attrs:
  38. if str(td['colspan'])!="":
  39. col = int(re.sub("[^0-9]","",str(td['colspan'])))
  40. td['colspan'] = 1
  41. for i in range(1, col, 1):
  42. td.insert_after(copy.copy(td))
  43. for indtr, tr in enumerate(trs):
  44. ths_tmp = tr.findChildren('th', recursive=False)
  45. #不补全含有表格的tr
  46. if len(tr.findChildren('table'))>0:
  47. continue
  48. if len(ths_tmp) > 0:
  49. ths_len = ths_len + len(ths_tmp)
  50. for th in ths_tmp:
  51. ths.append(th)
  52. trs_set.add(tr)
  53. # 遍历每行中的element
  54. tds = tr.findChildren(recursive=False)
  55. for indtd, td in enumerate(tds):
  56. # 若有rowspan 则补全下一行同样位置
  57. if 'rowspan' in td.attrs:
  58. if str(td['rowspan'])!="":
  59. row = int(re.sub("[^0-9]","",str(td['rowspan'])))
  60. td['rowspan'] = 1
  61. for i in range(1, row, 1):
  62. # 获取下一行的所有td, 在对应的位置插入
  63. if indtr+i<len(trs):
  64. tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
  65. if len(tds1) >= (indtd) and len(tds1)>0:
  66. if indtd > 0:
  67. tds1[indtd - 1].insert_after(copy.copy(td))
  68. else:
  69. tds1[0].insert_before(copy.copy(td))
  70. def getTable(tbody):
  71. trs = tbody.findChildren('tr', recursive=False)
  72. inner_table = []
  73. for tr in trs:
  74. tr_line = []
  75. tds = tr.findChildren(['td','th'], recursive=False)
  76. for td in tds:
  77. tr_line.append([re.sub('\s*','',td.get_text()),0])
  78. inner_table.append(tr_line)
  79. return inner_table
  80. #处理表格不对齐的问题
  81. def fixTable(inner_table):
  82. maxWidth = 0
  83. for item in inner_table:
  84. if len(item)>maxWidth:
  85. maxWidth = len(item)
  86. for i in range(len(inner_table)):
  87. if len(inner_table[i])<maxWidth:
  88. for j in range(maxWidth-len(inner_table[i])):
  89. inner_table[i].append(["",0])
  90. return inner_table
  91. #设置表头
  92. def setHead(inner_table,pattern,pat_value,count):
  93. height = len(inner_table)
  94. width = len(inner_table[0])
  95. head_list = []
  96. head_list.append(0)
  97. #行表头
  98. for i in range(height):
  99. set_match = set()
  100. is_head = False
  101. for j in range(width):
  102. if re.search(pat_value,inner_table[i][j][0]) is not None:
  103. is_head = False
  104. break
  105. str_find = re.findall(pattern,inner_table[i][j][0])
  106. if len(str_find)>0:
  107. set_match.add(inner_table[i][j][0])
  108. if len(set_match)>=count:
  109. is_head = True
  110. if is_head:
  111. head_list.append(i)
  112. for j in range(width):
  113. inner_table[i][j][1] = 1
  114. head_list.append(height)
  115. #列表头
  116. for i in range(len(head_list)-1):
  117. head_begin = head_list[i]
  118. head_end = head_list[i+1]
  119. #最后一列不设置为列表头
  120. for i in range(width-1):
  121. set_match = set()
  122. is_head = False
  123. for j in range(head_begin,head_end):
  124. if re.search(pat_value,inner_table[j][i][0]) is not None:
  125. is_head = False
  126. break
  127. str_find = re.findall(pattern,inner_table[j][i][0])
  128. if len(str_find)>0:
  129. set_match.add(inner_table[j][i][0])
  130. if len(set_match)>=count:
  131. is_head = True
  132. if is_head:
  133. for j in range(head_begin,head_end):
  134. inner_table[j][i][1] = 2
  135. return inner_table,head_list
  136. def getDirect(inner_table,begin,end):
  137. column_head = set()
  138. row_head = set()
  139. widths = len(inner_table[0])
  140. for height in range(begin,end):
  141. for width in range(widths):
  142. if inner_table[height][width][1] ==1:
  143. row_head.add(height)
  144. if inner_table[height][width][1] ==2:
  145. column_head.add(width)
  146. company_pattern = re.compile("公司")
  147. if 0 in column_head and begin not in row_head:
  148. return "column"
  149. if 0 in column_head and begin in row_head:
  150. for height in range(begin,end):
  151. count = 0
  152. count_flag = True
  153. for width in range(width):
  154. if inner_table[height][width][1]==0:
  155. if re.search(company_pattern,inner_table[height][width][0]) is not None:
  156. count += 1
  157. else:
  158. count_flag = False
  159. if count_flag and count>=2:
  160. return "column"
  161. return "row"
  162. def getTableText(inner_table,head_list):
  163. rankPattern = "(排名|排序|名次|评标结果)"
  164. entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
  165. height = len(inner_table)
  166. width = len(inner_table[0])
  167. text = ""
  168. for head_i in range(len(head_list)-1):
  169. text_set = set()
  170. head_begin = head_list[head_i]
  171. head_end = head_list[head_i+1]
  172. direct = getDirect(inner_table, head_begin, head_end)
  173. if direct=="row":
  174. for i in range(head_begin,head_end):
  175. rank_text = ""
  176. entity_text = ""
  177. text_line = ""
  178. for j in range(width):
  179. cell = inner_table[i][j]
  180. #是属性值
  181. if cell[1]==0:
  182. find_flag = False
  183. head = ""
  184. temp_head = ""
  185. for loop_j in range(1,j+1):
  186. if inner_table[i][j-loop_j][1]==2:
  187. if find_flag:
  188. if inner_table[i][j-loop_j][0]!=temp_head:
  189. head = inner_table[i][j-loop_j][0]+":"+head
  190. else:
  191. head = inner_table[i][j-loop_j][0]+":"+head
  192. find_flag = True
  193. temp_head = inner_table[i][j-loop_j][0]
  194. else:
  195. if find_flag:
  196. break
  197. find_flag = False
  198. temp_head = ""
  199. for loop_i in range(1,i+1):
  200. if inner_table[i-loop_i][j][1]==1:
  201. if find_flag:
  202. if inner_table[i-loop_i][j][0]!=temp_head:
  203. head = inner_table[i-loop_i][j][0]+":"+head
  204. else:
  205. head = inner_table[i-loop_i][j][0]+":"+head
  206. find_flag = True
  207. temp_head = inner_table[i-loop_i][j][0]
  208. else:
  209. if find_flag:
  210. break
  211. if str(head+inner_table[i][j][0]) in text_set:
  212. continue
  213. if re.search(rankPattern,head) is not None:
  214. rank_text += head+inner_table[i][j][0]+","
  215. #print(rank_text)
  216. elif re.search(entityPattern,head) is not None:
  217. entity_text += head+inner_table[i][j][0]+","
  218. #print(entity_text)
  219. else:
  220. text_line += head+inner_table[i][j][0]+","
  221. text_set.add(str(head+inner_table[i][j][0]))
  222. text += rank_text+entity_text+text_line
  223. text = text[:-1]+"。"
  224. else:
  225. for j in range(width):
  226. rank_text = ""
  227. entity_text = ""
  228. text_line = ""
  229. for i in range(head_begin,head_end):
  230. cell = inner_table[i][j]
  231. #是属性值
  232. if cell[1]==0:
  233. find_flag = False
  234. head = ""
  235. temp_head = ""
  236. for loop_j in range(1,j+1):
  237. if inner_table[i][j-loop_j][1]==2:
  238. if find_flag:
  239. if inner_table[i][j-loop_j][0]!=temp_head:
  240. head = inner_table[i][j-loop_j][0]+":"+head
  241. else:
  242. head = inner_table[i][j-loop_j][0]+":"+head
  243. find_flag = True
  244. temp_head = inner_table[i][j-loop_j][0]
  245. else:
  246. if find_flag:
  247. break
  248. find_flag = False
  249. temp_head = ""
  250. for loop_i in range(1,i+1):
  251. if inner_table[i-loop_i][j][1]==1:
  252. if find_flag:
  253. if inner_table[i-loop_i][j][0]!=temp_head:
  254. head = inner_table[i-loop_i][j][0]+":"+head
  255. else:
  256. head = inner_table[i-loop_i][j][0]+":"+head
  257. find_flag = True
  258. temp_head = inner_table[i-loop_i][j][0]
  259. else:
  260. if find_flag:
  261. break
  262. if str(head+inner_table[i][j][0]) in text_set:
  263. continue
  264. if re.search(rankPattern,head) is not None:
  265. rank_text += head+inner_table[i][j][0]+","
  266. #print(rank_text)
  267. elif re.search(entityPattern,head) is not None:
  268. entity_text += head+inner_table[i][j][0]+","
  269. #print(entity_text)
  270. else:
  271. text_line += head+inner_table[i][j][0]+","
  272. text_set.add(str(head+inner_table[i][j][0]))
  273. text += rank_text+entity_text+text_line
  274. text = text[:-1]+"。"
  275. return text
  276. pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
  277. pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
  278. tbodies = soup.find_all('tbody')
  279. if len(tbodies) == 0:
  280. tbodies = soup.find_all('table')
  281. # 遍历表格中的每个tbody
  282. #逆序处理嵌套表格
  283. for tbody_index in range(1,len(tbodies)+1):
  284. tbody = tbodies[len(tbodies)-tbody_index]
  285. fixSpan(tbody)
  286. inner_table = getTable(tbody)
  287. inner_table = fixTable(inner_table)
  288. if len(inner_table)>0:
  289. inner_table,head_list = setHead(inner_table,pat_head,pat_value,3)
  290. tbody.string = getTableText(inner_table,head_list)
  291. #print(tbody.string)
  292. tbody.name = "table"
  293. return soup
  294. def segment(soup):
  295. #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
  296. segList = ["tr"]
  297. #commaList = ["p","div","br","td","span"]
  298. commaList = []
  299. spaceList = ["span"]
  300. subspaceList = ["td",'a',"span"]
  301. tbodies = soup.find_all('tbody')
  302. if len(tbodies) == 0:
  303. tbodies = soup.find_all('table')
  304. # 递归遍历所有节点,插入符号
  305. for child in soup.body.descendants:
  306. if child.name in segList:
  307. child.insert_after("。")
  308. if child.name in commaList:
  309. child.insert_after(",")
  310. if child.name in subspaceList:
  311. child.insert_before("#subs"+str(child.name)+"#")
  312. child.insert_after("#sube"+str(child.name)+"#")
  313. if child.name in spaceList:
  314. child.insert_after(" ")
  315. text = str(soup.get_text())
  316. #替换"""为"“",否则导入deepdive出错
  317. text = text.replace('"',"“").replace("\r","").replace("\n","")
  318. #替换英文冒号为中文冒号
  319. text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
  320. #替换为中文逗号
  321. text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
  322. #替换为中文分号
  323. text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
  324. #删除标签中的所有空格
  325. for subs in subspaceList:
  326. patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
  327. while(True):
  328. oneMatch = re.search(re.compile(patten),text)
  329. if oneMatch is not None:
  330. text = text.replace("#subs"+str(subs)+"#"+oneMatch.group(1)+"#sube"+str(subs)+"#",re.sub("\s","",oneMatch.group(1)))
  331. else:
  332. break
  333. #替换标点
  334. while(True):
  335. #替换连续的标点
  336. punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
  337. if punc is not None:
  338. text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
  339. punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
  340. if punc is not None:
  341. text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
  342. else:
  343. #替换标点之后的空格
  344. punc = re.search("(?P<punc>:|。|,|;)\s+",text)
  345. if punc is not None:
  346. text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
  347. else:
  348. break
  349. #将连续的中文句号替换为一个
  350. text_split = text.split("。")
  351. text_split = [x for x in text_split if len(x)>0]
  352. text = "。".join(text_split)
  353. #删除所有空格
  354. text = re.sub("\s*","",text)
  355. return text
  356. if __name__=="__main__":
  357. conn = getConnection()
  358. cursor = conn.cursor()
  359. sql = sql = " select content,id from articles where id in(select doc_id from articles_validation where exists(select 1 from articles_processed where id=doc_id)) order by id limit 70"
  360. cursor.execute(sql)
  361. rows = cursor.fetchall()
  362. sentences = []
  363. a = time.time()
  364. for row in rows:
  365. content = row[0]
  366. sentences = sentences+re.split("。",segment(tableToText(BeautifulSoup(content,"lxml"))))
  367. tokens_all = fool.cut(sentences)
  368. #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
  369. ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
  370. print("takes:",time.time()-a)