# test.py (16 KB)


import codecs
import copy
import glob
import pickle
import re

from bs4 import BeautifulSoup
  7. def save(object_to_save, path):
  8. '''
  9. 保存对象
  10. @Arugs:
  11. object_to_save: 需要保存的对象
  12. @Return:
  13. 保存的路径
  14. '''
  15. with open(path, 'wb') as f:
  16. pickle.dump(object_to_save, f)
  17. def load(path):
  18. '''
  19. 读取对象
  20. @Arugs:
  21. path: 读取的路径
  22. @Return:
  23. 读取的对象
  24. '''
  25. with open(path, 'rb') as f:
  26. object1 = pickle.load(f)
  27. return object1
  28. def tableToText(soup):
  29. '''
  30. @param:
  31. soup:网页html的soup
  32. @return:处理完表格信息的网页text
  33. '''
  34. def fixSpan(tbody):
  35. # 处理colspan, rowspan信息补全问题
  36. trs = tbody.findChildren('tr', recursive=False)
  37. ths_len = 0
  38. ths = list()
  39. trs_set = set()
  40. #修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱
  41. # 遍历每一个tr
  42. for indtr, tr in enumerate(trs):
  43. ths_tmp = tr.findChildren('th', recursive=False)
  44. #不补全含有表格的tr
  45. if len(tr.findChildren('table'))>0:
  46. continue
  47. if len(ths_tmp) > 0:
  48. ths_len = ths_len + len(ths_tmp)
  49. for th in ths_tmp:
  50. ths.append(th)
  51. trs_set.add(tr)
  52. # 遍历每行中的element
  53. tds = tr.findChildren(recursive=False)
  54. for indtd, td in enumerate(tds):
  55. # 若有colspan 则补全同一行下一个位置
  56. if 'colspan' in td.attrs:
  57. if str(re.sub("[^0-9]","",str(td['colspan'])))!="":
  58. col = int(re.sub("[^0-9]","",str(td['colspan'])))
  59. td['colspan'] = 1
  60. for i in range(1, col, 1):
  61. td.insert_after(copy.copy(td))
  62. for indtr, tr in enumerate(trs):
  63. ths_tmp = tr.findChildren('th', recursive=False)
  64. #不补全含有表格的tr
  65. if len(tr.findChildren('table'))>0:
  66. continue
  67. if len(ths_tmp) > 0:
  68. ths_len = ths_len + len(ths_tmp)
  69. for th in ths_tmp:
  70. ths.append(th)
  71. trs_set.add(tr)
  72. # 遍历每行中的element
  73. tds = tr.findChildren(recursive=False)
  74. for indtd, td in enumerate(tds):
  75. # 若有rowspan 则补全下一行同样位置
  76. if 'rowspan' in td.attrs:
  77. if str(re.sub("[^0-9]","",str(td['rowspan'])))!="":
  78. row = int(re.sub("[^0-9]","",str(td['rowspan'])))
  79. td['rowspan'] = 1
  80. for i in range(1, row, 1):
  81. # 获取下一行的所有td, 在对应的位置插入
  82. if indtr+i<len(trs):
  83. tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
  84. if len(tds1) >= (indtd) and len(tds1)>0:
  85. if indtd > 0:
  86. tds1[indtd - 1].insert_after(copy.copy(td))
  87. else:
  88. tds1[0].insert_before(copy.copy(td))
  89. def getTable(tbody):
  90. trs = tbody.findChildren('tr', recursive=False)
  91. inner_table = []
  92. for tr in trs:
  93. tr_line = []
  94. tds = tr.findChildren(['td','th'], recursive=False)
  95. for td in tds:
  96. tr_line.append([re.sub('\s*','',td.get_text()),0])
  97. inner_table.append(tr_line)
  98. return inner_table
  99. #处理表格不对齐的问题
  100. def fixTable(inner_table):
  101. maxWidth = 0
  102. for item in inner_table:
  103. if len(item)>maxWidth:
  104. maxWidth = len(item)
  105. for i in range(len(inner_table)):
  106. if len(inner_table[i])<maxWidth:
  107. for j in range(maxWidth-len(inner_table[i])):
  108. inner_table[i].append(["",0])
  109. return inner_table
  110. #设置表头
  111. def setHead(inner_table,pattern,pat_value,count):
  112. height = len(inner_table)
  113. width = len(inner_table[0])
  114. head_list = []
  115. head_list.append(0)
  116. #行表头
  117. is_head_last = False
  118. for i in range(height):
  119. set_match = set()
  120. is_head = False
  121. is_long_value = False
  122. is_same_value = True
  123. same_value = inner_table[i][0][0]
  124. for j in range(width):
  125. if inner_table[i][j][0]!=same_value:
  126. is_same_value = False
  127. break
  128. for j in range(width):
  129. if re.search(pat_value,inner_table[i][j][0]) is not None:
  130. is_head = False
  131. break
  132. str_find = re.findall(pattern,inner_table[i][j][0])
  133. if len(str_find)>0:
  134. set_match.add(inner_table[i][j][0])
  135. if len(set_match)>=count:
  136. is_head = True
  137. if len(inner_table[i][0][0])>40:
  138. is_long_value = True
  139. if is_head or is_long_value or is_same_value:
  140. if not is_head_last:
  141. head_list.append(i)
  142. if is_head:
  143. for j in range(width):
  144. inner_table[i][j][1] = 1
  145. is_head_last = is_head
  146. head_list.append(height)
  147. #列表头
  148. for i in range(len(head_list)-1):
  149. head_begin = head_list[i]
  150. head_end = head_list[i+1]
  151. #最后一列不设置为列表头
  152. for i in range(width-1):
  153. set_match = set()
  154. is_head = False
  155. for j in range(head_begin,head_end):
  156. if re.search(pat_value,inner_table[j][i][0]) is not None:
  157. is_head = False
  158. break
  159. str_find = re.findall(pattern,inner_table[j][i][0])
  160. if len(str_find)>0:
  161. set_match.add(inner_table[j][i][0])
  162. if len(set_match)>=count:
  163. is_head = True
  164. if is_head:
  165. for j in range(head_begin,head_end):
  166. inner_table[j][i][1] = 2
  167. return inner_table,head_list
  168. #取得表格的处理方向
  169. def getDirect(inner_table,begin,end):
  170. column_head = set()
  171. row_head = set()
  172. widths = len(inner_table[0])
  173. for height in range(begin,end):
  174. for width in range(widths):
  175. if inner_table[height][width][1] ==1:
  176. row_head.add(height)
  177. if inner_table[height][width][1] ==2:
  178. column_head.add(width)
  179. company_pattern = re.compile("公司")
  180. if 0 in column_head and begin not in row_head:
  181. return "column"
  182. if 0 in column_head and begin in row_head:
  183. for height in range(begin,end):
  184. count = 0
  185. count_flag = True
  186. for width_index in range(width):
  187. if inner_table[height][width_index][1]==0:
  188. if re.search(company_pattern,inner_table[height][width_index][0]) is not None:
  189. count += 1
  190. else:
  191. count_flag = False
  192. if count_flag and count>=2:
  193. return "column"
  194. return "row"
  195. #根据表格处理方向生成句子,
  196. def getTableText(inner_table,head_list):
  197. rankPattern = "(排名|排序|名次|评标结果|评审结果)"
  198. entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
  199. height = len(inner_table)
  200. width = len(inner_table[0])
  201. text = ""
  202. for head_i in range(len(head_list)-1):
  203. head_begin = head_list[head_i]
  204. head_end = head_list[head_i+1]
  205. direct = getDirect(inner_table, head_begin, head_end)
  206. if direct=="row":
  207. for i in range(head_begin,head_end):
  208. rank_text = ""
  209. entity_text = ""
  210. text_line = ""
  211. for j in range(width):
  212. cell = inner_table[i][j]
  213. #是属性值
  214. if cell[1]==0:
  215. find_flag = False
  216. head = ""
  217. temp_head = ""
  218. text_set = set()
  219. for loop_j in range(1,j+1):
  220. if inner_table[i][j-loop_j][1]==2:
  221. if find_flag:
  222. if inner_table[i][j-loop_j][0]!=temp_head:
  223. head = inner_table[i][j-loop_j][0]+":"+head
  224. else:
  225. head = inner_table[i][j-loop_j][0]+":"+head
  226. find_flag = True
  227. temp_head = inner_table[i][j-loop_j][0]
  228. else:
  229. if find_flag:
  230. break
  231. find_flag = False
  232. temp_head = ""
  233. for loop_i in range(0,i+1-head_begin):
  234. if inner_table[i-loop_i][j][1]==1:
  235. if find_flag:
  236. if inner_table[i-loop_i][j][0]!=temp_head:
  237. head = inner_table[i-loop_i][j][0]+":"+head
  238. else:
  239. head = inner_table[i-loop_i][j][0]+":"+head
  240. find_flag = True
  241. temp_head = inner_table[i-loop_i][j][0]
  242. else:
  243. if find_flag:
  244. break
  245. if str(head+inner_table[i][j][0]) in text_set:
  246. continue
  247. if re.search(rankPattern,head) is not None:
  248. rank_text += head+inner_table[i][j][0]+","
  249. #print(rank_text)
  250. elif re.search(entityPattern,head) is not None:
  251. entity_text += head+inner_table[i][j][0]+","
  252. #print(entity_text)
  253. else:
  254. text_line += head+inner_table[i][j][0]+","
  255. text_set.add(str(head+inner_table[i][j][0]))
  256. text += rank_text+entity_text+text_line
  257. text = text[:-1]+"。"
  258. else:
  259. for j in range(width):
  260. rank_text = ""
  261. entity_text = ""
  262. text_line = ""
  263. for i in range(head_begin,head_end):
  264. cell = inner_table[i][j]
  265. #是属性值
  266. if cell[1]==0:
  267. find_flag = False
  268. head = ""
  269. temp_head = ""
  270. text_set = set()
  271. for loop_j in range(1,j+1):
  272. if inner_table[i][j-loop_j][1]==2:
  273. if find_flag:
  274. if inner_table[i][j-loop_j][0]!=temp_head:
  275. head = inner_table[i][j-loop_j][0]+":"+head
  276. else:
  277. head = inner_table[i][j-loop_j][0]+":"+head
  278. find_flag = True
  279. temp_head = inner_table[i][j-loop_j][0]
  280. else:
  281. if find_flag:
  282. break
  283. find_flag = False
  284. temp_head = ""
  285. for loop_i in range(0,i+1-head_begin):
  286. if inner_table[i-loop_i][j][1]==1:
  287. if find_flag:
  288. if inner_table[i-loop_i][j][0]!=temp_head:
  289. head = inner_table[i-loop_i][j][0]+":"+head
  290. else:
  291. head = inner_table[i-loop_i][j][0]+":"+head
  292. find_flag = True
  293. temp_head = inner_table[i-loop_i][j][0]
  294. else:
  295. if find_flag:
  296. break
  297. if str(head+inner_table[i][j][0]) in text_set:
  298. continue
  299. if re.search(rankPattern,head) is not None:
  300. rank_text += head+inner_table[i][j][0]+","
  301. #print(rank_text)
  302. elif re.search(entityPattern,head) is not None:
  303. entity_text += head+inner_table[i][j][0]+","
  304. #print(entity_text)
  305. else:
  306. text_line += head+inner_table[i][j][0]+","
  307. text_set.add(str(head+inner_table[i][j][0]))
  308. text += rank_text+entity_text+text_line
  309. text = text[:-1]+"。"
  310. return text
  311. pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
  312. #pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
  313. pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
  314. tbodies = soup.find_all('tbody')
  315. if len(tbodies) == 0:
  316. tbodies = soup.find_all('table')
  317. # 遍历表格中的每个tbody
  318. #逆序处理嵌套表格
  319. for tbody_index in range(1,len(tbodies)+1):
  320. tbody = tbodies[len(tbodies)-tbody_index]
  321. fixSpan(tbody)
  322. inner_table = getTable(tbody)
  323. inner_table = fixTable(inner_table)
  324. if len(inner_table)>0 and len(inner_table[0])>0:
  325. inner_table,head_list = setHead(inner_table,pat_head,pat_value,3)
  326. tbody.string = getTableText(inner_table,head_list)
  327. #print(tbody.string)
  328. tbody.name = "table"
  329. return soup
  330. def getText(article):
  331. soup = BeautifulSoup(article,"lxml")
  332. soup = tableToText(soup)
  333. return soup.get_text()
  334. if __name__=="__main__":
  335. home = "C:\\Users\\User\\Desktop\\20190416要素\\*.html"
  336. data = []
  337. for file in glob.glob(home):
  338. article = codecs.open(file,"r",encoding="utf8").read()
  339. #text = getText(article)
  340. '''
  341. if len(text)<400:
  342. print(file.split("\\")[-1])
  343. continue
  344. '''
  345. data.append([file.split("\\")[-1],article])
  346. save(data,"data.pk")
  347. print("length:",len(data))
  348. '''
  349. data = load("data_zb.pk")
  350. print(len(data))
  351. a = set()
  352. index = 0
  353. for item in data:
  354. index += 1
  355. if item[0]=="比地_52_57160814.html":
  356. print("index",index)
  357. break
  358. '''