pdfparser.py 17 KB


  1. #coding:utf8
  2. from pdfminer.pdfparser import PDFParser
  3. from pdfminer.pdfdocument import PDFDocument
  4. from pdfminer.pdfpage import PDFPage
  5. from pdfminer.pdfpage import PDFTextExtractionNotAllowed
  6. from pdfminer.pdfinterp import PDFResourceManager
  7. from pdfminer.pdfinterp import PDFPageInterpreter
  8. from pdfminer.pdfdevice import PDFDevice
  9. from pdfminer.layout import *
  10. from pdfminer.converter import PDFPageAggregator
  11. import logging
# Configure the root logger at import time: INFO level, with timestamp,
# logger name and level prepended to every message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  13. class ParseDocument():
  14. def __init__(self,filepath):
  15. self.filename = filepath
  16. self.childs = []
  17. # Open a PDF file.
  18. fp = open(filepath, 'rb')
  19. # Create a PDF parser object associated with the file object.
  20. parser = PDFParser(fp)
  21. # Create a PDF document object that stores the document structure.
  22. # Supply the password for initialization.
  23. document = PDFDocument(parser)
  24. # Check if the document allows text extraction. If not, abort.
  25. if not document.is_extractable:
  26. raise PDFTextExtractionNotAllowed
  27. # Create a PDF resource manager object that stores shared resources.
  28. rsrcmgr = PDFResourceManager()
  29. # Create a PDF device object.
  30. laparams = LAParams(line_overlap=0.1,
  31. char_margin=0.1,
  32. line_margin=0.1,
  33. word_margin=0.1,
  34. boxes_flow=0.5,)
  35. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  36. # Create a PDF interpreter object.
  37. interpreter = PDFPageInterpreter(rsrcmgr, device)
  38. # Process each page contained in the document.
  39. page_no = 0
  40. for page in PDFPage.create_pages(document):
  41. interpreter.process_page(page)
  42. ltpage = device.get_result()
  43. page_no += 1
  44. logging.info("recognize page:%d"%page_no)
  45. self.childs.append(self.recognize(ltpage))
  46. # print(ltpage.__dict__)
  47. # ParsePage(ltpage).recognize_rect(ltpage)
  48. return
  49. def recognize(self,_page):
  50. _page = ParsePage(_page)
  51. return _page
  52. class ParsePage():
  53. def __init__(self,_page):
  54. self.childs = []
  55. self.list_tables = []
  56. self.list_sentences = []
  57. self.getFontinfo(_page)
  58. filter_objs = self.recognize_table(_page)
  59. self.recognize_sentences(_page,filter_objs)
  60. def recognize_table(self,_page,line_margin=0.2):
  61. list_rects = []
  62. list_textbox = []
  63. for _obj in _page._objs:
  64. if isinstance(_obj,(LTRect)):
  65. list_rects.append(_obj)
  66. elif isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
  67. list_textbox.append(_obj)
  68. #
  69. #clusters_rects = []
  70. # #根据y0聚类
  71. # list_rects.sort(key=lambda x:x.bbox[1])
  72. # for _rect in list_rects:
  73. # _y0 = _rect.bbox[1]
  74. # _find = False
  75. # for l_cr in clusters_rects:
  76. # if abs(l_cr[0].bbox[1]-_y0)<2:
  77. # _find = True
  78. # l_cr.append(_rect)
  79. # break
  80. # if not _find:
  81. # clusters_rects.append([_rect])
  82. #
  83. # clusters_rects.sort(key=lambda x:x[0].bbox[1])
  84. # for l_cr in clusters_rects:
  85. # l_cr.sort(key=lambda x:x.bbox[0])
  86. #
  87. # table_index = [0]
  88. # for i in range(1,len(clusters_rects)):
  89. # if abs(clusters_rects[i][0].bbox[1]-clusters_rects[i-1][0].bbox[3])>line_margin:
  90. # table_index.append(i)
  91. # table_index.append(len(clusters_rects))
  92. #
  93. # print("11111111111111111111111")
  94. # print(clusters_rects)
  95. # print("22222222222222222222222")
  96. #
  97. # in_objs = set()
  98. # for i in range(1,len(table_index)):
  99. # _begin = table_index[i-1]
  100. # _end = table_index[i]
  101. # _ta = self.rect2table(list_textbox,clusters_rects[_begin:_end],in_objs)
  102. # if _ta:
  103. # self.list_tables.append(_ta)
  104. in_objs = set()
  105. list_l_rect = self.recognize_rect(_page)
  106. for l_rect in list_l_rect:
  107. _ta = self.rect2table(list_textbox,l_rect,in_objs)
  108. if _ta:
  109. self.list_tables.append(_ta)
  110. return in_objs
  111. def recognize_crosspoints(self,list_line):
  112. from matplotlib import pyplot as plt
  113. list_crosspoints = []
  114. print("lines num",len(list_line))
  115. plt.figure()
  116. for _line in list_line:
  117. x0,y0,x1,y1 = _line.bbox
  118. plt.plot([x0,x1],[y0,y1])
  119. for _i in range(len(list_line)):
  120. for _j in range(len(list_line)):
  121. line1 = list_line[_i].bbox
  122. line2 = list_line[_j].bbox
  123. exists,point = self.cross_point(line1,line2)
  124. if exists:
  125. list_crosspoints.append(point)
  126. # plt.figure()
  127. # for _line in list_line:
  128. # x0,y0,x1,y1 = _line.bbox
  129. # plt.plot([x0,x1],[y0,y1])
  130. # for point in list_crosspoints:
  131. # plt.scatter(point.get("point")[0],point.get("point")[1])
  132. # plt.show()
  133. # print(list_crosspoints)
  134. # print("points num",len(list_crosspoints))
  135. return list_crosspoints
  136. def recognize_rect(self,_page):
  137. list_line = []
  138. for _obj in _page._objs:
  139. if isinstance(_obj,(LTLine)):
  140. list_line.append(_obj)
  141. list_crosspoints = self.recognize_crosspoints(list_line)
  142. #聚类
  143. cluster_crosspoints = []
  144. for _point in list_crosspoints:
  145. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  146. while 1:
  147. _find = False
  148. new_cluster_crosspoints = []
  149. for l_point in cluster_crosspoints:
  150. _flag = False
  151. for l_n_point in new_cluster_crosspoints:
  152. line1 = l_point.get("lines")
  153. line2 = l_n_point.get("lines")
  154. if len(line1&line2)>0:
  155. _find = True
  156. _flag = True
  157. l_n_point["lines"] = line1.union(line2)
  158. l_n_point["points"].extend(l_point["points"])
  159. if not _flag:
  160. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  161. cluster_crosspoints = new_cluster_crosspoints
  162. if not _find:
  163. break
  164. # print(len(cluster_crosspoints))
  165. list_l_rect = []
  166. for table_crosspoint in cluster_crosspoints:
  167. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  168. list_l_rect.append(list_rect)
  169. return list_l_rect
  170. def crosspoint2rect(self,list_crosspoint,margin=4):
  171. dict_line_points = {}
  172. for _point in list_crosspoint:
  173. lines = list(_point.get("lines"))
  174. for _line in lines:
  175. if _line not in dict_line_points:
  176. dict_line_points[_line] = {"direct":None,"points":[]}
  177. dict_line_points[_line]["points"].append(_point)
  178. #排序
  179. for k,v in dict_line_points.items():
  180. list_x = []
  181. list_y = []
  182. for _p in v["points"]:
  183. list_x.append(_p.get("point")[0])
  184. list_y.append(_p.get("point")[1])
  185. if max(list_x)-min(list_x)>max(list_y)-min(list_y):
  186. v.get("points").sort(key=lambda x:x.get("point")[0])
  187. v["direct"] = "row"
  188. else:
  189. v.get("points").sort(key=lambda x:x.get("point")[1])
  190. v["direct"] = "column"
  191. list_rect = []
  192. for _point in list_crosspoint:
  193. if _point["buttom"]>=margin and _point["right"]>=margin:
  194. lines = list(_point.get("lines"))
  195. _line = lines[0]
  196. if dict_line_points[_line]["direct"]=="column":
  197. _line = lines[1]
  198. next_point = None
  199. for p1 in dict_line_points[_line]["points"]:
  200. if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
  201. next_point = p1
  202. break
  203. if not next_point:
  204. continue
  205. lines = list(next_point.get("lines"))
  206. _line = lines[0]
  207. if dict_line_points[_line]["direct"]=="row":
  208. _line = lines[1]
  209. final_point = None
  210. for p1 in dict_line_points[_line]["points"]:
  211. if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
  212. final_point = p1
  213. break
  214. if not final_point:
  215. continue
  216. _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
  217. list_rect.append(_r)
  218. return list_rect
  219. def cross_point(self,line1, line2,segment=True,margin=2):
  220. point_is_exist = False
  221. x = y = 0
  222. x1,y1,x2,y2 = line1
  223. x3,y3,x4,y4 = line2
  224. if (x2 - x1) == 0:
  225. k1 = None
  226. b1 = 0
  227. else:
  228. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  229. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  230. if (x4 - x3) == 0: # L2直线斜率不存在
  231. k2 = None
  232. b2 = 0
  233. else:
  234. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  235. b2 = y3 * 1.0 - x3 * k2 * 1.0
  236. if k1 is None:
  237. if not k2 is None:
  238. x = x1
  239. y = k2 * x1 + b2
  240. point_is_exist = True
  241. elif k2 is None:
  242. x = x3
  243. y = k1 * x3 + b1
  244. elif not k2 == k1:
  245. x = (b2 - b1) * 1.0 / (k1 - k2)
  246. y = k1 * x * 1.0 + b1 * 1.0
  247. point_is_exist = True
  248. left = 0
  249. right = 0
  250. top = 0
  251. buttom = 0
  252. if point_is_exist:
  253. if segment:
  254. if x>=(min(x1,x2)-margin) and x<=(max(x1,x2)+margin) and y>=(min(y1,y2)-margin) and y<=(max(y1,y2)+margin):
  255. if x>=(min(x3,x4)-margin) and x<=(max(x3,x4)+margin) and y>=(min(y3,y4)-margin) and y<=(max(y3,y4)+margin):
  256. point_is_exist = True
  257. left = abs(min(x1,x3)-x)
  258. right = abs(max(x2,x4)-x)
  259. top = abs(min(y1,y3)-y)
  260. buttom = abs(max(y2,y4)-y)
  261. else:
  262. point_is_exist = False
  263. else:
  264. point_is_exist = False
  265. line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1,y1,x2,y2)
  266. line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3,y3,x4,y4)
  267. return point_is_exist, {"point":[x, y],"left":left,"right":right,"top":top,"buttom":buttom,"lines":set([line1_key,line2_key])}
  268. def rect2table(self,list_textbox,list_rect,in_objs,margin=0.2,fixspan=True):
  269. _table = []
  270. set_x = set()
  271. set_y = set()
  272. clusters_rects = []
  273. #根据y1聚类
  274. list_rect.sort(key=lambda x:x.bbox[3])
  275. for _rect in list_rect:
  276. _y0 = _rect.bbox[3]
  277. _find = False
  278. for l_cr in clusters_rects:
  279. if abs(l_cr[0].bbox[3]-_y0)<2:
  280. _find = True
  281. l_cr.append(_rect)
  282. break
  283. if not _find:
  284. clusters_rects.append([_rect])
  285. clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=True)
  286. for l_cr in clusters_rects:
  287. l_cr.sort(key=lambda x:x.bbox[0])
  288. for _line in clusters_rects:
  289. for _rect in _line:
  290. (x0,y0,x1,y1) = _rect.bbox
  291. set_x.add(x0)
  292. set_x.add(x1)
  293. set_y.add(y0)
  294. set_y.add(y1)
  295. if len(set_x)==0 or len(set_y)==0:
  296. return
  297. list_x = list(set_x)
  298. list_y = list(set_y)
  299. list_x.sort(key=lambda x:x)
  300. list_y.sort(key=lambda x:x,reverse=True)
  301. for _line in clusters_rects:
  302. table_line = []
  303. for _rect in _line:
  304. (x0,y0,x1,y1) = _rect.bbox
  305. _cell = {"bbox":(x0,y0,x1,y1),"rect":_rect,"rowspan":self.getspan(list_y,y0,y1,margin),"columnspan":self.getspan(list_x,x0,x1,margin),"text":""}
  306. table_line.append(_cell)
  307. _table.append(table_line)
  308. for textbox in list_textbox:
  309. (x0,y0,x1,y1) = textbox.bbox
  310. _text = textbox.get_text()
  311. _find = False
  312. for table_line in _table:
  313. for _cell in table_line:
  314. if self.inbox(textbox.bbox,_cell["bbox"]):
  315. _cell["text"]+= _text
  316. in_objs.add(textbox)
  317. _find = True
  318. break
  319. if _find:
  320. break
  321. if fixspan:
  322. for _line in _table:
  323. for c_i in range(len(_line)):
  324. _cell = _line[c_i]
  325. if _cell.get("columnspan")>1:
  326. _cospan = _cell.get("columnspan")
  327. _cell["columnspan"] = 1
  328. for i in range(1,_cospan):
  329. _line.insert(c_i)
  330. for l_i in range(len(_table)):
  331. _line = _table[l_i]
  332. for c_i in range(len(_line)):
  333. _cell = _line[c_i]
  334. if _cell.get("rowspan")>1:
  335. _rospan = _cell.get("rowspan")
  336. _cell["rowspan"] = 1
  337. for i in range(1,_rospan):
  338. _table[l_i+i].insert(c_i,_cell)
  339. # print("=======")
  340. # for _line in _table:
  341. # for _cell in _line:
  342. # print("[%s]"%_cell.get("text")[:10].replace("\n",''),end="\t\t")
  343. # print("\n")
  344. # print("===========")
  345. table_bbox = (_table[0][0].get("bbox")[0],_table[0][0].get("bbox")[1],_table[-1][-1].get("bbox")[2],_table[-1][-1].get("bbox")[3])
  346. ta = ParseTable(table_bbox,_table)
  347. return ta
  348. def inbox(self,bbox0,bbox_g):
  349. # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
  350. # return 1
  351. if self.getIOU(bbox0,bbox_g)>0.5:
  352. return 1
  353. return 0
  354. def getIOU(self,bbox0,bbox1):
  355. width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
  356. height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
  357. if width<0 and height<0:
  358. return abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
  359. return 0
  360. def getspan(self,_list,x0,x1,margin):
  361. _count = 0
  362. (x0,x1) = (min(x0,x1),max(x0,x1))
  363. for _x in _list:
  364. if _x>=(x0-margin) and _x<=(x1+margin):
  365. _count += 1
  366. return _count-1
  367. def getFontinfo(self,_page):
  368. for _obj in _page._objs:
  369. if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
  370. for textline in _obj._objs:
  371. done = False
  372. for lchar in textline._objs:
  373. if isinstance(lchar,(LTChar)):
  374. _obj.fontname = lchar.fontname
  375. _obj.fontsize = lchar.size
  376. done = True
  377. break
  378. if done:
  379. break
  380. def recognize_sentences(self,_page,filter_objs):
  381. for _obj in _page._objs:
  382. if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
  383. if _obj in filter_objs:
  384. continue
  385. self.list_sentences.append(ParseSentence(_obj.bbox,_obj.__dict__.get("fontname"),_obj.__dict__.get("fontsize"),_obj.get_text()))
  386. class ParseRect():
  387. def __init__(self,bbox):
  388. self.bbox = bbox
  389. class ParseTable():
  390. def __init__(self,bbox,list_table):
  391. self.table = list_table
  392. self.bbox = bbox
  393. class ParseSentence():
  394. def __init__(self,bbox,fontname,fontsize,_text):
  395. (x0,y0,x1,y1) = bbox
  396. self.x0 = x0
  397. self.y0 = y0
  398. self.x1 = x1
  399. self.y1 = y1
  400. self.box = bbox
  401. self.fontname = fontname
  402. self.fontsize = fontsize
  403. self.text = _text
  404. def rec_serial(self):
  405. #todo :recog the serial of the sentence
  406. pass
  407. if __name__ == '__main__':
  408. document = ParseDocument('8a9494757a859f17017e8aa443360235.pdf')