convert_pdf.py 56 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357
  1. import copy
  2. import inspect
  3. import io
  4. import logging
  5. import os
  6. import re
  7. import sys
  8. sys.path.append(os.path.dirname(__file__) + "/../")
  9. from pdfplumber import PDF
  10. from pdfplumber.table import TableFinder
  11. from pdfplumber.page import Page as pdfPage
  12. from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table
  13. import time
  14. import pdfminer
  15. from format_convert import timeout_decorator
  16. from PIL import Image
  17. from format_convert.convert_image import image_process
  18. from format_convert.convert_need_interface import from_ocr_interface, from_office_interface
  19. import traceback
  20. import cv2
  21. import PyPDF2
  22. from PyPDF2 import PdfFileReader, PdfFileWriter
  23. from pdfminer.pdfparser import PDFParser
  24. from pdfminer.pdfdocument import PDFDocument
  25. from pdfminer.pdfpage import PDFPage
  26. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  27. from pdfminer.converter import PDFPageAggregator
  28. from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
  29. LTTextBoxVertical, LTLine
  30. from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
  31. get_logger, log, memory_decorator,draw_lines_plt
  32. import fitz
  33. from format_convert.wrapt_timeout_decorator import timeout
  34. @memory_decorator
  35. def pdf2Image(path, save_dir):
  36. log("into pdf2Image")
  37. try:
  38. try:
  39. doc = fitz.open(path)
  40. except Exception as e:
  41. log("pdf format error!")
  42. # print("pdf format error!", e)
  43. return [-3]
  44. # output_image_list = []
  45. output_image_dict = {}
  46. page_count = doc.page_count
  47. for page_no in range(page_count):
  48. # 限制pdf页数,只取前10页后10页
  49. if page_count > 20:
  50. if 10 <= page_no < page_count - 10:
  51. # log("pdf2Image: pdf pages count " + str(doc.page_count)
  52. # + ", only get 70 pages")
  53. continue
  54. try:
  55. page = doc.loadPage(page_no)
  56. output = save_dir + "_page" + str(page_no) + ".png"
  57. rotate = int(0)
  58. # 每个尺寸的缩放系数为1.3,这将为我们生成分辨率提高2.6的图像。
  59. # 此处若是不做设置,默认图片大小为:792X612, dpi=96
  60. # (1.33333333 --> 1056x816) (2 --> 1584x1224)
  61. # (1.183, 2.28 --> 1920x1080)
  62. zoom_x = 3.
  63. zoom_y = 3.
  64. # mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  65. mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  66. pix = page.getPixmap(matrix=mat, alpha=False)
  67. pix.writePNG(output)
  68. pdf_image = cv2.imread(output)
  69. print("pdf_image", page_no, pdf_image.shape)
  70. # output_image_list.append([page_no, output])
  71. output_image_dict[int(page_no)] = output
  72. except ValueError as e:
  73. traceback.print_exc()
  74. if str(e) == "page not in document":
  75. log("pdf2Image page not in document! continue..." + str(page_no))
  76. continue
  77. elif "encrypted" in str(e):
  78. log("pdf2Image document need password " + str(page_no))
  79. return [-7]
  80. except RuntimeError as e:
  81. if "cannot find page" in str(e):
  82. log("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
  83. continue
  84. else:
  85. traceback.print_exc()
  86. return [-3]
  87. return [output_image_dict]
  88. except Exception as e:
  89. log("pdf2Image error!")
  90. print("pdf2Image", traceback.print_exc())
  91. return [-1]
  92. @timeout(10, timeout_exception=TimeoutError)
  93. def pdf_analyze(interpreter, page, device, page_no):
  94. log("into pdf_analyze")
  95. pdf_time = time.time()
  96. print("pdf_analyze interpreter process...")
  97. interpreter.process_page(page)
  98. print("pdf_analyze device get_result...")
  99. layout = device.get_result()
  100. log("pdf2text page " + str(page_no) + " read time " + str(time.time() - pdf_time))
  101. return layout
@memory_decorator
def pdf2text(path, unique_type_dir):
    """Extract the text of a PDF, mixing pdfminer parsing with image-based recognition.

    Pages are first rendered to images via ``pdf2Image`` and passed through
    ``image_process``. Pages flagged as containing a table reuse that result;
    other pages are parsed with pdfminer, falling back to
    ``from_ocr_interface`` on the rendered page image where needed.

    Args:
        path: Path of the PDF file.
        unique_type_dir: Directory passed to ``from_office_interface`` when a
            single page is converted to HTML.

    Returns:
        ``[text]`` on success, otherwise an error-code list: ``[-3]`` for
        unreadable PDFs, ``[-4]`` for a pdfminer page timeout, ``[-1]`` for
        unexpected failures, or whatever error list a helper returned.
    """
    log("into pdf2text")
    try:
        # Render the PDF pages to images with PyMuPDF.
        save_dir = path.split(".")[-2] + "_" + path.split(".")[-1]
        output_image_dict = pdf2Image(path, save_dir)
        if judge_error_code(output_image_dict):
            return output_image_dict
        output_image_dict = output_image_dict[0]
        output_image_no_list = list(output_image_dict.keys())
        output_image_no_list.sort(key=lambda x: x)

        # Per page: extracted text, table column counts, outline points,
        # whether the page contains a table, and the image shape.
        # page_info_list = []
        page_info_dict = {}
        has_table_dict = {}
        # NOTE(review): no_table_dict is filled but never read in this function.
        no_table_dict = {}
        for page_no in output_image_no_list:
            img_path = output_image_dict.get(page_no)
            print("pdf page", page_no, "in total", output_image_no_list[-1])
            # Skip pages whose image cannot be read.
            try:
                img = cv2.imread(img_path)
                img_size = img.shape
            except:
                log("pdf2text read image in page fail! continue...")
                continue

            # Process each page image.
            text, column_list, outline_points, is_table = image_process(img, img_path, use_ocr=False)
            if judge_error_code(text):
                return text
            # page_info_list.append([text, column_list, outline_points, is_table,
            # page_no, img_size])
            page_info = [text, column_list, outline_points, is_table, img_size]
            page_info_dict[int(page_no)] = page_info
            # Split pages into those with a table and those without.
            if is_table:
                has_table_dict[int(page_no)] = page_info
            else:
                no_table_dict[int(page_no)] = page_info

        has_table_no_list = list(has_table_dict.keys())
        has_table_no_list.sort(key=lambda x: x)
        page_no_list = list(page_info_dict.keys())
        page_no_list.sort(key=lambda x: x)

        # Join tables that continue across pages.
        table_connect_list, connect_text_list = page_table_connect(has_table_dict)
        if judge_error_code(table_connect_list):
            return table_connect_list

        # Page numbers of the joined areas.
        # NOTE(review): table_connect_page_no_list is built but not read afterwards.
        table_connect_page_no_list = []
        for area in connect_text_list:
            table_connect_page_no_list.append(area[1])
        print("pdf2text table_connect_list", table_connect_list)
        print("connect_text_list", connect_text_list)

        # pdfminer path.
        try:
            # NOTE(review): fp is never closed in this function.
            fp = open(path, 'rb')
            # Create a PDF parser from the file object.
            parser = PDFParser(fp)
            # Create the PDF document.
            doc = PDFDocument(parser)
            # Wire up resource manager, layout device and interpreter.
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Probe whether the PDF can be read at all.
            for page in PDFPage.create_pages(doc):
                break
        except pdfminer.psparser.PSEOF as e:
            # pdfminer cannot read blank-page objects; run OCR on the images
            # rendered by PyMuPDF instead.
            log("pdf2text " + str(e) + " use ocr read pdf!")
            text_list = []
            for page_no in page_no_list:
                log("pdf2text ocr page_no " + str(page_no))
                page_info = page_info_dict.get(page_no)
                # Page with a table.
                if page_info[3]:
                    # Check whether the table is part of a cross-page area.
                    area_no = 0
                    jump_page = 0
                    for area in table_connect_list:
                        if page_no in area:
                            # Record the joined text only once, on the first page.
                            if page_no == area[0]:
                                image_text = connect_text_list[area_no][0]
                                text_list.append([image_text, page_no, 0])
                            jump_page = 1
                        area_no += 1

                    # Pages that belong to a joined area skip the steps below.
                    if jump_page:
                        continue

                    # Use the already extracted text directly.
                    image_text = page_info_dict.get(page_no)[0]
                    text_list.append([image_text, page_no, 0])
                # Page without a table.
                else:
                    with open(output_image_dict.get(page_no), "rb") as ff:
                        image_stream = ff.read()
                    image_text = from_ocr_interface(image_stream)
                    text_list.append([image_text, page_no, 0])

            text_list.sort(key=lambda z: z[1])
            text = ""
            for t in text_list:
                text += t[0]
            return [text]
        except Exception as e:
            log("pdf format error!")
            traceback.print_exc()
            return [-3]

        text_list = []
        page_no = 0
        pages = PDFPage.create_pages(doc)
        pages = list(pages)
        page_count = len(pages)
        for page in pages:
            log("pdf2text pymupdf page_no " + str(page_no))
            # Page limit: only the first 10 and last 10 pages are handled.
            # if page_no >= 70:
            # log("pdf2text: pdf pages only get 70 pages")
            # break
            if page_count > 20:
                if 10 <= page_no < page_count - 10:
                    page_no += 1
                    continue

            # Page is among the table pages: reuse the text produced earlier.
            if page_no in has_table_no_list:
                # Check whether the table is part of a cross-page area.
                area_no = 0
                jump_page = 0
                for area in table_connect_list:
                    if page_no in area:
                        # Record the joined text only once, on the first page.
                        if page_no == area[0]:
                            image_text = connect_text_list[area_no][0]
                            text_list.append([image_text, page_no, 0])
                        jump_page = 1
                    area_no += 1

                # Pages that belong to a joined area skip the steps below.
                if jump_page:
                    page_no += 1
                    continue

                # Use the already extracted text directly.
                image_text = has_table_dict.get(page_no)[0]
                text_list.append([image_text, page_no, 0])
                page_no += 1
                continue

            # No table: parse the page with pdfminer.
            else:
                if get_platform() == "Windows":
                    try:
                        interpreter.process_page(page)
                        layout = device.get_result()
                    except Exception:
                        log("pdf2text pdfminer read pdf page error! continue...")
                        # NOTE(review): page_no is not incremented on this path,
                        # unlike the other `continue` paths above — confirm.
                        continue
                else:
                    # Run with a timeout.
                    try:
                        # Parse a page that has no table.
                        # NOTE(review): this inner Windows check sits in the
                        # non-Windows branch, and the unwrapped call passes three
                        # arguments while pdf_analyze takes four — confirm.
                        if get_platform() == "Windows":
                            origin_pdf_analyze = pdf_analyze.__wrapped__
                            layout = origin_pdf_analyze(interpreter, page, device)
                        else:
                            layout = pdf_analyze(interpreter, page, device, page_no)
                    except TimeoutError as e:
                        log("pdf2text pdfminer read pdf page time out!")
                        return [-4]
                    except Exception:
                        log("pdf2text pdfminer read pdf page error! continue...")
                        # NOTE(review): page_no is not incremented here either.
                        continue

                # Check whether the page has any text object; if it has none
                # it may only hold a watermark.
                only_image = 1
                image_count = 0
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        only_image = 0
                    if isinstance(x, LTFigure):
                        image_count += 1

                # Too many figures on the page: recognise the whole page image.
                log("pdf2text image_count " + str(image_count))
                if image_count >= 3:
                    # NOTE(review): page_info_dict.get(page_no) is None when the
                    # page image could not be read earlier — confirm handling.
                    image_text = page_info_dict.get(page_no)[0]
                    if image_text is None:
                        with open(output_image_dict.get(page_no), "rb") as ff:
                            image_stream = ff.read()
                        image_text = from_ocr_interface(image_stream)
                        if judge_error_code(image_text):
                            return image_text
                        page_info_dict[page_no][0] = image_text

                    text_list.append([image_text, page_no, 0])
                    page_no += 1
                    continue

                # Entries are [text, page_no, y]; y == -1 marks text obtained
                # from the whole-page image rather than from the object itself.
                order_list = []
                for x in layout:
                    # Whether this object was recognised from the page image.
                    ocr_flag = 0

                    if get_platform() == "Windows":
                        # print("x", page_no, x)
                        print()

                    if isinstance(x, LTTextBoxHorizontal):
                        image_text = x.get_text()

                        # Undecodable font encoding ("(cid:N)"): fall back to
                        # the page-image text and discard the other objects.
                        if re.search('[(]cid:[0-9]+[)]', image_text):
                            print(re.search('[(]cid:[0-9]+[)]', image_text))
                            image_text = page_info_dict.get(page_no)[0]
                            if image_text is None:
                                with open(output_image_dict.get(page_no), "rb") as ff:
                                    image_stream = ff.read()
                                image_text = from_ocr_interface(image_stream)
                                if judge_error_code(image_text):
                                    return image_text
                                page_info_dict[page_no][0] = image_text
                            image_text = add_div(image_text)
                            # order_list.append([image_text, page_no, x.bbox[1]])
                            order_list = [[image_text, page_no, x.bbox[1]]]
                            break
                        else:
                            image_text = add_div(image_text)
                            order_list.append([image_text, page_no, x.bbox[1]])
                            continue

                    if isinstance(x, LTFigure):
                        for image in x:
                            if isinstance(image, LTImage):
                                try:
                                    print("pdf2text LTImage size", page_no, image.width, image.height)
                                    image_stream = image.stream.get_data()

                                    # Ignore small images.
                                    if image.width <= 300 and image.height <= 300:
                                        continue

                                    # Some watermarks make splitting/reading the PDF fail.
                                    # if image.width <= 200 and image.height<=200:
                                    # continue

                                    # img_test = Image.open(io.BytesIO(image_stream))
                                    # img_test.save('temp/LTImage.jpg')

                                    # Check the extracted image size; if it is too
                                    # large, raise so the rendered page image is
                                    # used for OCR instead.
                                    img_test = Image.open(io.BytesIO(image_stream))
                                    if img_test.size[1] > 2000 or img_test.size[0] > 1500:
                                        print("pdf2text LTImage stream output size", img_test.size)
                                        raise Exception
                                    # Smaller images are saved and sent to OCR directly.
                                    else:
                                        # NOTE(review): fixed relative path 'temp/LTImage.jpg'
                                        # is shared by every call — confirm this is safe.
                                        img_test.save('temp/LTImage.jpg')
                                        with open('temp/LTImage.jpg', "rb") as ff:
                                            image_stream = ff.read()
                                        image_text = from_ocr_interface(image_stream)
                                        if judge_error_code(image_text):
                                            return image_text
                                # except pdfminer.pdftypes.PDFNotImplementedError:
                                # with open(output_image_list[page_no], "rb") as ff:
                                # image_stream = ff.read()
                                except Exception:
                                    log("pdf2text pdfminer read image in page " + str(page_no) +
                                        " fail! use pymupdf read image...")
                                    # print(traceback.print_exc())
                                    image_text = page_info_dict.get(page_no)[0]
                                    if image_text is None:
                                        with open(output_image_dict.get(page_no), "rb") as ff:
                                            image_stream = ff.read()
                                        image_text = from_ocr_interface(image_stream)
                                        if judge_error_code(image_text):
                                            return image_text
                                        page_info_dict[page_no][0] = image_text
                                    ocr_flag = 1

                                # Probably only a watermark image was obtained:
                                # no text output and the page holds only images.
                                if image_text == "" and only_image:
                                    # Split this page out of the PDF.
                                    try:
                                        log("pdf2text guess pdf has watermark")
                                        split_path = get_single_pdf(path, page_no)
                                    except:
                                        # If splitting raises, it is most likely not a
                                        # watermark; recognise the page image instead.
                                        log("pdf2text guess pdf has no watermark")
                                        image_text = page_info_dict.get(page_no)[0]
                                        if image_text is None:
                                            with open(output_image_dict.get(page_no), "rb") as ff:
                                                image_stream = ff.read()
                                            image_text = from_ocr_interface(image_stream)
                                        order_list.append([image_text, page_no, -1])
                                        page_info_dict[page_no][0] = image_text
                                        ocr_flag = 1
                                        continue
                                    if judge_error_code(split_path):
                                        return split_path

                                    # Convert the single page through the office interface.
                                    file_path = from_office_interface(split_path, unique_type_dir, 'html', 3)
                                    # if file_path == [-3]:
                                    # return [-3]
                                    if judge_error_code(file_path):
                                        return file_path

                                    # Read the text of the generated HTML.
                                    image_text = get_html_p(file_path)
                                    if judge_error_code(image_text):
                                        return image_text

                                if get_platform() == "Windows":
                                    print("image_text", page_no, x.bbox[1], image_text)
                                    with open("temp" + str(x.bbox[0]) + ".jpg", "wb") as ff:
                                        ff.write(image_stream)
                                image_text = add_div(image_text)
                                if ocr_flag:
                                    order_list.append([image_text, page_no, -1])
                                else:
                                    order_list.append([image_text, page_no, x.bbox[1]])

                # Sort by y descending, so y == -1 entries end up last.
                order_list.sort(key=lambda z: z[2], reverse=True)

                # Page-image text took part in the recognition.
                # NOTE(review): order_list[-1] raises IndexError when the page
                # produced no entries; the outer handler then returns [-1].
                if order_list[-1][2] == -1:
                    ocr_order_list = [order_list[-1]]
                    not_ocr_order_list = []
                    not_ocr_text = ""
                    # De-duplicate: the same content may have been fetched twice
                    # because a read failed.
                    for order in order_list:
                        if order[2] != -1:
                            not_ocr_order_list.append(order)
                            not_ocr_text += order[0]
                    if string_similarity(ocr_order_list[0][0], not_ocr_text) >= 0.85:
                        order_list = not_ocr_order_list
                    else:
                        order_list = ocr_order_list

                for order in order_list:
                    text_list.append(order)
            page_no += 1

        text = ""
        for t in text_list:
            # text += add_div(t[0])
            if t[0] is not None:
                text += t[0]
        return [text]

    except UnicodeDecodeError as e:
        log("pdf2text pdfminer create pages failed! " + str(e))
        return [-3]
    except Exception as e:
        log("pdf2text error!")
        print("pdf2text", traceback.print_exc())
        return [-1]
  435. def get_single_pdf(path, page_no):
  436. log("into get_single_pdf")
  437. try:
  438. # print("path, ", path)
  439. pdf_origin = PdfFileReader(path, strict=False)
  440. pdf_new = PdfFileWriter()
  441. pdf_new.addPage(pdf_origin.getPage(page_no))
  442. path_new = path.split(".")[0] + "_split.pdf"
  443. with open(path_new, "wb") as ff:
  444. pdf_new.write(ff)
  445. return path_new
  446. except PyPDF2.utils.PdfReadError as e:
  447. raise e
  448. except Exception as e:
  449. log("get_single_pdf error! page " + str(page_no))
  450. print("get_single_pdf", traceback.print_exc())
  451. raise e
  452. def page_table_connect(has_table_dict):
  453. log("into page_table_connect")
  454. if not has_table_dict:
  455. return [], []
  456. try:
  457. # 判断是否有页码的表格相连
  458. table_connect_list = []
  459. temp_list = []
  460. # 离图片顶部或底部距离,页面高度的1/7
  461. threshold = 7
  462. page_no_list = list(has_table_dict.keys())
  463. page_no_list.sort(key=lambda x: x)
  464. for i in range(1, len(page_no_list)):
  465. page_info = has_table_dict.get(page_no_list[i])
  466. last_page_info = has_table_dict.get(page_no_list[i - 1])
  467. # 页码需相连
  468. if page_no_list[i] - page_no_list[i - 1] == 1:
  469. # 上一页最后一个区域的列数和下一页第一个区域列数都为0,且相等
  470. if not last_page_info[1][-1] and not page_info[1][0] and \
  471. last_page_info[1][-1] == page_info[1][0]:
  472. # 上一页的轮廓点要离底部一定距离内,下一页的轮廓点要离顶部一定距离内
  473. if last_page_info[4][0] - last_page_info[2][-1][1][1] \
  474. <= int(last_page_info[4][0] / threshold) \
  475. and page_info[2][0][0][1] - 0 \
  476. <= int(page_info[4][0] / threshold):
  477. temp_list.append(page_no_list[i - 1])
  478. temp_list.append(page_no_list[i])
  479. continue
  480. # 条件不符合的,存储之前保存的连接页码
  481. if len(temp_list) > 1:
  482. temp_list = list(set(temp_list))
  483. temp_list.sort(key=lambda x: x)
  484. table_connect_list.append(temp_list)
  485. temp_list = []
  486. if len(temp_list) > 1:
  487. temp_list = list(set(temp_list))
  488. temp_list.sort(key=lambda x: x)
  489. table_connect_list.append(temp_list)
  490. temp_list = []
  491. # 连接两页内容
  492. connect_text_list = []
  493. for area in table_connect_list:
  494. first_page_no = area[0]
  495. area_page_text = str(has_table_dict.get(first_page_no)[0])
  496. for i in range(1, len(area)):
  497. current_page_no = area[i]
  498. current_page_text = str(has_table_dict.get(current_page_no)[0])
  499. # 连接两个table
  500. table_prefix = re.finditer('<table border="1">', current_page_text)
  501. index_list = []
  502. for t in table_prefix:
  503. index_list.append(t.span())
  504. delete_index = index_list[0]
  505. current_page_text = current_page_text[:delete_index[0]] \
  506. + current_page_text[delete_index[1]:]
  507. table_suffix = re.finditer('</table>', area_page_text)
  508. index_list = []
  509. for t in table_suffix:
  510. index_list.append(t.span())
  511. delete_index = index_list[-1]
  512. area_page_text = area_page_text[:delete_index[0]] \
  513. + area_page_text[delete_index[1]:]
  514. area_page_text = area_page_text + current_page_text
  515. connect_text_list.append([area_page_text, area])
  516. return table_connect_list, connect_text_list
  517. except Exception as e:
  518. # print("page_table_connect", e)
  519. log("page_table_connect error!")
  520. print("page_table_connect", traceback.print_exc())
  521. return [-1], [-1]
  522. @timeout(30, timeout_exception=TimeoutError)
  523. def read_pdf(path, package_name, packages):
  524. log(package_name)
  525. laparams = LAParams(line_overlap=0.01,
  526. char_margin=0.3,
  527. line_margin=0.01,
  528. word_margin=0.01,
  529. boxes_flow=0.1,)
  530. if package_name == packages[0]:
  531. fp = open(path, 'rb')
  532. parser = PDFParser(fp)
  533. doc_pdfminer = PDFDocument(parser)
  534. rsrcmgr = PDFResourceManager()
  535. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  536. interpreter = PDFPageInterpreter(rsrcmgr, device)
  537. return doc_pdfminer, device, interpreter
  538. elif package_name == packages[1]:
  539. doc_pymupdf = fitz.open(path)
  540. return doc_pymupdf
  541. elif package_name == packages[2]:
  542. doc_pypdf2 = PdfFileReader(path, strict=False)
  543. doc_pypdf2_new = PdfFileWriter()
  544. return doc_pypdf2, doc_pypdf2_new
  545. elif package_name == packages[3]:
  546. fp = open(path, 'rb')
  547. lt = LineTable()
  548. doc_top = 0
  549. doc_pdfplumber = read_pdfplumber(fp, laparams)
  550. return lt, doc_top, doc_pdfplumber
  551. @timeout(25, timeout_exception=TimeoutError)
  552. def read_pdfminer(path, laparams):
  553. fp = open(path, 'rb')
  554. parser = PDFParser(fp)
  555. doc_pdfminer = PDFDocument(parser)
  556. rsrcmgr = PDFResourceManager()
  557. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  558. interpreter = PDFPageInterpreter(rsrcmgr, device)
  559. return doc_pdfminer, device, interpreter
  560. @timeout(15, timeout_exception=TimeoutError)
  561. def read_pymupdf(path):
  562. return fitz.open(path)
  563. @timeout(15, timeout_exception=TimeoutError)
  564. def read_pypdf2(path):
  565. doc_pypdf2 = PdfFileReader(path, strict=False)
  566. doc_pypdf2_new = PdfFileWriter()
  567. return doc_pypdf2, doc_pypdf2_new
  568. @timeout(25, timeout_exception=TimeoutError, use_signals=False)
  569. def read_pdfplumber(path, laparams):
  570. fp = open(path, 'rb')
  571. lt = LineTable()
  572. doc_top = 0
  573. doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
  574. return lt, doc_top, doc_pdfplumber
  575. class PDFConvert:
  576. def __init__(self, path, unique_type_dir):
  577. self._doc = _Document(path)
  578. self.path = path
  579. self.unique_type_dir = unique_type_dir
  580. if not os.path.exists(self.unique_type_dir):
  581. os.mkdir(self.unique_type_dir)
  582. self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
  583. self.has_init_pdf = [0] * len(self.packages)
  584. @memory_decorator
  585. def init_package(self, package_name):
  586. # 各个包初始化
  587. try:
  588. laparams = LAParams(line_overlap=0.01,
  589. char_margin=0.3,
  590. line_margin=0.01,
  591. word_margin=0.01,
  592. boxes_flow=0.1,)
  593. if package_name == self.packages[0]:
  594. # fp = open(self.path, 'rb')
  595. # parser = PDFParser(fp)
  596. # self.doc_pdfminer = PDFDocument(parser)
  597. # rsrcmgr = PDFResourceManager()
  598. # self.laparams = LAParams(line_overlap=0.01,
  599. # char_margin=0.3,
  600. # line_margin=0.01,
  601. # word_margin=0.01,
  602. # boxes_flow=0.1,)
  603. # self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
  604. # self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
  605. self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
  606. self.has_init_pdf[0] = 1
  607. elif package_name == self.packages[1]:
  608. self.doc_pymupdf = read_pymupdf(self.path)
  609. self.has_init_pdf[1] = 1
  610. elif package_name == self.packages[2]:
  611. # self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
  612. # self.doc_pypdf2_new = PdfFileWriter()
  613. self.doc_pypdf2, self.doc_pypdf2_new = read_pypdf2(self.path)
  614. self.has_init_pdf[2] = 1
  615. elif package_name == self.packages[3]:
  616. # self.fp = open(self.path, 'rb')
  617. # self.lt = LineTable()
  618. # self.doc_top = 0
  619. # self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
  620. self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
  621. self.has_init_pdf[3] = 0
  622. else:
  623. print("Only Support Packages", str(self.packages))
  624. raise Exception
  625. except Exception as e:
  626. log(package_name + " cannot open pdf!")
  627. traceback.print_exc()
  628. self._doc.error_code = [-3]
  629. def convert(self):
  630. if self.has_init_pdf[0] == 0:
  631. self.init_package("pdfminer")
  632. if self._doc.error_code is not None:
  633. self._doc.error_code = None
  634. # pdfminer读不了直接转成图片识别
  635. self.get_all_page_image()
  636. return
  637. # 判断是否能读pdf
  638. try:
  639. pages = PDFPage.create_pages(self.doc_pdfminer)
  640. for page in pages:
  641. break
  642. pages = list(pages)
  643. # except pdfminer.psparser.PSEOF as e:
  644. except:
  645. # pdfminer 读不了空白页的对象,直接使用pymupdf转换出的图片进行ocr识别
  646. log("pdf2text pdfminer read failed! read by pymupdf!")
  647. traceback.print_exc()
  648. try:
  649. self.get_all_page_image()
  650. return
  651. except:
  652. traceback.print_exc()
  653. log("pdf2text use pymupdf read failed!")
  654. self._doc.error_code = [-3]
  655. return
  656. # 每一页进行处理
  657. pages = PDFPage.create_pages(self.doc_pdfminer)
  658. pages = list(pages)
  659. page_count = len(pages)
  660. page_no = 0
  661. for page in pages:
  662. # 限制pdf页数,只取前后各10页
  663. if page_count > 20:
  664. if 10 <= page_no < page_count - 10:
  665. page_no += 1
  666. continue
  667. self._page = _Page(page, page_no)
  668. # 解析单页
  669. self.convert_page(page, page_no)
  670. # print("+"*30, page.resources)
  671. if self._doc.error_code is None and self._page.error_code is not None:
  672. if self._page.error_code[0] in [-4, -3, 0]:
  673. page_no += 1
  674. continue
  675. else:
  676. self._doc.error_code = self._page.error_code
  677. break
  678. self._doc.add_child(self._page)
  679. page_no += 1
  680. def clean_text(self,_text):
  681. return re.sub("\s","",_text)
  682. def convert_page(self, page, page_no):
  683. # pdf page.annots为None,不经过get_layout,直接ocr
  684. # if page.annots is None:
  685. # lt_image_list = []
  686. # lt_text_list = []
  687. # # 设置只有图片,可跳到ocr
  688. # only_image = 1
  689. # image_count = 1
  690. # else:
  691. layout = self.get_layout(page, page_no)
  692. if self._doc.error_code is not None:
  693. return
  694. if judge_error_code(layout):
  695. self._page.error_code = layout
  696. return
  697. # 判断该页的对象类型,并存储
  698. # only_image = 1
  699. # image_count = 0
  700. lt_text_list = []
  701. lt_image_list = []
  702. for x in layout:
  703. if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
  704. # only_image = 0
  705. lt_text_list.append(x)
  706. if isinstance(x, LTFigure):
  707. for y in x:
  708. if isinstance(y, LTImage):
  709. lt_image_list.append(y)
  710. # image_count += 1
  711. lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
  712. print("convert_pdf page", page_no)
  713. log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
  714. # 若只有文本且图片数为0,直接提取文字及表格
  715. # if only_image == 0 and image_count == 0:
  716. if len(lt_image_list) == 0 and len(lt_text_list) > 0:
  717. # PDFPlumber
  718. if self.has_init_pdf[3] == 0:
  719. self.init_package("pdfplumber")
  720. if self._doc.error_code is not None:
  721. self._doc.error_code = None
  722. log("init pdfplumber failed! try pymupdf...")
  723. # 调用pdfplumber获取pdf图片报错,则使用pypdf2将pdf转html
  724. page_image = self.get_page_image(page_no)
  725. if judge_error_code(page_image):
  726. self._page.error_code = page_image
  727. else:
  728. _image = _Image(page_image[1], page_image[0])
  729. self._page.add_child(_image)
  730. return
  731. # 无法识别pdf字符编码,整页用ocr
  732. text_temp = ""
  733. for _t in lt_text_list:
  734. text_temp += _t.get_text()
  735. if re.search('[(]cid:[0-9]+[)]', text_temp):
  736. log("text has cid! try pymupdf...")
  737. page_image = self.get_page_image(page_no)
  738. if judge_error_code(page_image):
  739. self._page.error_code = page_image
  740. else:
  741. _image = _Image(page_image[1], page_image[0])
  742. self._page.add_child(_image)
  743. return
  744. try:
  745. lt_line_list = []
  746. page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
  747. self.doc_top += page_plumber.height
  748. table_finder = TableFinder(page_plumber)
  749. for _edge in table_finder.get_edges():
  750. lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
  751. (float(_edge["x1"]), float(_edge["y1"]))))
  752. #draw lines to check
  753. # draw_lines_plt([l.bbox for l in lt_line_list])
  754. list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
  755. self._page.in_table_objs = filter_objs
  756. for table in list_tables:
  757. _table = _Table(table["table"], table["bbox"])
  758. # self._page.children.append(_table)
  759. self._page.add_child(_table)
  760. list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
  761. layout.bbox, page_no)
  762. for sentence in list_sentences:
  763. _sen = _Sentence(sentence.text, sentence.bbox)
  764. self._page.add_child(_sen)
  765. # pdf对象需反向排序
  766. self._page.is_reverse = True
  767. except:
  768. traceback.print_exc()
  769. self._page.error_code = [-8]
  770. # 若该页图片数量过多,或无文本,则直接ocr整页识别
  771. # elif image_count > 3 or only_image == 1:
  772. elif len(lt_image_list) > 3 or len(lt_text_list) == 0:
  773. page_image = self.get_page_image(page_no)
  774. if judge_error_code(page_image):
  775. self._page.error_code = page_image
  776. else:
  777. _image = _Image(page_image[1], page_image[0])
  778. _image.is_from_pdf = True
  779. self._page.add_child(_image)
  780. # 正常读取该页对象
  781. else:
  782. # 文本对象
  783. for x in lt_text_list:
  784. # 获取对象文本
  785. object_text = x.get_text()
  786. # 无法识别pdf字符编码,整页用ocr
  787. if re.search('[(]cid:[0-9]+[)]', object_text):
  788. page_image = self.get_page_image(page_no)
  789. if judge_error_code(page_image):
  790. self._page.error_code = page_image
  791. else:
  792. _image = _Image(page_image[1], page_image[0])
  793. self._page.add_child(_image)
  794. return
  795. else:
  796. _sen = _Sentence(object_text, x.bbox)
  797. # _sen.x = x.bbox[0]
  798. # _sen.y = x.bbox[1]
  799. self._page.add_child(_sen)
  800. # 图表对象
  801. for image in lt_image_list:
  802. try:
  803. print("pdf2text LTImage size", page_no, image.width, image.height)
  804. image_stream = image.stream.get_data()
  805. # 小的图忽略
  806. if image.width <= 300 and image.height <= 300:
  807. continue
  808. # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
  809. img_test = Image.open(io.BytesIO(image_stream))
  810. if img_test.size[1] > 2000 or img_test.size[0] > 1500:
  811. print("pdf2text LTImage stream output size", img_test.size)
  812. page_image = self.get_page_image(page_no)
  813. if judge_error_code(page_image):
  814. self._page.error_code = page_image
  815. else:
  816. _image = _Image(page_image[1], page_image[0])
  817. _image.is_from_pdf = True
  818. self._page.add_child(_image)
  819. return
  820. # 比较小的图则直接保存用ocr识别
  821. else:
  822. temp_path = self.unique_type_dir + 'page' + str(page_no) \
  823. + '_lt' + str(lt_image_list.index(image)) + '.jpg'
  824. img_test.save(temp_path)
  825. with open(temp_path, "rb") as ff:
  826. image_stream = ff.read()
  827. _image = _Image(image_stream, temp_path, image.bbox)
  828. self._page.add_child(_image)
  829. except Exception:
  830. log("pdf2text pdfminer read image in page " + str(page_no) +
  831. " fail! use pymupdf read image...")
  832. print(traceback.print_exc())
  833. # pdf对象需反向排序
  834. self._page.is_reverse = True
  def get_layout(self, page, page_no):
      """Run the pdfminer layout analysis for a single page.

      The pdfminer objects are initialised lazily on first use.

      :param page: page object forwarded to ``pdf_analyze``.
      :param page_no: page index, forwarded to ``pdf_analyze`` and used in logs.
      :return: the value returned by ``pdf_analyze`` on success, ``[-4]`` when
          the analysis raises ``TimeoutError``, ``[-3]`` on any other
          exception, or ``None`` when the document already has an error code.
      """
      log("")
      if self.has_init_pdf[0] == 0:
          self.init_package("pdfminer")
      if self._doc.error_code is not None:
          return None

      # Analyse the layout of this page. The call is identical on every
      # platform, so no platform switch is needed here.
      start_time = time.time()
      try:
          layout = pdf_analyze(self.interpreter, page, self.device, page_no)
      except TimeoutError:
          log("pdf2text pdfminer read pdf page " + str(page_no) + " time out! " + str(time.time() - start_time))
          layout = [-4]
      except Exception:
          traceback.print_exc()
          log("pdf2text pdfminer read pdf page " + str(page_no) + " error! continue...")
          layout = [-3]
      return layout
  def get_page_image(self, page_no):
      """Render one page to a PNG file with PyMuPDF and return its bytes.

      The page is rendered at 2x zoom into
      ``<unique_type_dir>page<page_no>.png``, passed through
      ``resize_image`` and read back from disk.

      :param page_no: index of the page to render.
      :return: ``[output_path, png_bytes]`` on success; ``[0]`` when the page
          is not in the document; ``[-7]`` when the document is encrypted;
          ``[-3]`` for any other ``ValueError``/``RuntimeError``; ``None``
          when the document already carries an error code.
      """
      log("")
      try:
          if self.has_init_pdf[1] == 0:
              self.init_package("PyMuPDF")
          if self._doc.error_code is not None:
              return None

          output = self.unique_type_dir + "page" + str(page_no) + ".png"
          page = self.doc_pymupdf.loadPage(page_no)
          # No rotation, 2x zoom on both axes.
          mat = fitz.Matrix(2., 2.).preRotate(0)
          pix = page.getPixmap(matrix=mat, alpha=False)
          pix.writePNG(output)

          # Shrink the rendered image in place when it is too large.
          self.resize_image(output)
          with open(output, "rb") as f:
              pdf_image = f.read()
          return [output, pdf_image]
      except ValueError as e:
          traceback.print_exc()
          message = str(e)
          if message == "page not in document":
              log("pdf2Image page not in document! continue... page " + str(page_no))
              return [0]
          if "encrypted" in message:
              log("pdf2Image document need password " + str(page_no))
              return [-7]
          # Any other ValueError must still yield an error code: callers
          # index the result, so an implicit None would crash them.
          return [-3]
      except RuntimeError as e:
          if "cannot find page" in str(e):
              log("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
              return [0]
          traceback.print_exc()
          return [-3]
  def get_all_page_image(self):
      """Render pages to images and attach them to the document.

      For documents longer than 20 pages only the first 10 and the last 10
      pages are rendered. Each rendered page becomes a ``_Page`` holding one
      ``_Image`` child; a page whose rendering failed is skipped as long as
      the document itself has no error code.
      """
      log("")
      if self.has_init_pdf[1] == 0:
          self.init_package("PyMuPDF")
      if self._doc.error_code is not None:
          return

      total_pages = self.doc_pymupdf.page_count
      skip_middle = total_pages > 20
      for page_no in range(total_pages):
          # Limit the page count: keep only the first 10 and last 10 pages.
          if skip_middle and 10 <= page_no < total_pages - 10:
              continue

          self._page = _Page(None, page_no)
          rendered = self.get_page_image(page_no)
          if not judge_error_code(rendered):
              self._page.add_child(_Image(rendered[1], rendered[0]))
          else:
              self._page.error_code = rendered

          # A failed page is dropped and the following pages are still read.
          if self._doc.error_code is None and self._page.error_code is not None:
              continue
          self._doc.add_child(self._page)
  def get_html(self):
      """Convert the document and return its HTML.

      :return: the document's error code when conversion left one set,
          otherwise the value of ``self._doc.get_html()``.
      """
      self.convert()
      failure = self._doc.error_code
      if failure is None:
          return self._doc.get_html()
      return failure
  def delete_water_mark(self, lt_text_list, page_bbox, times=5):
      """Drop text objects that look like a repeated watermark.

      A text counts as a watermark when it occurs at least ``times`` times
      and its occurrences span at least 70% of the page in both height and
      width.

      :param lt_text_list: objects exposing ``get_text()`` and ``bbox``.
      :param page_bbox: page bounding box, indexed as ``[0]``..``[3]``.
      :param times: minimum number of repetitions.
      :return: a new list with the watermark objects removed, order kept.
      """
      # Group the objects by their exact text.
      occurrences = {}
      for item in lt_text_list:
          occurrences.setdefault(item.get_text(), []).append(item)

      page_height = abs(page_bbox[1] - page_bbox[3])
      page_width = abs(page_bbox[0] - page_bbox[2])

      watermark_texts = set()
      for text, items in occurrences.items():
          if len(items) < times:
              continue
          # Vertical spread: from the box with the smallest bbox[3] to the
          # one with the largest bbox[3].
          by_top = sorted(items, key=lambda o: o.bbox[3])
          spread_h = abs(by_top[-1].bbox[3] - by_top[0].bbox[1])
          # Horizontal spread, ordered by bbox[2] on top of the previous order.
          by_right = sorted(by_top, key=lambda o: o.bbox[2])
          spread_w = abs(by_right[-1].bbox[2] - by_right[0].bbox[0])
          if spread_h >= page_height * 0.7 and spread_w >= page_width * 0.7:
              watermark_texts.add(text)

      return [item for item in lt_text_list
              if item.get_text() not in watermark_texts]
  def resize_image(self, img_path, max_size=2000):
      """Shrink the image file at ``img_path`` in place when it is too large.

      When both sides exceed ``max_size`` the longer side is scaled down to
      ``max_size`` (aspect ratio kept) and the file is overwritten.
      Otherwise the file is left untouched.

      :param img_path: path of the image file to check and possibly rewrite.
      :param max_size: size limit in pixels.
      """
      _img = cv2.imread(img_path)
      # cv2.imread returns None instead of raising when the file cannot be
      # read; leave the file alone rather than failing on `.shape`.
      if _img is None:
          return

      # NOTE(review): with `or`, an image is only resized when BOTH sides
      # exceed max_size - confirm this is intended rather than `and`.
      if _img.shape[0] <= max_size or _img.shape[1] <= max_size:
          return

      # Axis 0 is the first shape entry, axis 1 the second; scale the larger.
      resize_axis = 0 if _img.shape[0] >= _img.shape[1] else 1
      other_axis = 1 - resize_axis
      ratio = max_size / _img.shape[resize_axis]
      new_shape = [0, 0]
      new_shape[resize_axis] = max_size
      new_shape[other_axis] = int(_img.shape[other_axis] * ratio)
      # cv2.resize takes the target size with the shape entries swapped.
      _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
      cv2.imwrite(img_path, _img)
  def get_single_pdf(self, path, page_no):
      """Write page ``page_no`` of the PyPDF2 document to its own PDF file.

      :param path: path of the source PDF; the output is written next to it
          as ``<path without extension>_split.pdf``.
      :param page_no: index of the page to extract.
      :return: the path of the new file, or ``[-3]`` on any failure.
      """
      import os

      log("into get_single_pdf")
      try:
          # Work on copies so the shared reader/writer objects stay untouched.
          pdf_origin = copy.deepcopy(self.doc_pypdf2)
          pdf_new = copy.deepcopy(self.doc_pypdf2_new)
          pdf_new.addPage(pdf_origin.getPage(page_no))
          # Strip only the final extension; splitting on the first "."
          # breaks paths such as "./dir/file.pdf" or "/data/v1.2/file.pdf".
          path_new = os.path.splitext(path)[0] + "_split.pdf"
          with open(path_new, "wb") as ff:
              pdf_new.write(ff)
          return path_new
      except PyPDF2.utils.PdfReadError:
          return [-3]
      except Exception:
          log("get_single_pdf error! page " + str(page_no))
          return [-3]
  976. # 以下为现成pdf单页解析接口
  class ParseSentence:
      """One recognised line of page text together with its title metadata."""

      def __init__(self, bbox, fontname, fontsize, _text, _title, title_text,
                   _pattern, title_degree, is_outline, outline_location, page_no):
          # Geometry: the four bbox entries are also exposed individually.
          self.x0, self.y0, self.x1, self.y1 = bbox
          self.bbox = bbox

          # Font of the text.
          self.fontname = fontname
          self.fontsize = fontsize

          # Text and title information.
          self.text = _text
          self.title = _title
          self.title_text = title_text
          self.groups = _pattern
          self.title_degree = title_degree

          # Outline (table of contents) information and origin page.
          self.is_outline = is_outline
          self.outline_location = outline_location
          self.page_no = page_no

      def __repr__(self):
          fields = (self.text, self.title, self.is_outline,
                    self.outline_location, str(self.bbox))
          return "%s,%s,%s,%d,%s" % fields
  class ParseUtils:
      """Static helpers that turn page text boxes into sentences and titles."""

      # Chinese digits 1..9; the index in this string is the value minus one.
      _CN_DIGITS = "一二三四五六七八九"

      _ROMAN = ["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ"]

      # Default title pattern. Each alternative is a named group "title_<n>"
      # with "title_<n>_index_*" sub-groups. The bracketed styles (15/17/19)
      # accept ASCII and full-width parentheses.
      _TITLE_PATTERN = (
          r"(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|"
          r"(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|"
          r"(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|"
          r"(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]))|"
          r"(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]))|"
          r"(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]))|"
          r"(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\..、\s\-]))|"
          r"(?P<title_15>^(?P<title_15_index_0_0>[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))]))|"
          r"(?P<title_17>^(?P<title_17_index_0_0>[((]?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>[))]))|"
          r"(?P<title_19>^(?P<title_19_index_0_0>[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))"
      )

      # Default serial-number pattern used by rec_serial.
      _SERIAL_PATTERN = (
          r"(?P<title_1>^[一二三四五六七八九十]+[、])|"
          r"(?P<title_2>^\d+[\.、\s])|"
          r"(?P<title_3>^\d+\.\d+[\.、\s])|"
          r"(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|"
          r"(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"
      )

      @staticmethod
      def getFontinfo(_page):
          """Annotate every text box of ``_page`` with ``fontname``/``fontsize``.

          The values are taken from the first ``LTChar`` found in the first
          text line of the box that contains one. Boxes without any
          ``LTChar`` are left unchanged.
          """
          for _obj in _page._objs:
              if not isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                  continue
              for textline in _obj._objs:
                  first_char = next(
                      (c for c in textline._objs if isinstance(c, LTChar)), None)
                  if first_char is not None:
                      _obj.fontname = first_char.fontname
                      _obj.fontsize = first_char.size
                      break

      @staticmethod
      def recognize_sentences(list_textbox, filter_objs, page_bbox, page_no,
                              remove_space=True, sourceP_LB=True):
          """Group text boxes into lines and build one ParseSentence per line.

          :param list_textbox: text boxes exposing ``bbox`` and ``get_text()``;
              the list is sorted in place.
          :param filter_objs: boxes to ignore (membership is tested with ``in``).
          :param page_bbox: bounding box of the page.
          :param page_no: page number stored on every sentence.
          :param remove_space: unused, kept for interface compatibility.
          :param sourceP_LB: when true, lines are ordered by descending y.
          :return: list of ``ParseSentence``. Centred all-digit lines (page
              numbers) and lines that are empty or only commas are dropped.
          """
          list_textbox.sort(key=lambda x: x.bbox[0])
          list_textbox.sort(key=lambda x: x.bbox[3], reverse=sourceP_LB)

          # Cluster boxes whose bbox[1] lies within 5 units of a cluster's y.
          cluster_textbox = []
          for _textbox in list_textbox:
              if _textbox in filter_objs:
                  continue
              for _ct in cluster_textbox:
                  if abs(_ct["y"] - _textbox.bbox[1]) < 5:
                      _ct["textbox"].append(_textbox)
                      # A box belongs to exactly one line; without the break
                      # it could be appended to several nearby clusters.
                      break
              else:
                  cluster_textbox.append({"y": _textbox.bbox[1], "textbox": [_textbox]})
          cluster_textbox.sort(key=lambda x: x["y"], reverse=sourceP_LB)

          end_marks = (",", "。", ".", "、", ";")
          list_sentences = []
          for _line in cluster_textbox:
              _textboxs = _line["textbox"]
              _textboxs.sort(key=lambda x: x.bbox[0])

              # Join the boxes of the line; a horizontal gap above 60 units
              # gets a "=,=" marker unless the text already ends with
              # punctuation. endswith() is also safe for empty text.
              _linetext = _textboxs[0].get_text()
              for _prev, _cur in zip(_textboxs, _textboxs[1:]):
                  if abs(_cur.bbox[0] - _prev.bbox[2]) > 60:
                      if not _linetext.endswith(end_marks):
                          _linetext += "=,="
                  _linetext += _cur.get_text()
              _linetext = re.sub(r"[\s\r\n]", "", _linetext)
              _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
                       _textboxs[-1].bbox[2], _textboxs[-1].bbox[3])

              # Title detection: first box, then whole line, then centring.
              _title = None
              _pattern_groups = None
              title_text = ""
              _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
              if not _groups:
                  _groups = ParseUtils.find_title_by_pattern(_linetext)
              if _groups:
                  _title = _groups[0][0]
                  title_text = _groups[0][1]
                  _pattern_groups = _groups
              else:
                  _title = ParseUtils.rec_incenter(_bbox, page_bbox)

              title_degree = 2
              if not _title:
                  _linetext = _linetext.replace("=,=", ",")
              else:
                  _linetext = _linetext.replace("=,=", "")
                  # "title_<n>" -> n
                  title_degree = int(_title.split("_")[1])

              # Page number: a centred line made only of digits.
              if ParseUtils.rec_incenter(_bbox, page_bbox) and re.search(r"^\d+$", _linetext) is not None:
                  continue
              if _linetext == "" or re.search(r"^,+$", _linetext) is not None:
                  continue

              # Outline entry: "<text>.....<number>".
              is_outline = False
              outline_location = -1
              _search = re.search(r"(?P<text>.+?)\.{5,}(?P<nums>\d+)$", _linetext)
              if _search is not None:
                  is_outline = True
                  _linetext = _search.group("text")
                  outline_location = int(_search.group("nums"))

              last_attrs = vars(_textboxs[-1])
              list_sentences.append(ParseSentence(
                  _bbox, last_attrs.get("fontname"), last_attrs.get("fontsize"),
                  _linetext, _title, title_text, _pattern_groups, title_degree,
                  is_outline, outline_location, page_no))
          return list_sentences

      @staticmethod
      def find_title_by_pattern(_text, _pattern=_TITLE_PATTERN):
          """Match ``_text`` against the title pattern.

          :return: ``(group_name, value)`` pairs of every group that took part
              in the match, sorted by group name so the enclosing
              ``title_<n>`` group comes first; ``None`` when nothing matched.
          """
          _se = re.search(_pattern, _text)
          if _se is None:
              return None
          groups = [(k, v) for k, v in _se.groupdict().items() if v is not None]
          if not groups:
              return None
          groups.sort(key=lambda x: x[0])
          return groups

      @staticmethod
      def rec_incenter(o_bbox, p_bbox):
          """Return ``"title_2"`` when ``o_bbox`` is centred in ``p_bbox``.

          Centred means the left and right margins, as fractions of the page
          width, differ by less than 0.1 and the left margin exceeds 0.2.
          Returns ``None`` otherwise, including for a zero-width page box.
          """
          p_width = p_bbox[2] - p_bbox[0]
          if p_width == 0:
              # Nothing can be centred on a degenerate page; avoid dividing by 0.
              return None
          l_space = (o_bbox[0] - p_bbox[0]) / p_width
          r_space = (p_bbox[2] - o_bbox[2]) / p_width
          if abs(l_space - r_space) < 0.1 and l_space > 0.2:
              return "title_2"
          return None

      @staticmethod
      def is_first_title(_title):
          """Tell whether ``_title`` is the first index of its numbering style.

          Recognised firsts: a number equal to 1, "一", "a", "A" and "Ⅰ".
          """
          if _title is None:
              return False
          if re.search(r"^\d+$", _title) is not None:
              return int(_title) == 1
          first_by_style = (
              (r"^[一二三四五六七八九十百]+$", "一"),
              (r"^[a-z]+$", "a"),
              (r"^[A-Z]+$", "A"),
              (r"^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", "Ⅰ"),
          )
          for style, first in first_by_style:
              if re.search(style, _title) is not None:
                  return _title == first
          return False

      @staticmethod
      def _cn_to_int(_text):
          """Read a Chinese numeral built from 一..九, 十 and 百; None if malformed."""
          total = 0
          pending = 0
          for ch in _text:
              digit = ParseUtils._CN_DIGITS.find(ch) + 1
              if digit:
                  if pending:
                      # Two digits in a row, e.g. "一二", is not a numeral.
                      return None
                  pending = digit
              elif ch == "十":
                  total += (pending or 1) * 10
                  pending = 0
              elif ch == "百":
                  total += (pending or 1) * 100
                  pending = 0
              else:
                  return None
          return total + pending

      @staticmethod
      def _int_to_cn(_value):
          """Write 1..999 as a Chinese numeral; None outside that range."""
          if not 0 < _value < 1000:
              return None
          digits = ParseUtils._CN_DIGITS
          hundreds, rest = divmod(_value, 100)
          tens, ones = divmod(rest, 10)
          parts = []
          if hundreds:
              parts.append(digits[hundreds - 1] + "百")
          if tens:
              # 10..19 are written "十x"; inside hundreds it is "一十x".
              if tens == 1 and not hundreds:
                  parts.append("十")
              else:
                  parts.append(digits[tens - 1] + "十")
          elif hundreds and ones:
              parts.append("零")
          if ones:
              parts.append(digits[ones - 1])
          return "".join(parts)

      @staticmethod
      def get_next_title(_title):
          """Return the index that follows ``_title`` in its numbering style.

          Handles decimal numbers, Chinese numerals (up to 999), lower- and
          upper-case letter sequences ("z" -> "aa") and single Roman numeral
          characters. Returns ``None`` when there is no successor or the
          style is not recognised.
          """
          if _title is None:
              return None
          if re.search(r"^\d+$", _title) is not None:
              return str(int(_title) + 1)
          if re.search(r"^[一二三四五六七八九十百]+$", _title) is not None:
              # Increment through an integer so carries such as
              # 十九 -> 二十 and 二十九 -> 三十 come out right.
              _value = ParseUtils._cn_to_int(_title)
              if _value is None:
                  return None
              return ParseUtils._int_to_cn(_value + 1)
          for _style, _first in ((r"^[a-z]+$", "a"), (r"^[A-Z]+$", "A")):
              if re.search(_style, _title) is not None:
                  _alphabet = [chr(i + ord(_first)) for i in range(26)]
                  # make_increase returns the least significant symbol first.
                  return ParseUtils.make_increase(_alphabet, _title)[::-1]
          if re.search(r"^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
              _index = ParseUtils._ROMAN.index(_title)
              if _index < len(ParseUtils._ROMAN) - 1:
                  return ParseUtils._ROMAN[_index + 1]
          return None

      @staticmethod
      def make_increase(_sort, _title, _add=1):
          """Add ``_add`` to ``_title`` read as a sequence over ``_sort``.

          :param _sort: ordered list of symbols.
          :param _title: sequence of symbols, most significant first.
          :param _add: amount to add, expected to be 0 or 1.
          :return: the incremented sequence as a string with the LEAST
              significant symbol first (callers reverse it).
          :raises ValueError: when ``_title`` holds a symbol not in ``_sort``.
          """
          base = len(_sort)
          carry = _add
          out = []
          for symbol in reversed(_title):
              total = _sort.index(symbol) + carry
              out.append(_sort[total % base])
              # Carry only when this position actually wrapped around.
              carry = total // base
          if carry:
              out.append(_sort[0])
          return "".join(out)

      @staticmethod
      def rec_serial(_text, o_bbox, p_bbox, fontname, _pattern=_SERIAL_PATTERN):
          """Return the name of the serial-number group matching ``_text``.

          ``o_bbox``, ``p_bbox`` and ``fontname`` are currently unused.

          :return: the first group name of ``_pattern`` that took part in the
              match, or ``None`` when nothing matched.
          """
          # todo: recognise the serial of the sentence
          # NOTE(review): with the default pattern "title_2" also matches the
          # start of "1.2 ..." style text, so title_3..title_5 are never
          # returned - confirm whether the alternatives should be reordered.
          _se = re.search(_pattern, _text)
          if _se is not None:
              for k, v in _se.groupdict().items():
                  if v is not None:
                      return k
          return None