convert_pdf.py 87 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103
  1. import copy
  2. import inspect
  3. import io
  4. import logging
  5. import os
  6. import re
  7. import sys
  8. from bs4 import BeautifulSoup
  9. sys.path.append(os.path.dirname(__file__) + "/../")
  10. from pdfplumber import PDF
  11. from pdfplumber.table import TableFinder
  12. from pdfplumber.page import Page as pdfPage
  13. from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table
  14. import time
  15. import pdfminer
  16. import math
  17. from scipy.stats import linregress
  18. from matplotlib import pyplot as plt
  19. from shapely.geometry import LineString, Point
  20. from format_convert import timeout_decorator
  21. from PIL import Image
  22. from format_convert.convert_image import image_process
  23. from format_convert.convert_need_interface import from_ocr_interface, from_office_interface
  24. import traceback
  25. import cv2
  26. import PyPDF2
  27. from PyPDF2 import PdfFileReader, PdfFileWriter
  28. from pdfminer.pdfparser import PDFParser
  29. from pdfminer.pdfdocument import PDFDocument
  30. from pdfminer.pdfpage import PDFPage
  31. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  32. from pdfminer.converter import PDFPageAggregator
  33. from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
  34. LTTextBoxVertical, LTLine, LTTextContainer
  35. from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
  36. get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross
  37. import fitz
  38. from format_convert.wrapt_timeout_decorator import timeout
  39. @memory_decorator
  40. def pdf2Image(path, save_dir):
  41. log("into pdf2Image")
  42. try:
  43. try:
  44. doc = fitz.open(path)
  45. except Exception as e:
  46. log("pdf format error!")
  47. # print("pdf format error!", e)
  48. return [-3]
  49. # output_image_list = []
  50. output_image_dict = {}
  51. page_count = doc.page_count
  52. for page_no in range(page_count):
  53. # 限制pdf页数,只取前10页后10页
  54. if page_count > 20:
  55. if 10 <= page_no < page_count - 10:
  56. # log("pdf2Image: pdf pages count " + str(doc.page_count)
  57. # + ", only get 70 pages")
  58. continue
  59. try:
  60. page = doc.loadPage(page_no)
  61. output = save_dir + "_page" + str(page_no) + ".png"
  62. rotate = int(0)
  63. # 每个尺寸的缩放系数为1.3,这将为我们生成分辨率提高2.6的图像。
  64. # 此处若是不做设置,默认图片大小为:792X612, dpi=96
  65. # (1.33333333 --> 1056x816) (2 --> 1584x1224)
  66. # (1.183, 2.28 --> 1920x1080)
  67. zoom_x = 3.
  68. zoom_y = 3.
  69. # mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  70. mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  71. pix = page.getPixmap(matrix=mat, alpha=False)
  72. pix.writePNG(output)
  73. pdf_image = cv2.imread(output)
  74. print("pdf_image", page_no, pdf_image.shape)
  75. # output_image_list.append([page_no, output])
  76. output_image_dict[int(page_no)] = output
  77. except ValueError as e:
  78. traceback.print_exc()
  79. if str(e) == "page not in document":
  80. log("pdf2Image page not in document! continue..." + str(page_no))
  81. continue
  82. elif "encrypted" in str(e):
  83. log("pdf2Image document need password " + str(page_no))
  84. return [-7]
  85. except RuntimeError as e:
  86. if "cannot find page" in str(e):
  87. log("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
  88. continue
  89. else:
  90. traceback.print_exc()
  91. return [-3]
  92. return [output_image_dict]
  93. except Exception as e:
  94. log("pdf2Image error!")
  95. print("pdf2Image", traceback.print_exc())
  96. return [-1]
  97. @timeout(10, timeout_exception=TimeoutError)
  98. def pdf_analyze(interpreter, page, device, page_no):
  99. log("into pdf_analyze")
  100. pdf_time = time.time()
  101. # print("pdf_analyze interpreter process...")
  102. interpreter.process_page(page)
  103. # print("pdf_analyze device get_result...")
  104. layout = device.get_result()
  105. log("pdf2text page " + str(page_no) + " read time " + str(time.time() - pdf_time))
  106. return layout
@memory_decorator
def pdf2text(path, unique_type_dir):
    """Extract the text of a PDF, mixing pdfminer parsing with page-image OCR.

    Steps, as visible in the body:
      1. Render pages to PNG via ``pdf2Image`` and run ``image_process`` on
         each image to get per-page text / table information.
      2. Join tables that continue across pages (``page_table_connect``).
      3. Open the PDF with pdfminer. On ``PSEOF`` fall back to the image
         results / OCR for every page.
      4. Otherwise walk the pdfminer pages: table pages reuse the text from
         step 1, other pages are parsed with pdfminer, falling back to OCR of
         the page image where needed.

    Args:
        path: Path of the PDF file.
        unique_type_dir: Passed through to ``from_office_interface`` when a
            single page is converted to html.

    Returns:
        ``[text]`` on success. Otherwise an error-code list: ``[-3]`` (pdfminer
        setup failed / ``UnicodeDecodeError``), ``[-4]`` (page analysis timed
        out), ``[-1]`` (any other exception), or whatever error value a helper
        returned that ``judge_error_code`` flags.

    NOTE(review): the nesting below was reconstructed from a flattened copy
    of this file; confirm block boundaries against the original source.
    """
    log("into pdf2text")
    try:
        # pymupdf pdf to image
        save_dir = path.split(".")[-2] + "_" + path.split(".")[-1]
        output_image_dict = pdf2Image(path, save_dir)
        if judge_error_code(output_image_dict):
            return output_image_dict
        output_image_dict = output_image_dict[0]
        output_image_no_list = list(output_image_dict.keys())
        output_image_no_list.sort(key=lambda x: x)

        # Per page: extracted text, table column counts, outline points,
        # whether the page contains a table, and the image size.
        # page_info_list = []
        page_info_dict = {}
        has_table_dict = {}
        no_table_dict = {}
        for page_no in output_image_no_list:
            img_path = output_image_dict.get(page_no)
            print("pdf page", page_no, "in total", output_image_no_list[-1])
            # Skip pages whose image cannot be read.
            try:
                img = cv2.imread(img_path)
                img_size = img.shape
            except:
                log("pdf2text read image in page fail! continue...")
                continue

            # Process each page image.
            text, column_list, outline_points, is_table = image_process(img, img_path, use_ocr=False)
            if judge_error_code(text):
                return text

            # page_info_list.append([text, column_list, outline_points, is_table,
            #                        page_no, img_size])
            page_info = [text, column_list, outline_points, is_table, img_size]
            page_info_dict[int(page_no)] = page_info
            # Split into pages with a table and pages without one.
            if is_table:
                has_table_dict[int(page_no)] = page_info
            else:
                no_table_dict[int(page_no)] = page_info

        has_table_no_list = list(has_table_dict.keys())
        has_table_no_list.sort(key=lambda x: x)
        page_no_list = list(page_info_dict.keys())
        page_no_list.sort(key=lambda x: x)

        # Connect tables that span consecutive pages.
        table_connect_list, connect_text_list = page_table_connect(has_table_dict)
        if judge_error_code(table_connect_list):
            return table_connect_list

        # Page numbers that take part in a connection.
        table_connect_page_no_list = []
        for area in connect_text_list:
            table_connect_page_no_list.append(area[1])
        print("pdf2text table_connect_list", table_connect_list)
        print("connect_text_list", connect_text_list)

        # pdfminer approach
        try:
            # NOTE(review): fp is never closed in this function.
            fp = open(path, 'rb')
            # Create a PDF parser from the file object.
            parser = PDFParser(fp)
            # Create the PDF document.
            doc = PDFDocument(parser)
            # Wire up resource manager, layout device and interpreter.
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Probe whether the pdf can be read at all.
            for page in PDFPage.create_pages(doc):
                break
        except pdfminer.psparser.PSEOF as e:
            # pdfminer cannot read this document; use the images rendered by
            # pymupdf and OCR instead.
            log("pdf2text " + str(e) + " use ocr read pdf!")
            text_list = []
            for page_no in page_no_list:
                log("pdf2text ocr page_no " + str(page_no))
                page_info = page_info_dict.get(page_no)
                # Page with a table
                if page_info[3]:
                    # Check whether the table is connected across pages.
                    area_no = 0
                    jump_page = 0
                    for area in table_connect_list:
                        if page_no in area:
                            # Record the merged text only once (first page of the area).
                            if page_no == area[0]:
                                image_text = connect_text_list[area_no][0]
                                text_list.append([image_text, page_no, 0])
                            jump_page = 1
                        area_no += 1

                    # Connected pages skip the remaining steps.
                    if jump_page:
                        continue

                    # Take the text directly.
                    image_text = page_info_dict.get(page_no)[0]
                    text_list.append([image_text, page_no, 0])
                # Page without a table
                else:
                    with open(output_image_dict.get(page_no), "rb") as ff:
                        image_stream = ff.read()
                    image_text = from_ocr_interface(image_stream)
                    text_list.append([image_text, page_no, 0])

            text_list.sort(key=lambda z: z[1])
            text = ""
            for t in text_list:
                text += t[0]
            return [text]
        except Exception as e:
            log("pdf format error!")
            traceback.print_exc()
            return [-3]

        text_list = []
        page_no = 0
        pages = PDFPage.create_pages(doc)
        pages = list(pages)
        page_count = len(pages)
        for page in pages:
            log("pdf2text pymupdf page_no " + str(page_no))
            # Limit the page count.
            # if page_no >= 70:
            #     log("pdf2text: pdf pages only get 70 pages")
            #     break
            # With more than 20 pages only the first 10 and last 10 are handled.
            if page_count > 20:
                if 10 <= page_no < page_count - 10:
                    page_no += 1
                    continue

            # Page is known to contain a table: reuse the text already generated.
            if page_no in has_table_no_list:
                # Check whether the table is connected across pages.
                area_no = 0
                jump_page = 0
                for area in table_connect_list:
                    if page_no in area:
                        # Record the merged text only once (first page of the area).
                        if page_no == area[0]:
                            image_text = connect_text_list[area_no][0]
                            text_list.append([image_text, page_no, 0])
                        jump_page = 1
                    area_no += 1

                # Connected pages skip the remaining steps.
                if jump_page:
                    page_no += 1
                    continue

                # Take the text directly.
                image_text = has_table_dict.get(page_no)[0]
                text_list.append([image_text, page_no, 0])
                page_no += 1
                continue

            # Page without a table: parse it with pdfminer.
            else:
                if get_platform() == "Windows":
                    try:
                        interpreter.process_page(page)
                        layout = device.get_result()
                    except Exception:
                        log("pdf2text pdfminer read pdf page error! continue...")
                        # NOTE(review): page_no is not incremented before this
                        # continue, unlike the other skip paths.
                        continue
                else:
                    # Apply a timeout.
                    try:
                        # Parse the non-table page.
                        # NOTE(review): this branch is already the non-Windows
                        # path, and the call below omits page_no - confirm.
                        if get_platform() == "Windows":
                            origin_pdf_analyze = pdf_analyze.__wrapped__
                            layout = origin_pdf_analyze(interpreter, page, device)
                        else:
                            layout = pdf_analyze(interpreter, page, device, page_no)
                    except TimeoutError as e:
                        log("pdf2text pdfminer read pdf page time out!")
                        return [-4]
                    except Exception:
                        log("pdf2text pdfminer read pdf page error! continue...")
                        # NOTE(review): page_no is not incremented here either.
                        continue

            # Check whether the page has any text object; if not, it may only
            # carry a watermark.
            only_image = 1
            image_count = 0
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    only_image = 0
                if isinstance(x, LTFigure):
                    image_count += 1

            # Too many figures on the page: OCR the whole page instead.
            log("pdf2text image_count " + str(image_count))
            if image_count >= 3:
                image_text = page_info_dict.get(page_no)[0]
                if image_text is None:
                    with open(output_image_dict.get(page_no), "rb") as ff:
                        image_stream = ff.read()
                    image_text = from_ocr_interface(image_stream)
                    if judge_error_code(image_text):
                        return image_text
                    # Cache the OCR result for this page.
                    page_info_dict[page_no][0] = image_text

                text_list.append([image_text, page_no, 0])
                page_no += 1
                continue

            # Entries are [text, page_no, y]; y == -1 marks whole-page OCR text.
            order_list = []
            for x in layout:
                # Whether this object's text came from OCR.
                ocr_flag = 0

                if get_platform() == "Windows":
                    # print("x", page_no, x)
                    print()

                if isinstance(x, LTTextBoxHorizontal):
                    image_text = x.get_text()

                    # Encoding could not be resolved ("(cid:N)" markers): use OCR.
                    if re.search('[(]cid:[0-9]+[)]', image_text):
                        print(re.search('[(]cid:[0-9]+[)]', image_text))
                        image_text = page_info_dict.get(page_no)[0]
                        if image_text is None:
                            with open(output_image_dict.get(page_no), "rb") as ff:
                                image_stream = ff.read()
                            image_text = from_ocr_interface(image_stream)
                            if judge_error_code(image_text):
                                return image_text
                            page_info_dict[page_no][0] = image_text
                        image_text = add_div(image_text)
                        # order_list.append([image_text, page_no, x.bbox[1]])
                        # The page-level text replaces everything collected so far.
                        order_list = [[image_text, page_no, x.bbox[1]]]
                        break
                    else:
                        image_text = add_div(image_text)
                        order_list.append([image_text, page_no, x.bbox[1]])
                        continue

                if isinstance(x, LTFigure):
                    for image in x:
                        if isinstance(image, LTImage):
                            try:
                                print("pdf2text LTImage size", page_no, image.width, image.height)
                                image_stream = image.stream.get_data()

                                # Ignore small images.
                                if image.width <= 300 and image.height <= 300:
                                    continue

                                # Some watermarks make pdf splitting / reading fail.
                                # if image.width <= 200 and image.height<=200:
                                #     continue

                                # img_test = Image.open(io.BytesIO(image_stream))
                                # img_test.save('temp/LTImage.jpg')

                                # Check the extracted image size; if too large,
                                # raise so the rendered page image is OCR'd instead.
                                img_test = Image.open(io.BytesIO(image_stream))
                                if img_test.size[1] > 2000 or img_test.size[0] > 1500:
                                    print("pdf2text LTImage stream output size", img_test.size)
                                    raise Exception
                                # Smaller images are saved and OCR'd directly.
                                else:
                                    img_test.save('temp/LTImage.jpg')
                                    with open('temp/LTImage.jpg', "rb") as ff:
                                        image_stream = ff.read()
                                    image_text = from_ocr_interface(image_stream)
                                    if judge_error_code(image_text):
                                        return image_text
                            # except pdfminer.pdftypes.PDFNotImplementedError:
                            #     with open(output_image_list[page_no], "rb") as ff:
                            #         image_stream = ff.read()
                            except Exception:
                                log("pdf2text pdfminer read image in page " + str(page_no) +
                                    " fail! use pymupdf read image...")
                                # print(traceback.print_exc())
                                image_text = page_info_dict.get(page_no)[0]
                                if image_text is None:
                                    with open(output_image_dict.get(page_no), "rb") as ff:
                                        image_stream = ff.read()
                                    image_text = from_ocr_interface(image_stream)
                                    if judge_error_code(image_text):
                                        return image_text
                                    page_info_dict[page_no][0] = image_text
                                ocr_flag = 1

                            # Only a watermark image was found: no text output
                            # and the page holds image objects only.
                            if image_text == "" and only_image:
                                # Split this page out of the pdf.
                                try:
                                    log("pdf2text guess pdf has watermark")
                                    split_path = get_single_pdf(path, page_no)
                                except:
                                    # If splitting raises, it is most likely not a
                                    # watermark image; OCR the page image instead.
                                    log("pdf2text guess pdf has no watermark")
                                    image_text = page_info_dict.get(page_no)[0]
                                    if image_text is None:
                                        with open(output_image_dict.get(page_no), "rb") as ff:
                                            image_stream = ff.read()
                                        image_text = from_ocr_interface(image_stream)
                                        order_list.append([image_text, page_no, -1])
                                        page_info_dict[page_no][0] = image_text
                                    ocr_flag = 1
                                    continue
                                if judge_error_code(split_path):
                                    return split_path

                                # Convert the single page through the office interface.
                                file_path = from_office_interface(split_path, unique_type_dir, 'html', 3)
                                # if file_path == [-3]:
                                #     return [-3]
                                if judge_error_code(file_path):
                                    return file_path

                                # Read the text out of the generated html.
                                image_text = get_html_p(file_path)
                                if judge_error_code(image_text):
                                    return image_text

                            if get_platform() == "Windows":
                                print("image_text", page_no, x.bbox[1], image_text)
                                with open("temp" + str(x.bbox[0]) + ".jpg", "wb") as ff:
                                    ff.write(image_stream)
                            image_text = add_div(image_text)
                            if ocr_flag:
                                order_list.append([image_text, page_no, -1])
                            else:
                                order_list.append([image_text, page_no, x.bbox[1]])

            # Order by y coordinate, highest first; OCR entries (-1) sort last.
            order_list.sort(key=lambda z: z[2], reverse=True)

            # OCR took part in the recognition.
            # NOTE(review): order_list[-1] raises IndexError when the page
            # produced no entries; that is caught by the outer handler ([-1]).
            if order_list[-1][2] == -1:
                ocr_order_list = [order_list[-1]]
                not_ocr_order_list = []
                not_ocr_text = ""
                # De-duplicate text that was fetched twice after a read failure.
                for order in order_list:
                    if order[2] != -1:
                        not_ocr_order_list.append(order)
                        not_ocr_text += order[0]
                if string_similarity(ocr_order_list[0][0], not_ocr_text) >= 0.85:
                    order_list = not_ocr_order_list
                else:
                    order_list = ocr_order_list

            for order in order_list:
                text_list.append(order)
            page_no += 1

        text = ""
        for t in text_list:
            # text += add_div(t[0])
            if t[0] is not None:
                text += t[0]
        return [text]
    except UnicodeDecodeError as e:
        log("pdf2text pdfminer create pages failed! " + str(e))
        return [-3]
    except Exception as e:
        log("pdf2text error!")
        traceback.print_exc()
        return [-1]
  440. def get_single_pdf(path, page_no):
  441. log("into get_single_pdf")
  442. try:
  443. # print("path, ", path)
  444. pdf_origin = PdfFileReader(path, strict=False)
  445. pdf_new = PdfFileWriter()
  446. pdf_new.addPage(pdf_origin.getPage(page_no))
  447. path_new = path.split(".")[0] + "_split.pdf"
  448. with open(path_new, "wb") as ff:
  449. pdf_new.write(ff)
  450. return path_new
  451. except PyPDF2.utils.PdfReadError as e:
  452. raise e
  453. except Exception as e:
  454. log("get_single_pdf error! page " + str(page_no))
  455. traceback.print_exc()
  456. raise e
  457. def page_table_connect(has_table_dict):
  458. log("into page_table_connect")
  459. if not has_table_dict:
  460. return [], []
  461. try:
  462. # 判断是否有页码的表格相连
  463. table_connect_list = []
  464. temp_list = []
  465. # 离图片顶部或底部距离,页面高度的1/7
  466. threshold = 7
  467. page_no_list = list(has_table_dict.keys())
  468. page_no_list.sort(key=lambda x: x)
  469. for i in range(1, len(page_no_list)):
  470. page_info = has_table_dict.get(page_no_list[i])
  471. last_page_info = has_table_dict.get(page_no_list[i - 1])
  472. # 页码需相连
  473. if page_no_list[i] - page_no_list[i - 1] == 1:
  474. # 上一页最后一个区域的列数和下一页第一个区域列数都为0,且相等
  475. if not last_page_info[1][-1] and not page_info[1][0] and \
  476. last_page_info[1][-1] == page_info[1][0]:
  477. # 上一页的轮廓点要离底部一定距离内,下一页的轮廓点要离顶部一定距离内
  478. if last_page_info[4][0] - last_page_info[2][-1][1][1] \
  479. <= int(last_page_info[4][0] / threshold) \
  480. and page_info[2][0][0][1] - 0 \
  481. <= int(page_info[4][0] / threshold):
  482. temp_list.append(page_no_list[i - 1])
  483. temp_list.append(page_no_list[i])
  484. continue
  485. # 条件不符合的,存储之前保存的连接页码
  486. if len(temp_list) > 1:
  487. temp_list = list(set(temp_list))
  488. temp_list.sort(key=lambda x: x)
  489. table_connect_list.append(temp_list)
  490. temp_list = []
  491. if len(temp_list) > 1:
  492. temp_list = list(set(temp_list))
  493. temp_list.sort(key=lambda x: x)
  494. table_connect_list.append(temp_list)
  495. temp_list = []
  496. # 连接两页内容
  497. connect_text_list = []
  498. for area in table_connect_list:
  499. first_page_no = area[0]
  500. area_page_text = str(has_table_dict.get(first_page_no)[0])
  501. for i in range(1, len(area)):
  502. current_page_no = area[i]
  503. current_page_text = str(has_table_dict.get(current_page_no)[0])
  504. # 连接两个table
  505. table_prefix = re.finditer('<table border="1">', current_page_text)
  506. index_list = []
  507. for t in table_prefix:
  508. index_list.append(t.span())
  509. delete_index = index_list[0]
  510. current_page_text = current_page_text[:delete_index[0]] \
  511. + current_page_text[delete_index[1]:]
  512. table_suffix = re.finditer('</table>', area_page_text)
  513. index_list = []
  514. for t in table_suffix:
  515. index_list.append(t.span())
  516. delete_index = index_list[-1]
  517. area_page_text = area_page_text[:delete_index[0]] \
  518. + area_page_text[delete_index[1]:]
  519. area_page_text = area_page_text + current_page_text
  520. connect_text_list.append([area_page_text, area])
  521. return table_connect_list, connect_text_list
  522. except Exception as e:
  523. # print("page_table_connect", e)
  524. log("page_table_connect error!")
  525. traceback.print_exc()
  526. return [-1], [-1]
  527. @timeout(30, timeout_exception=TimeoutError)
  528. def read_pdf(path, package_name, packages):
  529. log(package_name)
  530. laparams = LAParams(line_overlap=0.01,
  531. char_margin=0.3,
  532. line_margin=0.01,
  533. word_margin=0.01,
  534. boxes_flow=0.1, )
  535. if package_name == packages[0]:
  536. fp = open(path, 'rb')
  537. parser = PDFParser(fp)
  538. doc_pdfminer = PDFDocument(parser)
  539. rsrcmgr = PDFResourceManager()
  540. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  541. interpreter = PDFPageInterpreter(rsrcmgr, device)
  542. return doc_pdfminer, device, interpreter
  543. elif package_name == packages[1]:
  544. doc_pymupdf = fitz.open(path)
  545. return doc_pymupdf
  546. elif package_name == packages[2]:
  547. doc_pypdf2 = PdfFileReader(path, strict=False)
  548. doc_pypdf2_new = PdfFileWriter()
  549. return doc_pypdf2, doc_pypdf2_new
  550. elif package_name == packages[3]:
  551. fp = open(path, 'rb')
  552. lt = LineTable()
  553. doc_top = 0
  554. doc_pdfplumber = read_pdfplumber(fp, laparams)
  555. return lt, doc_top, doc_pdfplumber
  556. @timeout(25, timeout_exception=TimeoutError)
  557. def read_pdfminer(path, laparams):
  558. fp = open(path, 'rb')
  559. parser = PDFParser(fp)
  560. doc_pdfminer = PDFDocument(parser)
  561. rsrcmgr = PDFResourceManager()
  562. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  563. interpreter = PDFPageInterpreter(rsrcmgr, device)
  564. return doc_pdfminer, device, interpreter
  565. @timeout(15, timeout_exception=TimeoutError)
  566. def read_pymupdf(path):
  567. return fitz.open(path)
  568. @timeout(15, timeout_exception=TimeoutError)
  569. def read_pypdf2(path):
  570. doc_pypdf2 = PdfFileReader(path, strict=False)
  571. doc_pypdf2_new = PdfFileWriter()
  572. return doc_pypdf2, doc_pypdf2_new
  573. @timeout(25, timeout_exception=TimeoutError, use_signals=False)
  574. def read_pdfplumber(path, laparams):
  575. fp = open(path, 'rb')
  576. lt = LineTable()
  577. doc_top = 0
  578. doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
  579. return lt, doc_top, doc_pdfplumber
  580. class PDFConvert:
  581. def __init__(self, path, unique_type_dir, need_page_no):
  582. self._doc = _Document(path)
  583. self.path = path
  584. self.unique_type_dir = unique_type_dir
  585. if not os.path.exists(self.unique_type_dir):
  586. os.mkdir(self.unique_type_dir)
  587. # 指定提取的页码范围
  588. self.need_page_no = need_page_no
  589. self.start_page_no = None
  590. self.end_page_no = None
  591. # 默认使用limit_page_cnt控制,前10页后10页
  592. if self.need_page_no is None:
  593. self.limit_page_cnt = 20
  594. else:
  595. # 使用start_page_no,end_page_no范围控制,例如2,5
  596. ss = self.need_page_no.split(',')
  597. if len(ss) != 2:
  598. self._doc.error_code = [-14]
  599. else:
  600. self.start_page_no = int(ss[0])
  601. self.end_page_no = int(ss[-1])
  602. if self.end_page_no == -1:
  603. self.end_page_no = 1000000
  604. self.start_page_no -= 1
  605. self.end_page_no -= 1
  606. if self.end_page_no <= self.start_page_no or self.start_page_no < 0 or self.end_page_no < -1:
  607. self._doc.error_code = [-14]
  608. self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
  609. self.has_init_pdf = [0] * len(self.packages)
  610. @memory_decorator
  611. def init_package(self, package_name):
  612. # 各个包初始化
  613. try:
  614. laparams = LAParams(line_overlap=0.01,
  615. char_margin=0.3,
  616. line_margin=0.01,
  617. word_margin=0.01,
  618. boxes_flow=0.1, )
  619. if package_name == self.packages[0]:
  620. # fp = open(self.path, 'rb')
  621. # parser = PDFParser(fp)
  622. # self.doc_pdfminer = PDFDocument(parser)
  623. # rsrcmgr = PDFResourceManager()
  624. # self.laparams = LAParams(line_overlap=0.01,
  625. # char_margin=0.3,
  626. # line_margin=0.01,
  627. # word_margin=0.01,
  628. # boxes_flow=0.1,)
  629. # self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
  630. # self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
  631. self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
  632. self.has_init_pdf[0] = 1
  633. elif package_name == self.packages[1]:
  634. self.doc_pymupdf = read_pymupdf(self.path)
  635. self.has_init_pdf[1] = 1
  636. elif package_name == self.packages[2]:
  637. # self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
  638. # self.doc_pypdf2_new = PdfFileWriter()
  639. self.doc_pypdf2, self.doc_pypdf2_new = read_pypdf2(self.path)
  640. self.has_init_pdf[2] = 1
  641. elif package_name == self.packages[3]:
  642. # self.fp = open(self.path, 'rb')
  643. # self.lt = LineTable()
  644. # self.doc_top = 0
  645. # self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
  646. self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
  647. self.has_init_pdf[3] = 0
  648. else:
  649. log("Only Support Packages " + str(self.packages))
  650. raise Exception
  651. except Exception as e:
  652. log(package_name + " cannot open pdf!")
  653. traceback.print_exc()
  654. self._doc.error_code = [-3]
  655. def convert(self, limit_page_cnt=20):
  656. if self.has_init_pdf[0] == 0:
  657. self.init_package("pdfminer")
  658. if self._doc.error_code is not None:
  659. self._doc.error_code = None
  660. # pdfminer读不了直接转成图片识别
  661. self.get_all_page_image()
  662. return
  663. # 判断是否能读pdf
  664. try:
  665. pages = PDFPage.create_pages(self.doc_pdfminer)
  666. for page in pages:
  667. break
  668. pages = list(pages)
  669. # except pdfminer.psparser.PSEOF as e:
  670. except:
  671. # pdfminer 读不了空白页的对象,直接使用pymupdf转换出的图片进行ocr识别
  672. log("pdf2text pdfminer read failed! read by pymupdf!")
  673. traceback.print_exc()
  674. try:
  675. self.get_all_page_image()
  676. return
  677. except:
  678. traceback.print_exc()
  679. log("pdf2text use pymupdf read failed!")
  680. self._doc.error_code = [-3]
  681. return
  682. # 每一页进行处理
  683. pages = PDFPage.create_pages(self.doc_pdfminer)
  684. pages = list(pages)
  685. page_count = len(pages)
  686. page_no = 0
  687. for page in pages:
  688. # 指定pdf页码
  689. if self.start_page_no is not None and self.end_page_no is not None:
  690. if page_count < self.end_page_no:
  691. self.end_page_no = page_count
  692. if page_no < self.start_page_no or page_no >= self.end_page_no:
  693. page_no += 1
  694. continue
  695. # 限制pdf页数,只取前后各10页
  696. else:
  697. if page_count > limit_page_cnt and int(limit_page_cnt/2) <= page_no < page_count - int(limit_page_cnt/2):
  698. page_no += 1
  699. continue
  700. # 解析单页
  701. self._page = _Page(page, page_no)
  702. self.convert_page(page, page_no)
  703. if self._doc.error_code is None and self._page.error_code is not None:
  704. if self._page.error_code[0] in [-4, -3, 0]:
  705. page_no += 1
  706. continue
  707. else:
  708. self._doc.error_code = self._page.error_code
  709. break
  710. self._doc.add_child(self._page)
  711. page_no += 1
  712. def clean_text(self, _text):
  713. return re.sub("\s", "", _text)
  714. def get_text_lines(self, page, page_no):
  715. lt_line_list = []
  716. page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
  717. self.doc_top += page_plumber.height
  718. table_finder = TableFinder(page_plumber)
  719. all_width_zero = True
  720. for _edge in table_finder.get_edges():
  721. if _edge.get('linewidth') and _edge.get('linewidth') > 0:
  722. all_width_zero = False
  723. break
  724. for _edge in table_finder.get_edges():
  725. # print(_edge)
  726. if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
  727. lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
  728. (float(_edge["x1"]), float(_edge["y1"]))))
  729. log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
  730. return lt_line_list
  731. def get_page_lines(self, layout, page_no):
  732. def _plot(_line_list, mode=1):
  733. for _line in _line_list:
  734. if mode == 1:
  735. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  736. elif mode == 2:
  737. x0, y0, x1, y1 = _line
  738. plt.plot([x0, x1], [y0, y1])
  739. plt.show()
  740. return
  741. def is_cross(A, B, C, D):
  742. if A[0] == B[0] == C[0] == D[0]:
  743. if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
  744. or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
  745. return True
  746. if A[1] == B[1] == C[1] == D[1]:
  747. if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
  748. or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
  749. return True
  750. line1 = LineString([A, B])
  751. line2 = LineString([C, D])
  752. int_pt = line1.intersection(line2)
  753. try:
  754. point_of_intersection = int_pt.x, int_pt.y
  755. return True
  756. except:
  757. return False
  758. def calculate_k(bbox):
  759. x = [bbox[0], bbox[2]]
  760. y = [bbox[1], bbox[3]]
  761. slope, intercept, r_value, p_value, std_err = linregress(x, y)
  762. # print('k', slope)
  763. if math.isnan(slope):
  764. slope = 0
  765. return slope
  766. def line_iou(line1, line2, axis=0):
  767. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  768. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  769. union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
  770. if union in [0, 0.]:
  771. iou = 0.
  772. else:
  773. iou = inter / union
  774. return iou
  775. def get_cross_line(_line_list, threshold=1, cross_times=0):
  776. # 根据是否有交点判断表格线
  777. _cross_line_list = []
  778. for line1 in _line_list:
  779. if line1 in _cross_line_list:
  780. continue
  781. if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
  782. p1 = [max(0, line1[0] - threshold), line1[1]]
  783. p2 = [min(line1[2] + threshold, page_w), line1[3]]
  784. else:
  785. p1 = [line1[0], max(0, line1[1] - threshold)]
  786. p2 = [line1[2], min(line1[3] + threshold, page_h)]
  787. line1 = [p1[0], p1[1], p2[0], p2[1]]
  788. _times = 0
  789. for line2 in _line_list:
  790. if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
  791. p3 = [max(0, line2[0] - threshold), line2[1]]
  792. p4 = [min(line2[2] + threshold, page_w), line2[3]]
  793. else:
  794. p3 = [line2[0], max(0, line2[1] - threshold)]
  795. p4 = [line2[2], min(line2[3] + threshold, page_h)]
  796. line2 = [p3[0], p3[1], p4[0], p4[1]]
  797. if line1 == line2:
  798. continue
  799. if is_cross(p1, p2, p3, p4):
  800. _times += 1
  801. if _times >= cross_times:
  802. _cross_line_list += [line1]
  803. break
  804. return _cross_line_list
  805. def repair_bias_line(_line_list):
  806. temp_list = []
  807. for line in _line_list:
  808. x0, y0, x1, y1 = line
  809. _y = min(y0, y1)
  810. _x = min(x0, x1)
  811. if abs(x0 - x1) > abs(y0 - y1):
  812. temp_list.append([x0, _y, x1, _y])
  813. else:
  814. temp_list.append([_x, y0, _x, y1])
  815. _line_list = temp_list
  816. return _line_list
  817. def repair_col_line(_straight_list, _bias_list, threshold=2, min_width=7):
  818. if not _straight_list or not _bias_list:
  819. print('add_col_bias_line empty', len(_straight_list), len(_bias_list))
  820. return []
  821. # 分列
  822. _straight_list.sort(key=lambda x: (x[0], x[1]))
  823. cols = []
  824. col = []
  825. current_w = _straight_list[0][0]
  826. for line in _straight_list:
  827. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  828. continue
  829. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold:
  830. col.append(line)
  831. else:
  832. if col:
  833. cols.append(col)
  834. col = [line]
  835. current_w = line[0]
  836. if col:
  837. cols.append(col)
  838. # 补充col
  839. new_list = []
  840. for line in bias_line_list:
  841. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  842. continue
  843. for col in cols:
  844. w = col[0][0]
  845. if w - threshold <= line[0] <= w + threshold or w - threshold <= line[2] <= w + threshold:
  846. new_list.append([w, line[1] - 3, w, line[3] + 3])
  847. new_list += _straight_list
  848. # 去重
  849. new_list = [str(x) for x in new_list]
  850. new_list = list(set(new_list))
  851. new_list = [eval(x) for x in new_list]
  852. # 分列
  853. new_list.sort(key=lambda x: (x[0], x[1]))
  854. cols = []
  855. col = []
  856. current_w = new_list[0][0]
  857. for line in new_list:
  858. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  859. continue
  860. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold:
  861. col.append(line)
  862. else:
  863. if col:
  864. cols.append(col)
  865. col = [line]
  866. current_w = line[0]
  867. if col:
  868. cols.append(col)
  869. # 删除col
  870. for col1 in cols:
  871. for col2 in cols:
  872. if col1 == col2 or abs(col1[0][0] - col2[0][0]) > min_width:
  873. continue
  874. col1_len, col2_len = 0, 0
  875. for c in col1:
  876. col1_len += abs(c[1] - c[3])
  877. for c in col2:
  878. col2_len += abs(c[1] - c[3])
  879. if col1_len > col2_len * 3:
  880. for c in col2:
  881. if c in new_list:
  882. new_list.remove(c)
  883. if col2_len > col1_len * 3:
  884. for c in col1:
  885. if c in new_list:
  886. new_list.remove(c)
  887. return new_list
  888. def merge_line(_line_list, threshold=2):
  889. new_line_list = []
  890. # 分列
  891. _line_list.sort(key=lambda x: (x[0], x[1]))
  892. cols = []
  893. col = [_line_list[0]]
  894. current_w = _line_list[0][0]
  895. for line in _line_list:
  896. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  897. continue
  898. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
  899. and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
  900. col.append(line)
  901. else:
  902. if col:
  903. cols.append(col)
  904. col = [line]
  905. current_w = line[0]
  906. if col:
  907. cols.append(col)
  908. for col in cols:
  909. temp_c = col[0]
  910. col_w = col[0][0]
  911. for i in range(len(col) - 1):
  912. c = col[i]
  913. next_c = col[i + 1]
  914. if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]):
  915. temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
  916. max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
  917. else:
  918. new_line_list.append(temp_c)
  919. temp_c = next_c
  920. if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
  921. new_line_list.append(temp_c)
  922. # 分行
  923. _line_list.sort(key=lambda x: (x[1], x[0]))
  924. rows = []
  925. row = []
  926. current_h = _line_list[0][1]
  927. for line in _line_list:
  928. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  929. continue
  930. if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
  931. row.append(line)
  932. else:
  933. if row:
  934. rows.append(row)
  935. row = [line]
  936. current_h = line[1]
  937. if row:
  938. rows.append(row)
  939. for row in rows:
  940. temp_r = row[0]
  941. row_h = row[0][1]
  942. for i in range(len(row) - 1):
  943. r = row[i]
  944. next_r = row[i + 1]
  945. # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
  946. if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0):
  947. temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
  948. max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
  949. else:
  950. new_line_list.append(temp_r)
  951. temp_r = next_r
  952. if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
  953. new_line_list.append(temp_r)
  954. return new_line_list
  955. def remove_outline_no_cross(_line_list):
  956. row_list = []
  957. col_list = []
  958. for line in _line_list:
  959. # 存所有行
  960. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  961. row_list.append(line)
  962. # 存所有列
  963. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  964. col_list.append(line)
  965. if not col_list:
  966. return _line_list
  967. # 左右两条边框
  968. col_list.sort(key=lambda x: (x[0], x[1]))
  969. left_col = col_list[0]
  970. right_col = col_list[-1]
  971. # 判断有交点但中间区域无交点
  972. compare_list = []
  973. for col in [left_col, right_col]:
  974. add_h = abs(col[1]-col[3]) / 8
  975. center_area = [col[1]+add_h, col[3]-add_h]
  976. cross_cnt = 0
  977. center_cross_cnt = 0
  978. center_row_cnt = 0
  979. for row in row_list:
  980. if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
  981. if center_area[0] <= row[1] <= center_area[1]:
  982. center_cross_cnt += 1
  983. else:
  984. cross_cnt += 1
  985. else:
  986. if center_area[0] <= row[1] <= center_area[1]:
  987. center_row_cnt += 1
  988. compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
  989. _flag = True
  990. for c in compare_list:
  991. if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
  992. continue
  993. _flag = False
  994. print('compare_list', compare_list)
  995. if _flag and compare_list[0][1] == compare_list[1][1] \
  996. and compare_list[0][2] == compare_list[1][2]:
  997. for col in [left_col, right_col]:
  998. if col in _line_list:
  999. _line_list.remove(col)
  1000. return _line_list
  1001. log('into get_page_lines')
  1002. page_h = layout.height
  1003. page_w = layout.width
  1004. element_list = []
  1005. line_list = []
  1006. bias_line_list = []
  1007. text_bbox_list = []
  1008. for element in layout:
  1009. if isinstance(element, LTTextContainer):
  1010. text_bbox_list.append(element.bbox)
  1011. # 只取这三种类型的bbox
  1012. if isinstance(element, (LTRect, LTCurve, LTLine)):
  1013. element_list.append(element)
  1014. if element.height > 0.5 and element.width > 0.5:
  1015. # print('element.height, element.width', element.height, element.width)
  1016. k = calculate_k(element.bbox)
  1017. if 1.73 / 3 < abs(k) < 1.73:
  1018. continue
  1019. else:
  1020. bias_line_list.append(element.bbox)
  1021. continue
  1022. line_list.append(element.bbox)
  1023. if not line_list and not bias_line_list:
  1024. return []
  1025. # 是否使用斜线来生成表格
  1026. if len(line_list) < 6 and len(bias_line_list) > len(line_list) * 2:
  1027. # print('use bias line')
  1028. # bias_line_list += add_col_bias_line(line_list, bias_line_list)
  1029. line_list = bias_line_list
  1030. # 去重
  1031. line_list = [str(x) for x in line_list]
  1032. line_list = list(set(line_list))
  1033. line_list = [eval(x) for x in line_list]
  1034. # 根据是否有交点判断表格线
  1035. cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
  1036. if not cross_line_list:
  1037. return []
  1038. # 斜线校正
  1039. if cross_line_list:
  1040. cross_line_list = repair_bias_line(cross_line_list)
  1041. # 修复竖线
  1042. if bias_line_list:
  1043. cross_line_list = repair_col_line(cross_line_list, bias_line_list)
  1044. # 根据是否有交点判断表格线
  1045. cross_line_list = get_cross_line(cross_line_list, threshold=1, cross_times=1)
  1046. # 合并线条
  1047. if not cross_line_list:
  1048. return []
  1049. cross_line_list = merge_line(cross_line_list)
  1050. # 删除最外层嵌套边框
  1051. cross_line_list = remove_outline_no_cross(cross_line_list)
  1052. # show
  1053. # print('len(cross_line_list)', len(cross_line_list))
  1054. # _plot(line_list, mode=2)
  1055. # _plot(cross_line_list, mode=2)
  1056. lt_line_list = []
  1057. for line in cross_line_list:
  1058. lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
  1059. (float(line[2]), float(line[3]))))
  1060. log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
  1061. return lt_line_list
  1062. def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
  1063. list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list)
  1064. self._page.in_table_objs = filter_objs
  1065. # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
  1066. for table in list_tables:
  1067. _table = _Table(table["table"], table["bbox"])
  1068. # self._page.children.append(_table)
  1069. self._page.add_child(_table)
  1070. list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
  1071. layout.bbox, page_no)
  1072. for sentence in list_sentences:
  1073. _sen = _Sentence(sentence.text, sentence.bbox)
  1074. self._page.add_child(_sen)
  1075. # pdf对象需反向排序
  1076. self._page.is_reverse = True
  1077. def is_text_legal(self, lt_text_list, page_no):
  1078. # 无法识别pdf字符编码,整页用ocr
  1079. text_temp = ""
  1080. for _t in lt_text_list:
  1081. text_temp += _t.get_text()
  1082. if re.search('[(]cid:[0-9]+[)]', text_temp):
  1083. log("text has cid! try pymupdf...")
  1084. page_image = self.get_page_image(page_no)
  1085. if judge_error_code(page_image):
  1086. self._page.error_code = page_image
  1087. else:
  1088. _image = _Image(page_image[1], page_image[0])
  1089. self._page.add_child(_image)
  1090. return False
  1091. match1 = re.findall(get_garble_code(), text_temp)
  1092. # match2 = re.search('[\u4e00-\u9fa5]', text_temp)
  1093. if len(match1) > 3 and len(text_temp) > 10:
  1094. log("pdf garbled code! try pymupdf... " + text_temp[:20])
  1095. page_image = self.get_page_image(page_no)
  1096. if judge_error_code(page_image):
  1097. self._page.error_code = page_image
  1098. else:
  1099. _image = _Image(page_image[1], page_image[0])
  1100. self._page.add_child(_image)
  1101. return False
  1102. return True
  1103. def judge_b_table(self, lt_text_list):
  1104. # 先分行
  1105. lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
  1106. lt_text_row_list = []
  1107. current_h = lt_text_list[0].bbox[1]
  1108. row = []
  1109. threshold = 2
  1110. for lt_text in lt_text_list:
  1111. bbox = lt_text.bbox
  1112. if current_h - threshold <= bbox[1] <= current_h + threshold:
  1113. row.append(lt_text)
  1114. else:
  1115. if row:
  1116. lt_text_row_list.append(row)
  1117. row = [lt_text]
  1118. current_h = lt_text.bbox[1]
  1119. if row:
  1120. lt_text_row_list.append(row)
  1121. # print('lt_text_row_list')
  1122. # for r in lt_text_row_list:
  1123. # print('r', [x.get_text() for x in r])
  1124. # 判断文本中间是否是空格,或一行文本中间有多个
  1125. is_b_table_flag = False
  1126. is_b_table_cnt = 3
  1127. tolerate_cnt = 2
  1128. t_cnt = 0
  1129. row_cnt = 0
  1130. for row in lt_text_row_list:
  1131. # 水印行跳过
  1132. if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
  1133. continue
  1134. if len(row) == 1:
  1135. text = row[0].get_text()
  1136. bbox = row[0].bbox
  1137. match = re.search('[ ]{3,}', text)
  1138. if match and re.search('[\u4e00-\u9fff]{2,}', text[:match.span()[0]]) \
  1139. and re.search('[\u4e00-\u9fff]{2,}', text[match.span()[1]:]):
  1140. row_cnt += 1
  1141. t_cnt = 0
  1142. else:
  1143. # 容忍
  1144. if t_cnt < tolerate_cnt:
  1145. t_cnt += 1
  1146. continue
  1147. row_cnt = 0
  1148. else:
  1149. row_cnt += 1
  1150. t_cnt = 0
  1151. if row_cnt >= is_b_table_cnt:
  1152. is_b_table_flag = True
  1153. break
  1154. log('pdf is_b_table_flag ' + str(is_b_table_flag))
  1155. return is_b_table_flag
  1156. def convert_page(self, page, page_no):
  1157. layout = self.get_layout(page, page_no)
  1158. if self._doc.error_code is not None:
  1159. return
  1160. if judge_error_code(layout):
  1161. self._page.error_code = layout
  1162. return
  1163. # 判断该页的对象类型,并存储
  1164. lt_text_list = []
  1165. lt_image_list = []
  1166. for x in layout:
  1167. if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
  1168. lt_text_list.append(x)
  1169. if isinstance(x, LTFigure):
  1170. for y in x:
  1171. if isinstance(y, LTImage):
  1172. # 小的图忽略
  1173. if y.width <= 300 and y.height <= 300:
  1174. continue
  1175. # 图的width超过layout width,很大可能是水印
  1176. if y.width > layout.width + 20:
  1177. continue
  1178. lt_image_list.append(y)
  1179. lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
  1180. log("convert_pdf page " + str(page_no))
  1181. log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
  1182. log('layout.width, layout.height' + str(layout.width) + str(layout.height))
  1183. # 若只有文本且图片数为0,直接提取文字及表格
  1184. # if only_image == 0 and image_count == 0:
  1185. # if len(lt_image_list) == 0 and len(lt_text_list) > 0:
  1186. # # PDFPlumber
  1187. # if self.has_init_pdf[3] == 0:
  1188. # self.init_package("pdfplumber")
  1189. # if self._doc.error_code is not None:
  1190. # self._doc.error_code = None
  1191. # log("init pdfplumber failed! try pymupdf...")
  1192. # # 调用pdfplumber获取pdf图片报错,则使用pypdf2将pdf转html
  1193. # page_image = self.get_page_image(page_no)
  1194. # if judge_error_code(page_image):
  1195. # self._page.error_code = page_image
  1196. # else:
  1197. # _image = _Image(page_image[1], page_image[0])
  1198. # self._page.add_child(_image)
  1199. # return
  1200. #
  1201. # if not self.is_text_legal(lt_text_list, page_no):
  1202. # return
  1203. #
  1204. # # 根据text规律,判断该页是否可能有无边框表格
  1205. # start_time = time.time()
  1206. # if self.judge_b_table(lt_text_list):
  1207. # page_image = self.get_page_image(page_no)
  1208. # if judge_error_code(page_image):
  1209. # self._page.error_code = page_image
  1210. # else:
  1211. # _image = _Image(page_image[1], page_image[0])
  1212. # _image.is_from_pdf = True
  1213. # _image.b_table_from_text = True
  1214. # _image.b_table_text_obj_list = lt_text_list
  1215. # _image.b_table_layout_size = (layout.width, layout.height)
  1216. # self._page.add_child(_image)
  1217. # log('convert_pdf judge_b_table set image cost: ' + str(time.time()-start_time))
  1218. #
  1219. # try:
  1220. # lt_line_list = self.get_page_lines(layout, page_no)
  1221. # except:
  1222. # traceback.print_exc()
  1223. # lt_line_list = []
  1224. # self._page.error_code = [-13]
  1225. # try:
  1226. # # lt_line_list = self.get_text_lines(page,page_no)
  1227. # self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
  1228. # except:
  1229. # traceback.print_exc()
  1230. # self._page.error_code = [-8]
  1231. # 若该页图片数量过多,或无文本,则直接ocr整页识别
  1232. # elif image_count > 3 or only_image == 1:
  1233. if len(lt_image_list) > 3 or len(lt_text_list) == 0:
  1234. page_image = self.get_page_image(page_no)
  1235. if judge_error_code(page_image):
  1236. self._page.error_code = page_image
  1237. else:
  1238. _image = _Image(page_image[1], page_image[0])
  1239. _image.is_from_pdf = True
  1240. self._page.add_child(_image)
  1241. # 正常读取该页对象
  1242. else:
  1243. # 图表对象
  1244. for image in lt_image_list:
  1245. try:
  1246. print("pdf2text LTImage size", page_no, image.width, image.height)
  1247. image_stream = image.stream.get_data()
  1248. # 小的图忽略
  1249. if image.width <= 300 and image.height <= 300:
  1250. continue
  1251. # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
  1252. img_test = Image.open(io.BytesIO(image_stream))
  1253. if image.height >= 1000 and image.width >= 1000:
  1254. page_image = self.get_page_image(page_no)
  1255. if judge_error_code(page_image):
  1256. self._page.error_code = page_image
  1257. else:
  1258. _image = _Image(page_image[1], page_image[0])
  1259. _image.is_from_pdf = True
  1260. self._page.add_child(_image)
  1261. return
  1262. # 比较小的图则直接保存用ocr识别
  1263. else:
  1264. temp_path = self.unique_type_dir + 'page' + str(page_no) \
  1265. + '_lt' + str(lt_image_list.index(image)) + '.jpg'
  1266. img_test.save(temp_path)
  1267. with open(temp_path, "rb") as ff:
  1268. image_stream = ff.read()
  1269. _image = _Image(image_stream, temp_path, image.bbox)
  1270. self._page.add_child(_image)
  1271. except Exception:
  1272. log("pdf2text pdfminer read image in page " + str(page_no) +
  1273. " fail! use pymupdf read image...")
  1274. traceback.print_exc()
  1275. # pdf对象需反向排序
  1276. self._page.is_reverse = True
  1277. self.init_package("pdfplumber")
  1278. if not self.is_text_legal(lt_text_list, page_no):
  1279. return
  1280. # 根据text规律,判断该页是否可能有无边框表格
  1281. if self.judge_b_table(lt_text_list):
  1282. page_image = self.get_page_image(page_no)
  1283. if judge_error_code(page_image):
  1284. self._page.error_code = page_image
  1285. else:
  1286. _image = _Image(page_image[1], page_image[0])
  1287. _image.is_from_pdf = True
  1288. _image.b_table_from_text = True
  1289. _image.b_table_text_obj_list = lt_text_list
  1290. _image.b_table_layout_size = (layout.width, layout.height)
  1291. self._page.add_child(_image)
  1292. try:
  1293. lt_line_list = self.get_page_lines(layout, page_no)
  1294. except:
  1295. traceback.print_exc()
  1296. lt_line_list = []
  1297. self._page.error_code = [-13]
  1298. self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
  1299. def get_layout(self, page, page_no):
  1300. log("get_layout")
  1301. if self.has_init_pdf[0] == 0:
  1302. self.init_package("pdfminer")
  1303. if self._doc.error_code is not None:
  1304. return
  1305. # 获取该页layout
  1306. start_time = time.time()
  1307. try:
  1308. if get_platform() == "Windows":
  1309. # origin_pdf_analyze = pdf_analyze.__wrapped__
  1310. # layout = origin_pdf_analyze(self.interpreter, page, self.device)
  1311. layout = pdf_analyze(self.interpreter, page, self.device, page_no)
  1312. else:
  1313. layout = pdf_analyze(self.interpreter, page, self.device, page_no)
  1314. except TimeoutError as e:
  1315. log("pdf2text pdfminer read pdf page " + str(page_no) + " time out! " + str(time.time() - start_time))
  1316. layout = [-4]
  1317. except Exception:
  1318. traceback.print_exc()
  1319. log("pdf2text pdfminer read pdf page " + str(page_no) + " error! continue...")
  1320. layout = [-3]
  1321. return layout
  1322. def get_page_image(self, page_no):
  1323. log("")
  1324. try:
  1325. if self.has_init_pdf[1] == 0:
  1326. self.init_package("PyMuPDF")
  1327. if self._doc.error_code is not None:
  1328. return
  1329. # save_dir = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
  1330. output = self.unique_type_dir + "page" + str(page_no) + ".png"
  1331. page = self.doc_pymupdf.loadPage(page_no)
  1332. rotate = int(0)
  1333. zoom_x = 2.
  1334. zoom_y = 2.
  1335. mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  1336. pix = page.getPixmap(matrix=mat, alpha=False)
  1337. pix.writePNG(output)
  1338. # 输出图片resize
  1339. self.resize_image(output)
  1340. with open(output, "rb") as f:
  1341. pdf_image = f.read()
  1342. return [output, pdf_image]
  1343. except ValueError as e:
  1344. traceback.print_exc()
  1345. if str(e) == "page not in document":
  1346. log("pdf2Image page not in document! continue... page " + str(page_no))
  1347. return [0]
  1348. elif "encrypted" in str(e):
  1349. log("pdf2Image document need password " + str(page_no))
  1350. return [-7]
  1351. except RuntimeError as e:
  1352. if "cannot find page" in str(e):
  1353. log("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
  1354. return [0]
  1355. else:
  1356. traceback.print_exc()
  1357. return [-3]
  1358. def get_all_page_image(self):
  1359. log("")
  1360. if self.has_init_pdf[1] == 0:
  1361. self.init_package("PyMuPDF")
  1362. if self._doc.error_code is not None:
  1363. return
  1364. page_count = self.doc_pymupdf.page_count
  1365. for page_no in range(page_count):
  1366. # 限制pdf页数,只取前10页后10页
  1367. if page_count > 20:
  1368. if 10 <= page_no < page_count - 10:
  1369. continue
  1370. self._page = _Page(None, page_no)
  1371. page_image = self.get_page_image(page_no)
  1372. if judge_error_code(page_image):
  1373. self._page.error_code = page_image
  1374. else:
  1375. _image = _Image(page_image[1], page_image[0])
  1376. self._page.add_child(_image)
  1377. # 报错继续读后面页面
  1378. if self._doc.error_code is None and self._page.error_code is not None:
  1379. continue
  1380. self._doc.add_child(self._page)
  1381. def connect_table(self, html_list):
  1382. if not html_list:
  1383. return html_list
  1384. # 判断条件1:最后一个表格后有无非页码文本/第一个表格前有无文本
  1385. connect_flag_list = []
  1386. soup_list = []
  1387. for i, h in enumerate(html_list):
  1388. soup_list.append(BeautifulSoup(h, 'lxml'))
  1389. # 找最后一个表格
  1390. table_start1, table_end1 = None, None
  1391. # print('h', h)
  1392. match = re.finditer('<table', h)
  1393. for m in match:
  1394. table_start1 = m.span()[0]
  1395. if table_start1 is not None:
  1396. match = re.finditer('</table>', h[table_start1:])
  1397. for m in match:
  1398. table_end1 = m.span()[1] + table_start1
  1399. # 最后一个表格后有无除了页码外的内容
  1400. connect_flag1 = False
  1401. if table_end1 is not None:
  1402. match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[table_end1:]))
  1403. # print('match1', match.group())
  1404. if not match or match.group() == '':
  1405. connect_flag1 = True
  1406. # 找第一个表格
  1407. table_start2, table_end2 = None, None
  1408. match = re.finditer('<table', h)
  1409. for m in match:
  1410. table_start2 = m.span()[0]
  1411. break
  1412. # 第一个表格后有无内容
  1413. connect_flag2 = False
  1414. if table_start2 is not None and table_start2 == 0:
  1415. connect_flag2 = True
  1416. connect_flag_list.append([i, connect_flag2, connect_flag1])
  1417. # print('connect_flag_list', connect_flag_list)
  1418. # 根据条件1合并需连接页码,形成组
  1419. connect_pages_list = []
  1420. temp_list = []
  1421. for i, c in enumerate(connect_flag_list):
  1422. if temp_list and c[1]:
  1423. temp_list.append(c)
  1424. elif not temp_list and c[2]:
  1425. temp_list.append(c)
  1426. else:
  1427. if temp_list:
  1428. connect_pages_list.append(temp_list)
  1429. temp_list = []
  1430. connect_pages_list.append([c])
  1431. if temp_list:
  1432. connect_pages_list.append(temp_list)
  1433. # print('connect_pages_list', connect_pages_list)
  1434. # 判断条件2:判断组内列数是否相同
  1435. connect_pages_list2 = []
  1436. for c_list in connect_pages_list:
  1437. if len(c_list) == 1:
  1438. connect_pages_list2.append(c_list)
  1439. else:
  1440. col_cnt_list = []
  1441. for c in c_list:
  1442. soup = soup_list[c[0]]
  1443. table1 = soup.findAll('table')[-1]
  1444. table2 = soup.findAll('table')[0]
  1445. tr1 = table1.findAll('tr')
  1446. tr2 = table2.findAll('tr')
  1447. td1 = tr1[-1].findAll('td')
  1448. td2 = tr2[0].findAll('td')
  1449. col_cnt_list.append([len(td2), len(td1)])
  1450. new_c_list = [c_list[0]]
  1451. # print('col_cnt_list', col_cnt_list)
  1452. for i in range(len(col_cnt_list) - 1):
  1453. if col_cnt_list[i][1] != col_cnt_list[i + 1][0]:
  1454. connect_pages_list2.append(new_c_list)
  1455. new_c_list = [c_list[i + 1]]
  1456. else:
  1457. new_c_list.append(c_list[i + 1])
  1458. if new_c_list:
  1459. connect_pages_list2.append(new_c_list)
  1460. # print('connect_pages_list2', connect_pages_list2)
  1461. # 符合连接条件的拼接表格
  1462. new_html_list = []
  1463. for c_list in connect_pages_list2:
  1464. if len(c_list) == 1:
  1465. new_html_list.append(html_list[c_list[0][0]])
  1466. continue
  1467. new_html = ''
  1468. for c in c_list:
  1469. new_html += html_list[c[0]]
  1470. new_html = re.sub('</table>([-/第页0-9]|<div>|</div>)*<table border="1">', '<tr><td>#@#@#</td></tr>',
  1471. new_html)
  1472. soup = BeautifulSoup(new_html, 'lxml')
  1473. trs = soup.findAll('tr')
  1474. for i in range(len(trs)):
  1475. if trs[i].get_text() == '#@#@#':
  1476. td1 = trs[i - 1].findAll('td')
  1477. td2 = trs[i + 1].findAll('td')
  1478. if td2[0].get_text() == '':
  1479. for j in range(len(td1)):
  1480. td1[j].string = td1[j].get_text() + td2[j].get_text()
  1481. trs[i + 1].decompose()
  1482. trs[i].decompose()
  1483. new_html = str(soup)
  1484. new_html_list.append(new_html)
  1485. html_str = ''
  1486. for h in new_html_list:
  1487. html_str += h
  1488. return [html_str]
  1489. def get_html(self):
  1490. if self._doc.error_code is not None:
  1491. return self._doc.error_code
  1492. self.convert()
  1493. if self._doc.error_code is not None:
  1494. return self._doc.error_code
  1495. html = self._doc.get_html(return_list=True)
  1496. # 表格连接
  1497. try:
  1498. html = self.connect_table(html)
  1499. except:
  1500. traceback.print_exc()
  1501. return [-12]
  1502. return html
  1503. def delete_water_mark(self, lt_text_list, page_bbox, times=5):
  1504. # 删除过多重复字句,为水印
  1505. duplicate_dict = {}
  1506. for _obj in lt_text_list:
  1507. t = _obj.get_text()
  1508. if t in duplicate_dict.keys():
  1509. duplicate_dict[t][0] += 1
  1510. duplicate_dict[t][1].append(_obj)
  1511. else:
  1512. duplicate_dict[t] = [1, [_obj]]
  1513. delete_text = []
  1514. for t in duplicate_dict.keys():
  1515. if duplicate_dict[t][0] >= times:
  1516. obj_list = duplicate_dict[t][1]
  1517. obj_list.sort(key=lambda x: x.bbox[3])
  1518. obj_distance_h = abs(obj_list[-1].bbox[3] - obj_list[0].bbox[1])
  1519. obj_list.sort(key=lambda x: x.bbox[2])
  1520. obj_distance_w = abs(obj_list[-1].bbox[2] - obj_list[0].bbox[0])
  1521. if obj_distance_h >= abs(page_bbox[1] - page_bbox[3]) * 0.7 \
  1522. and obj_distance_w >= abs(page_bbox[0] - page_bbox[2]) * 0.7:
  1523. delete_text.append(t)
  1524. temp_text_list = []
  1525. for _obj in lt_text_list:
  1526. t = _obj.get_text()
  1527. if t not in delete_text:
  1528. temp_text_list.append(_obj)
  1529. return temp_text_list
  1530. def resize_image(self, img_path, max_size=2000):
  1531. _img = cv2.imread(img_path)
  1532. if _img.shape[0] <= max_size or _img.shape[1] <= max_size:
  1533. return
  1534. else:
  1535. resize_axis = 0 if _img.shape[0] >= _img.shape[1] else 1
  1536. ratio = max_size / _img.shape[resize_axis]
  1537. new_shape = [0, 0]
  1538. new_shape[resize_axis] = max_size
  1539. new_shape[1 - resize_axis] = int(_img.shape[1 - resize_axis] * ratio)
  1540. _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
  1541. cv2.imwrite(img_path, _img)
  1542. def get_single_pdf(self, path, page_no):
  1543. log("into get_single_pdf")
  1544. try:
  1545. pdf_origin = copy.deepcopy(self.doc_pypdf2)
  1546. pdf_new = copy.deepcopy(self.doc_pypdf2_new)
  1547. pdf_new.addPage(pdf_origin.getPage(page_no))
  1548. path_new = path.split(".")[0] + "_split.pdf"
  1549. with open(path_new, "wb") as ff:
  1550. pdf_new.write(ff)
  1551. return path_new
  1552. except PyPDF2.utils.PdfReadError as e:
  1553. return [-3]
  1554. except Exception as e:
  1555. log("get_single_pdf error! page " + str(page_no))
  1556. return [-3]
  1557. def get_text_font():
  1558. def flags_decomposer(flags):
  1559. """Make font flags human readable."""
  1560. l = []
  1561. if flags & 2 ** 0:
  1562. l.append("superscript")
  1563. if flags & 2 ** 1:
  1564. l.append("italic")
  1565. if flags & 2 ** 2:
  1566. l.append("serifed")
  1567. else:
  1568. l.append("sans")
  1569. if flags & 2 ** 3:
  1570. l.append("monospaced")
  1571. else:
  1572. l.append("proportional")
  1573. if flags & 2 ** 4:
  1574. l.append("bold")
  1575. return ", ".join(l)
  1576. def get_underlined_textLines(page):
  1577. """
  1578. 获取某页pdf上的所有下划线文本信息
  1579. :param page: fitz中的一页
  1580. :return: list of tuples,每个tuple都是一个完整的下划线覆盖的整体:[(下划线句, 所在blk_no, 所在line_no), ...]
  1581. """
  1582. paths = page.get_drawings() # get drawings on the current page
  1583. # 获取该页内所有的height很小的bbox。因为下划线其实大多是这种矩形
  1584. # subselect things we may regard as lines
  1585. lines = []
  1586. for p in paths:
  1587. for item in p["items"]:
  1588. if item[0] == "l": # an actual line
  1589. p1, p2 = item[1:]
  1590. if p1.y == p2.y:
  1591. lines.append((p1, p2))
  1592. elif item[0] == "re": # a rectangle: check if height is small
  1593. r = item[1]
  1594. if r.width > r.height and r.height <= 2:
  1595. lines.append((r.tl, r.tr)) # take top left / right points
  1596. # 获取该页的`max_lineheight`,用于下面比较距离使用
  1597. blocks = page.get_text("dict", flags=11)["blocks"]
  1598. max_lineheight = 0
  1599. for b in blocks:
  1600. for l in b["lines"]:
  1601. bbox = fitz.Rect(l["bbox"])
  1602. if bbox.height > max_lineheight:
  1603. max_lineheight = bbox.height
  1604. underlined_res = []
  1605. # 开始对下划线内容进行查询
  1606. # make a list of words
  1607. words = page.get_text("words")
  1608. # if underlined, the bottom left / right of a word
  1609. # should not be too far away from left / right end of some line:
  1610. for wdx, w in enumerate(words): # w[4] is the actual word string
  1611. r = fitz.Rect(w[:4]) # first 4 items are the word bbox
  1612. for p1, p2 in lines: # check distances for start / end points
  1613. if abs(r.bl - p1) <= max_lineheight: # 当前word的左下满足下划线左下
  1614. if abs(r.br - p2) <= max_lineheight: # 当前word的右下满足下划线右下(单个词,无空格)
  1615. print(f"Word '{w[4]}' is underlined! Its block-line number is {w[-3], w[-2]}")
  1616. underlined_res.append((w[4], w[-3], w[-2])) # 分别是(下划线词,所在blk_no,所在line_no)
  1617. break # don't check more lines
  1618. else: # 继续寻找同line右侧的有缘人,因为有些下划线覆盖的词包含多个词,多个词之间有空格
  1619. curr_line_num = w[-2] # line nunmber
  1620. for right_wdx in range(wdx + 1, len(words), 1):
  1621. _next_w = words[right_wdx]
  1622. if _next_w[-2] != curr_line_num: # 当前遍历到的右侧word已经不是当前行的了(跨行是不行的)
  1623. break
  1624. _r_right = fitz.Rect(_next_w[:4]) # 获取当前同行右侧某word的方框4点
  1625. if abs(_r_right.br - p2) <= max_lineheight: # 用此word右下点和p2(目标下划线右上点)算距离,距离要小于max_lineheight
  1626. print(
  1627. f"Word '{' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]])}' is underlined! " +
  1628. f"Its block-line number is {w[-3], w[-2]}")
  1629. underlined_res.append(
  1630. (' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]]),
  1631. w[-3], w[-2])
  1632. ) # 分别是(下划线词,所在blk_no,所在line_no)
  1633. break # don't check more lines
  1634. return underlined_res
  1635. _p = r'C:\Users\Administrator\Desktop\test_pdf\error2-2.pdf'
  1636. doc_pymupdf = read_pymupdf(_p)
  1637. page = doc_pymupdf[0]
  1638. blocks = page.get_text("dict", flags=11)["blocks"]
  1639. for b in blocks: # iterate through the text blocks
  1640. for l in b["lines"]: # iterate through the text lines
  1641. for s in l["spans"]: # iterate through the text spans
  1642. print("")
  1643. font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
  1644. s["font"], # font name
  1645. flags_decomposer(s["flags"]), # readable font flags
  1646. s["size"], # font size
  1647. s["color"], # font color
  1648. )
  1649. print(s)
  1650. print("Text: '%s'" % s["text"]) # simple print of text
  1651. print(font_properties)
  1652. get_underlined_textLines(page)
# The code below is a ready-made interface for parsing a single PDF page
  1654. class ParseSentence:
  1655. def __init__(self, bbox, fontname, fontsize, _text, _title, title_text, _pattern, title_degree, is_outline,
  1656. outline_location, page_no):
  1657. (x0, y0, x1, y1) = bbox
  1658. self.x0 = x0
  1659. self.y0 = y0
  1660. self.x1 = x1
  1661. self.y1 = y1
  1662. self.bbox = bbox
  1663. self.fontname = fontname
  1664. self.fontsize = fontsize
  1665. self.text = _text
  1666. self.title = _title
  1667. self.title_text = title_text
  1668. self.groups = _pattern
  1669. self.title_degree = title_degree
  1670. self.is_outline = is_outline
  1671. self.outline_location = outline_location
  1672. self.page_no = page_no
  1673. def __repr__(self):
  1674. return "%s,%s,%s,%d,%s" % (self.text, self.title, self.is_outline, self.outline_location, str(self.bbox))
  1675. class ParseUtils:
  1676. @staticmethod
  1677. def getFontinfo(_page):
  1678. for _obj in _page._objs:
  1679. if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
  1680. for textline in _obj._objs:
  1681. done = False
  1682. for lchar in textline._objs:
  1683. if isinstance(lchar, (LTChar)):
  1684. _obj.fontname = lchar.fontname
  1685. _obj.fontsize = lchar.size
  1686. done = True
  1687. break
  1688. if done:
  1689. break
  1690. @staticmethod
  1691. def recognize_sentences(list_textbox, filter_objs, page_bbox, page_no,
  1692. remove_space=True, sourceP_LB=True):
  1693. list_textbox.sort(key=lambda x: x.bbox[0])
  1694. list_textbox.sort(key=lambda x: x.bbox[3], reverse=sourceP_LB)
  1695. cluster_textbox = []
  1696. for _textbox in list_textbox:
  1697. if _textbox in filter_objs:
  1698. continue
  1699. _find = False
  1700. for _ct in cluster_textbox:
  1701. if abs(_ct["y"] - _textbox.bbox[1]) < 5:
  1702. _find = True
  1703. _ct["textbox"].append(_textbox)
  1704. if not _find:
  1705. cluster_textbox.append({"y": _textbox.bbox[1], "textbox": [_textbox]})
  1706. cluster_textbox.sort(key=lambda x: x["y"], reverse=sourceP_LB)
  1707. list_sentences = []
  1708. for _line in cluster_textbox:
  1709. _textboxs = _line["textbox"]
  1710. _textboxs.sort(key=lambda x: x.bbox[0])
  1711. _linetext = _textboxs[0].get_text()
  1712. for _i in range(1, len(_textboxs)):
  1713. if abs(_textboxs[_i].bbox[0] - _textboxs[_i - 1].bbox[2]) > 60:
  1714. if _linetext[-1] not in (",", ",", "。", ".", "、", ";"):
  1715. _linetext += "=,="
  1716. _linetext += _textboxs[_i].get_text()
  1717. _linetext = re.sub("[\s\r\n]", "", _linetext)
  1718. _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
  1719. _textboxs[-1].bbox[2], _textboxs[-1].bbox[3])
  1720. _title = None
  1721. _pattern_groups = None
  1722. title_text = ""
  1723. if not _title:
  1724. _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
  1725. if _groups:
  1726. _title = _groups[0][0]
  1727. title_text = _groups[0][1]
  1728. _pattern_groups = _groups
  1729. if not _title:
  1730. _groups = ParseUtils.find_title_by_pattern(_linetext)
  1731. if _groups:
  1732. _title = _groups[0][0]
  1733. title_text = _groups[0][1]
  1734. _pattern_groups = _groups
  1735. if not _title:
  1736. _title = ParseUtils.rec_incenter(_bbox, page_bbox)
  1737. title_degree = 2
  1738. if not _title:
  1739. _linetext = _linetext.replace("=,=", ",")
  1740. else:
  1741. _linetext = _linetext.replace("=,=", "")
  1742. title_degree = int(_title.split("_")[1])
  1743. # 页码
  1744. if ParseUtils.rec_incenter(_bbox, page_bbox) and re.search("^\d+$", _linetext) is not None:
  1745. continue
  1746. if _linetext == "" or re.search("^,+$", _linetext) is not None:
  1747. continue
  1748. is_outline = False
  1749. outline_location = -1
  1750. _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$", _linetext)
  1751. if _search is not None:
  1752. is_outline = True
  1753. _linetext = _search.group("text")
  1754. outline_location = int(_search.group("nums"))
  1755. list_sentences.append(
  1756. ParseSentence(_bbox, _textboxs[-1].__dict__.get("fontname"), _textboxs[-1].__dict__.get("fontsize"),
  1757. _linetext, _title, title_text, _pattern_groups, title_degree, is_outline,
  1758. outline_location, page_no))
  1759. # for _sen in list_sentences:
  1760. # print(_sen.__dict__)
  1761. return list_sentences
  1762. @staticmethod
  1763. def find_title_by_pattern(_text,
  1764. _pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
  1765. "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
  1766. "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
  1767. "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]))|" \
  1768. "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]))|" \
  1769. "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]))|" \
  1770. "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\..、\s\-]))|" \
  1771. "(?P<title_15>^(?P<title_15_index_0_0>(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
  1772. "(?P<title_17>^(?P<title_17_index_0_0>(?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>)))|"
  1773. "(?P<title_19>^(?P<title_19_index_0_0>(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))|" \
  1774. ):
  1775. _se = re.search(_pattern, _text)
  1776. groups = []
  1777. if _se is not None:
  1778. _gd = _se.groupdict()
  1779. for k, v in _gd.items():
  1780. if v is not None:
  1781. groups.append((k, v))
  1782. if len(groups):
  1783. groups.sort(key=lambda x: x[0])
  1784. return groups
  1785. return None
  1786. @staticmethod
  1787. def rec_incenter(o_bbox, p_bbox):
  1788. p_width = p_bbox[2] - p_bbox[0]
  1789. l_space = (o_bbox[0] - p_bbox[0]) / p_width
  1790. r_space = (p_bbox[2] - o_bbox[2]) / p_width
  1791. if abs((l_space - r_space)) < 0.1 and l_space > 0.2:
  1792. return "title_2"
  1793. @staticmethod
  1794. def is_first_title(_title):
  1795. if _title is None:
  1796. return False
  1797. if re.search("^\d+$", _title) is not None:
  1798. if int(_title) == 1:
  1799. return True
  1800. return False
  1801. if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
  1802. if _title == "一":
  1803. return True
  1804. return False
  1805. if re.search("^[a-z]+$", _title) is not None:
  1806. if _title == "a":
  1807. return True
  1808. return False
  1809. if re.search("^[A-Z]+$", _title) is not None:
  1810. if _title == "A":
  1811. return True
  1812. return False
  1813. if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
  1814. if _title == "Ⅰ":
  1815. return True
  1816. return False
  1817. return False
  1818. @staticmethod
  1819. def get_next_title(_title):
  1820. if re.search("^\d+$", _title) is not None:
  1821. return str(int(_title) + 1)
  1822. if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
  1823. _next_title = ParseUtils.make_increase(['一', '二', '三', '四', '五', '六', '七', '八', '九', '十'],
  1824. re.sub("[十百]", '', _title))
  1825. _next_title = list(_next_title)
  1826. _next_title.reverse()
  1827. if _next_title[-1] != "十":
  1828. if len(_next_title) >= 2:
  1829. _next_title.insert(-1, '十')
  1830. if len(_next_title) >= 4:
  1831. _next_title.insert(-3, '百')
  1832. if _title[0] == "十":
  1833. if _next_title == "十":
  1834. _next_title = ["二", "十"]
  1835. _next_title.insert(0, "十")
  1836. _next_title = "".join(_next_title)
  1837. return _next_title
  1838. if re.search("^[a-z]+$", _title) is not None:
  1839. _next_title = ParseUtils.make_increase([chr(i + ord('a')) for i in range(26)], _title)
  1840. _next_title = list(_next_title)
  1841. _next_title.reverse()
  1842. return "".join(_next_title)
  1843. if re.search("^[A-Z]+$", _title) is not None:
  1844. _next_title = ParseUtils.make_increase([chr(i + ord('A')) for i in range(26)], _title)
  1845. _next_title = list(_next_title)
  1846. _next_title.reverse()
  1847. return "".join(_next_title)
  1848. if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
  1849. _sort = ["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ"]
  1850. _index = _sort.index(_title)
  1851. if _index < len(_sort) - 1:
  1852. return _sort[_index + 1]
  1853. return None
  1854. @staticmethod
  1855. def make_increase(_sort, _title, _add=1):
  1856. if len(_title) == 0 and _add == 0:
  1857. return ""
  1858. if len(_title) == 0 and _add == 1:
  1859. return _sort[0]
  1860. _index = _sort.index(_title[-1])
  1861. next_index = (_index + _add) % len(_sort)
  1862. next_chr = _sort[next_index]
  1863. if _index == len(_sort) - 1:
  1864. _add = 1
  1865. else:
  1866. _add = 0
  1867. return next_chr + ParseUtils.make_increase(_sort, _title[:-1], _add)
  1868. @staticmethod
  1869. def rec_serial(_text, o_bbox, p_bbox, fontname, _pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
  1870. "(?P<title_2>^\d+[\.、\s])|" \
  1871. "(?P<title_3>^\d+\.\d+[\.、\s])|" \
  1872. "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
  1873. "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
  1874. # todo :recog the serial of the sentence
  1875. _se = re.search(_pattern, _text)
  1876. if _se is not None:
  1877. _gd = _se.groupdict()
  1878. for k, v in _gd.items():
  1879. if v is not None:
  1880. return k
  1881. return None
  1882. if __name__ == '__main__':
  1883. # get_text_font()
  1884. PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
  1885. # print(b'\x10')