# convert_pdf.py
import copy
import io
import os
import re
import sys

from bs4 import BeautifulSoup

# Make sibling packages (vendored pdfplumber, format_convert, otr) importable.
# NOTE: this path hack must stay before the imports below that rely on it.
sys.path.append(os.path.dirname(__file__) + "/../")

from pdfplumber import PDF
from pdfplumber.table import TableFinder
from pdfplumber.page import Page as pdfPage
from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table
import time
import math
from scipy.stats import linregress
from matplotlib import pyplot as plt
from shapely.geometry import LineString, Point
from PIL import Image
import traceback
import cv2
import PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
    LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou
import fitz
from format_convert.wrapt_timeout_decorator import timeout
from otr.table_line_new import table_line_pdf
  34. @memory_decorator
  35. def pdf2Image(path, save_dir):
  36. log("into pdf2Image")
  37. try:
  38. try:
  39. doc = fitz.open(path)
  40. except Exception as e:
  41. log("pdf format error!")
  42. # print("pdf format error!", e)
  43. return [-3]
  44. # output_image_list = []
  45. output_image_dict = {}
  46. page_count = doc.page_count
  47. for page_no in range(page_count):
  48. # 限制pdf页数,只取前10页后10页
  49. if page_count > 20:
  50. if 10 <= page_no < page_count - 10:
  51. # log("pdf2Image: pdf pages count " + str(doc.page_count)
  52. # + ", only get 70 pages")
  53. continue
  54. try:
  55. page = doc.loadPage(page_no)
  56. output = save_dir + "_page" + str(page_no) + ".png"
  57. rotate = int(0)
  58. # 每个尺寸的缩放系数为1.3,这将为我们生成分辨率提高2.6的图像。
  59. # 此处若是不做设置,默认图片大小为:792X612, dpi=96
  60. # (1.33333333 --> 1056x816) (2 --> 1584x1224)
  61. # (1.183, 2.28 --> 1920x1080)
  62. zoom_x = 3.
  63. zoom_y = 3.
  64. # mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  65. mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  66. pix = page.getPixmap(matrix=mat, alpha=False)
  67. pix.writePNG(output)
  68. pdf_image = cv2.imread(output)
  69. print("pdf_image", page_no, pdf_image.shape)
  70. # output_image_list.append([page_no, output])
  71. output_image_dict[int(page_no)] = output
  72. except ValueError as e:
  73. traceback.print_exc()
  74. if str(e) == "page not in document":
  75. log("pdf2Image page not in document! continue..." + str(page_no))
  76. continue
  77. elif "encrypted" in str(e):
  78. log("pdf2Image document need password " + str(page_no))
  79. return [-7]
  80. except RuntimeError as e:
  81. if "cannot find page" in str(e):
  82. log("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
  83. continue
  84. else:
  85. traceback.print_exc()
  86. return [-3]
  87. return [output_image_dict]
  88. except Exception as e:
  89. log("pdf2Image error!")
  90. print("pdf2Image", traceback.print_exc())
  91. return [-1]
  92. @timeout(10, timeout_exception=TimeoutError)
  93. def pdf_analyze(interpreter, page, device, page_no):
  94. log("into pdf_analyze")
  95. pdf_time = time.time()
  96. # print("pdf_analyze interpreter process...")
  97. interpreter.process_page(page)
  98. # print("pdf_analyze device get_result...")
  99. layout = device.get_result()
  100. log("pdf2text page " + str(page_no) + " read time " + str(time.time() - pdf_time))
  101. return layout
@memory_decorator
def pdf2text(path, unique_type_dir):
    # Stub: the conversion logic lives in the PDFConvert class below.
    # Kept (returning None) so existing callers of this entry point don't break.
    return
  105. def get_single_pdf(path, page_no):
  106. log("into get_single_pdf")
  107. try:
  108. # print("path, ", path)
  109. pdf_origin = PdfFileReader(path, strict=False)
  110. pdf_new = PdfFileWriter()
  111. pdf_new.addPage(pdf_origin.getPage(page_no))
  112. path_new = path.split(".")[0] + "_split.pdf"
  113. with open(path_new, "wb") as ff:
  114. pdf_new.write(ff)
  115. return path_new
  116. except PyPDF2.utils.PdfReadError as e:
  117. raise e
  118. except Exception as e:
  119. log("get_single_pdf error! page " + str(page_no))
  120. traceback.print_exc()
  121. raise e
def page_table_connect(has_table_dict):
    """Detect tables that continue across consecutive pages and merge their html.

    :param has_table_dict: page_no -> page info list; by the indexing below the
        layout is assumed to be (TODO confirm against caller):
        [0] html text of the page, [1] per-table-area column counts,
        [2] per-table-area corner points, [4] page size (height first).
    :return: (table_connect_list, connect_text_list) where table_connect_list
        is a list of runs of connected page numbers and connect_text_list is
        [[merged_html, page_no_run], ...]; ([-1], [-1]) on failure,
        ([], []) for empty input.
    """
    log("into page_table_connect")
    if not has_table_dict:
        return [], []
    try:
        # find runs of consecutive pages whose tables join
        table_connect_list = []
        temp_list = []
        # distance threshold from the page top/bottom: 1/7 of the page height
        threshold = 7
        page_no_list = list(has_table_dict.keys())
        page_no_list.sort(key=lambda x: x)
        for i in range(1, len(page_no_list)):
            page_info = has_table_dict.get(page_no_list[i])
            last_page_info = has_table_dict.get(page_no_list[i - 1])
            # pages must be consecutive
            if page_no_list[i] - page_no_list[i - 1] == 1:
                # the last area of the previous page and the first area of this
                # page must both have falsy (0) column counts, and be equal
                if not last_page_info[1][-1] and not page_info[1][0] and \
                        last_page_info[1][-1] == page_info[1][0]:
                    # previous page's area must end near the page bottom and
                    # this page's area must start near the page top
                    if last_page_info[4][0] - last_page_info[2][-1][1][1] \
                            <= int(last_page_info[4][0] / threshold) \
                            and page_info[2][0][0][1] - 0 \
                            <= int(page_info[4][0] / threshold):
                        temp_list.append(page_no_list[i - 1])
                        temp_list.append(page_no_list[i])
                        continue
            # run broken: flush the collected connected page numbers
            if len(temp_list) > 1:
                temp_list = list(set(temp_list))
                temp_list.sort(key=lambda x: x)
                table_connect_list.append(temp_list)
                temp_list = []
        # flush the trailing run, if any
        if len(temp_list) > 1:
            temp_list = list(set(temp_list))
            temp_list.sort(key=lambda x: x)
            table_connect_list.append(temp_list)
            temp_list = []
        # merge the html of each connected run into one table
        connect_text_list = []
        for area in table_connect_list:
            first_page_no = area[0]
            area_page_text = str(has_table_dict.get(first_page_no)[0])
            for i in range(1, len(area)):
                current_page_no = area[i]
                current_page_text = str(has_table_dict.get(current_page_no)[0])
                # join the two tables: drop the opening <table> tag of the
                # current page ...
                table_prefix = re.finditer('<table border="1">', current_page_text)
                index_list = []
                for t in table_prefix:
                    index_list.append(t.span())
                delete_index = index_list[0]
                current_page_text = current_page_text[:delete_index[0]] \
                    + current_page_text[delete_index[1]:]
                # ... and the closing </table> tag of the accumulated text
                table_suffix = re.finditer('</table>', area_page_text)
                index_list = []
                for t in table_suffix:
                    index_list.append(t.span())
                delete_index = index_list[-1]
                area_page_text = area_page_text[:delete_index[0]] \
                    + area_page_text[delete_index[1]:]
                area_page_text = area_page_text + current_page_text
            connect_text_list.append([area_page_text, area])
        return table_connect_list, connect_text_list
    except Exception as e:
        log("page_table_connect error!")
        traceback.print_exc()
        return [-1], [-1]
  192. @timeout(30, timeout_exception=TimeoutError)
  193. def read_pdf(path, package_name, packages):
  194. log(package_name)
  195. laparams = LAParams(line_overlap=0.01,
  196. char_margin=0.3,
  197. line_margin=0.01,
  198. word_margin=0.01,
  199. boxes_flow=0.1, )
  200. if package_name == packages[0]:
  201. fp = open(path, 'rb')
  202. parser = PDFParser(fp)
  203. doc_pdfminer = PDFDocument(parser)
  204. rsrcmgr = PDFResourceManager()
  205. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  206. interpreter = PDFPageInterpreter(rsrcmgr, device)
  207. return doc_pdfminer, device, interpreter
  208. elif package_name == packages[1]:
  209. doc_pymupdf = fitz.open(path)
  210. return doc_pymupdf
  211. elif package_name == packages[2]:
  212. doc_pypdf2 = PdfFileReader(path, strict=False)
  213. doc_pypdf2_new = PdfFileWriter()
  214. return doc_pypdf2, doc_pypdf2_new
  215. elif package_name == packages[3]:
  216. fp = open(path, 'rb')
  217. lt = LineTable()
  218. doc_top = 0
  219. doc_pdfplumber = read_pdfplumber(fp, laparams)
  220. return lt, doc_top, doc_pdfplumber
  221. @timeout(25, timeout_exception=TimeoutError)
  222. def read_pdfminer(path, laparams):
  223. fp = open(path, 'rb')
  224. parser = PDFParser(fp)
  225. doc_pdfminer = PDFDocument(parser)
  226. rsrcmgr = PDFResourceManager()
  227. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  228. interpreter = PDFPageInterpreter(rsrcmgr, device)
  229. return doc_pdfminer, device, interpreter
@timeout(15, timeout_exception=TimeoutError)
def read_pymupdf(path):
    # Open the pdf with PyMuPDF; raises TimeoutError after 15s on hangs.
    return fitz.open(path)
  233. @timeout(15, timeout_exception=TimeoutError)
  234. def read_pypdf2(path):
  235. doc_pypdf2 = PdfFileReader(path, strict=False)
  236. doc_pypdf2_new = PdfFileWriter()
  237. return doc_pypdf2, doc_pypdf2_new
  238. @timeout(25, timeout_exception=TimeoutError, use_signals=False)
  239. def read_pdfplumber(path, laparams):
  240. fp = open(path, 'rb')
  241. lt = LineTable()
  242. doc_top = 0
  243. doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
  244. return lt, doc_top, doc_pdfplumber
  245. class PDFConvert:
  246. def __init__(self, path, unique_type_dir, need_page_no):
  247. self._doc = _Document(path)
  248. self.path = path
  249. self.unique_type_dir = unique_type_dir
  250. if not os.path.exists(self.unique_type_dir):
  251. os.mkdir(self.unique_type_dir)
  252. # 指定提取的页码范围
  253. self.need_page_no = need_page_no
  254. self.start_page_no = None
  255. self.end_page_no = None
  256. # 默认使用limit_page_cnt控制,前10页后10页
  257. if self.need_page_no is None:
  258. self.limit_page_cnt = 20
  259. else:
  260. # 使用start_page_no,end_page_no范围控制,例如2,5
  261. ss = self.need_page_no.split(',')
  262. if len(ss) != 2:
  263. self._doc.error_code = [-14]
  264. else:
  265. self.start_page_no = int(ss[0])
  266. self.end_page_no = int(ss[-1])
  267. if self.end_page_no == -1:
  268. self.end_page_no = 1000000
  269. self.start_page_no -= 1
  270. self.end_page_no -= 1
  271. if self.end_page_no <= self.start_page_no or self.start_page_no < 0 or self.end_page_no < -1:
  272. self._doc.error_code = [-14]
  273. self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
  274. self.has_init_pdf = [0] * len(self.packages)
  275. # 记录图片对象的md5,用于去除大量重复图片
  276. self.md5_image_obj_list = []
  277. @memory_decorator
  278. def init_package(self, package_name):
  279. # 各个包初始化
  280. try:
  281. laparams = LAParams(line_overlap=0.01,
  282. char_margin=0.3,
  283. line_margin=0.01,
  284. word_margin=0.01,
  285. boxes_flow=0.1, )
  286. if package_name == self.packages[0]:
  287. # fp = open(self.path, 'rb')
  288. # parser = PDFParser(fp)
  289. # self.doc_pdfminer = PDFDocument(parser)
  290. # rsrcmgr = PDFResourceManager()
  291. # self.laparams = LAParams(line_overlap=0.01,
  292. # char_margin=0.3,
  293. # line_margin=0.01,
  294. # word_margin=0.01,
  295. # boxes_flow=0.1,)
  296. # self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
  297. # self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
  298. self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
  299. self.has_init_pdf[0] = 1
  300. elif package_name == self.packages[1]:
  301. self.doc_pymupdf = read_pymupdf(self.path)
  302. self.has_init_pdf[1] = 1
  303. elif package_name == self.packages[2]:
  304. # self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
  305. # self.doc_pypdf2_new = PdfFileWriter()
  306. self.doc_pypdf2, self.doc_pypdf2_new = read_pypdf2(self.path)
  307. self.has_init_pdf[2] = 1
  308. elif package_name == self.packages[3]:
  309. # self.fp = open(self.path, 'rb')
  310. # self.lt = LineTable()
  311. # self.doc_top = 0
  312. # self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
  313. self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
  314. self.has_init_pdf[3] = 0
  315. else:
  316. log("Only Support Packages " + str(self.packages))
  317. raise Exception
  318. except Exception as e:
  319. log(package_name + " cannot open pdf!")
  320. traceback.print_exc()
  321. self._doc.error_code = [-3]
    def convert(self, limit_page_cnt=20):
        """Parse the pdf page by page into self._doc, then run cleanups.

        Falls back to image-based OCR (get_all_page_image) when pdfminer
        cannot read the document. Honors either the explicit
        start/end_page_no range or the limit_page_cnt front/back limit.

        :param limit_page_cnt: when no explicit range is set, keep only the
            first and last limit_page_cnt/2 pages
        """
        if self.has_init_pdf[0] == 0:
            self.init_package("pdfminer")
        if self._doc.error_code is not None:
            self._doc.error_code = None
            # pdfminer cannot open it: render pages to images for OCR instead
            self.get_all_page_image()
            return
        # probe whether pdfminer can actually iterate the pages
        try:
            pages = PDFPage.create_pages(self.doc_pdfminer)
            for page in pages:
                break
            pages = list(pages)
        except:
            # pdfminer cannot read blank-page objects; fall back to the
            # pymupdf-rendered images and OCR them
            log("pdf2text pdfminer read failed! read by pymupdf!")
            traceback.print_exc()
            try:
                self.get_all_page_image()
                return
            except:
                traceback.print_exc()
                log("pdf2text use pymupdf read failed!")
                self._doc.error_code = [-3]
                return
        # process every page (recreate the generator: the probe consumed it)
        pages = PDFPage.create_pages(self.doc_pdfminer)
        pages = list(pages)
        page_count = len(pages)
        page_no = 0
        for page in pages:
            # explicit page range requested
            if self.start_page_no is not None and self.end_page_no is not None:
                if page_count < self.end_page_no:
                    self.end_page_no = page_count
                if page_no < self.start_page_no or page_no >= self.end_page_no:
                    page_no += 1
                    continue
            # default limit: only the first and last limit_page_cnt/2 pages
            else:
                if page_count > limit_page_cnt and int(limit_page_cnt/2) <= page_no < page_count - int(limit_page_cnt/2):
                    page_no += 1
                    continue
            # parse a single page
            self._page = _Page(page, page_no)
            self.convert_page(page, page_no)
            if self._doc.error_code is None and self._page.error_code is not None:
                # -4/-3/0 are per-page recoverable errors: skip the page;
                # anything else aborts the whole document
                if self._page.error_code[0] in [-4, -3, 0]:
                    page_no += 1
                    continue
                else:
                    self._doc.error_code = self._page.error_code
                    break
            self._doc.add_child(self._page)
            page_no += 1
        self.delete_same_image()
        self.delete_header_footer()
        # self.delete_bold_text_duplicate()
  382. def delete_same_image(self, show=0):
  383. # 剔除大量重复图片
  384. md5_dict = {}
  385. for _md5, image_obj in self.md5_image_obj_list:
  386. if _md5 in md5_dict.keys():
  387. md5_dict[_md5] += [image_obj]
  388. else:
  389. md5_dict[_md5] = [image_obj]
  390. cnt_threshold = 10
  391. delete_obj_list = []
  392. for _md5 in md5_dict.keys():
  393. img_list = md5_dict.get(_md5)
  394. print('len(md5_dict.get(_md5))', _md5, len(img_list))
  395. if len(img_list) >= cnt_threshold:
  396. if show:
  397. img_np = bytes2np(img_list[0].content)
  398. cv2.namedWindow('delete same img_np', cv2.WINDOW_NORMAL)
  399. cv2.imshow('delete same img_np', img_np)
  400. cv2.waitKey(0)
  401. delete_obj_list += img_list
  402. for page in self._doc.children:
  403. for obj in delete_obj_list:
  404. if obj in page.children:
  405. page.children.remove(obj)
  406. if show:
  407. for page in self._doc.children:
  408. for obj in page.children:
  409. if isinstance(obj, _Image):
  410. img_np = bytes2np(obj.content)
  411. cv2.imshow('page img_np', img_np)
  412. cv2.waitKey(0)
  413. def delete_header_footer(self):
  414. sen_dict = {}
  415. for page in self._doc.children:
  416. for obj in page.children:
  417. if isinstance(obj, _Sentence):
  418. key = str(obj.content) + ' ' + str(int(obj.y))
  419. # print('key', key)
  420. if key in sen_dict.keys():
  421. sen_dict[key] += [obj]
  422. else:
  423. sen_dict[key] = [obj]
  424. for key in sen_dict.keys():
  425. l = sen_dict.get(key)
  426. if len(l) >= 2/3 * max(10, len(self._doc.children)):
  427. for page in self._doc.children:
  428. new_children = []
  429. for obj in page.children:
  430. if isinstance(obj, _Sentence):
  431. if obj not in l:
  432. new_children.append(obj)
  433. else:
  434. new_children.append(obj)
  435. page.children = new_children
  436. print('len(l)', len(l), len(self._doc.children))
  437. print('delete_header_footer l[0]', l[0].content, l[0].y)
  438. return
  439. def delete_bold_text_duplicate(self, lt_text_box_list):
  440. # 拿出所有LTChar
  441. lt_char_list = []
  442. for lt_text_box in lt_text_box_list:
  443. for lt_text_line in lt_text_box:
  444. for lt_char in lt_text_line:
  445. if isinstance(lt_char, LTChar):
  446. lt_char_list.append(lt_char)
  447. # 找出需剔除的
  448. lt_char_list.sort(key=lambda x: (int(x.bbox[1]), x.bbox[0]))
  449. delete_list = []
  450. for i in range(len(lt_char_list)-1):
  451. lt_char1 = lt_char_list[i]
  452. bbox1 = lt_char1.bbox
  453. lt_char2 = lt_char_list[i+1]
  454. bbox2 = lt_char2.bbox
  455. if lt_char1 in delete_list:
  456. continue
  457. if lt_char2 in delete_list:
  458. continue
  459. if lt_char1.get_text() == lt_char2.get_text() and bbox1[0] <= bbox2[0] <= bbox1[2] <= bbox2[2] \
  460. and int(bbox1[1]) == int(bbox2[1]) and int(bbox1[3]) == int(bbox2[3]) \
  461. and re.search('[\u4e00-\u9fff():、,。]', lt_char1.get_text()):
  462. delete_list.append(lt_char2)
  463. # 重新组装
  464. new_lt_text_box_list = []
  465. for lt_text_box in lt_text_box_list:
  466. new_lt_text_box = LTTextBoxHorizontal()
  467. for lt_text_line in lt_text_box:
  468. new_lt_text_line = LTTextLine(0.01)
  469. for lt_char in lt_text_line:
  470. if lt_char in delete_list:
  471. continue
  472. if isinstance(lt_char, LTChar):
  473. new_lt_text_line.add(lt_char)
  474. new_lt_text_box.add(new_lt_text_line)
  475. new_lt_text_box_list.append(new_lt_text_box)
  476. return new_lt_text_box_list
  477. def clean_text(self, _text):
  478. return re.sub("\s", "", _text)
  479. def get_text_lines(self, page, page_no):
  480. lt_line_list = []
  481. page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
  482. self.doc_top += page_plumber.height
  483. table_finder = TableFinder(page_plumber)
  484. all_width_zero = True
  485. for _edge in table_finder.get_edges():
  486. if _edge.get('linewidth') and _edge.get('linewidth') > 0:
  487. all_width_zero = False
  488. break
  489. for _edge in table_finder.get_edges():
  490. # print(_edge)
  491. if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
  492. lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
  493. (float(_edge["x1"]), float(_edge["y1"]))))
  494. log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
  495. return lt_line_list
  496. def get_page_lines(self, layout, page_no, show=0):
  497. def _plot(_line_list, title, mode=1):
  498. if not show:
  499. return
  500. for _line in _line_list:
  501. if mode == 1:
  502. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  503. elif mode == 2:
  504. x0, y0, x1, y1 = _line
  505. plt.plot([x0, x1], [y0, y1])
  506. plt.title(title)
  507. plt.show()
  508. return
  509. def is_cross(A, B, C, D):
  510. if A[0] == B[0] == C[0] == D[0]:
  511. if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
  512. or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
  513. return True
  514. if A[1] == B[1] == C[1] == D[1]:
  515. if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
  516. or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
  517. return True
  518. line1 = LineString([A, B])
  519. line2 = LineString([C, D])
  520. int_pt = line1.intersection(line2)
  521. try:
  522. point_of_intersection = int_pt.x, int_pt.y
  523. return True
  524. except:
  525. return False
  526. def calculate_k(bbox):
  527. x = [bbox[0], bbox[2]]
  528. y = [bbox[1], bbox[3]]
  529. slope, intercept, r_value, p_value, std_err = linregress(x, y)
  530. # print('k', slope)
  531. if math.isnan(slope):
  532. slope = 0
  533. return slope
  534. def line_iou(line1, line2, axis=0):
  535. if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
  536. return 1.0
  537. if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
  538. return 1.0
  539. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  540. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  541. union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
  542. if union in [0, 0.]:
  543. iou = 0.
  544. else:
  545. iou = inter / union
  546. return iou
  547. def get_cross_line(_line_list, threshold=1, cross_times=0):
  548. # 根据是否有交点判断表格线
  549. _cross_line_list = []
  550. for line1 in _line_list:
  551. if line1 in _cross_line_list:
  552. continue
  553. if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
  554. p1 = [max(0, line1[0] - threshold), line1[1]]
  555. p2 = [min(line1[2] + threshold, page_w), line1[3]]
  556. else:
  557. p1 = [line1[0], max(0, line1[1] - threshold)]
  558. p2 = [line1[2], min(line1[3] + threshold, page_h)]
  559. line1 = [p1[0], p1[1], p2[0], p2[1]]
  560. _times = 0
  561. for line2 in _line_list:
  562. if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
  563. p3 = [max(0, line2[0] - threshold), line2[1]]
  564. p4 = [min(line2[2] + threshold, page_w), line2[3]]
  565. else:
  566. p3 = [line2[0], max(0, line2[1] - threshold)]
  567. p4 = [line2[2], min(line2[3] + threshold, page_h)]
  568. line2 = [p3[0], p3[1], p4[0], p4[1]]
  569. if line1 == line2:
  570. continue
  571. if is_cross(p1, p2, p3, p4):
  572. _times += 1
  573. if _times >= cross_times:
  574. _cross_line_list += [line1]
  575. break
  576. return _cross_line_list
  577. def repair_bias_line(_line_list):
  578. temp_list = []
  579. for line in _line_list:
  580. x0, y0, x1, y1 = line
  581. _y = min(y0, y1)
  582. _x = min(x0, x1)
  583. if abs(x0 - x1) > abs(y0 - y1):
  584. temp_list.append([x0, _y, x1, _y])
  585. else:
  586. temp_list.append([_x, y0, _x, y1])
  587. _line_list = temp_list
  588. return _line_list
  589. def repair_col_line(_straight_list, _bias_list, threshold=2, min_width=7):
  590. if not _straight_list or not _bias_list:
  591. print('add_col_bias_line empty', len(_straight_list), len(_bias_list))
  592. return []
  593. # 分列
  594. _straight_list.sort(key=lambda x: (x[0], x[1]))
  595. cols = []
  596. col = []
  597. current_w = _straight_list[0][0]
  598. for line in _straight_list:
  599. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  600. continue
  601. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold:
  602. col.append(line)
  603. else:
  604. if col:
  605. cols.append(col)
  606. col = [line]
  607. current_w = line[0]
  608. if col:
  609. cols.append(col)
  610. # 补充col
  611. new_list = []
  612. for line in bias_line_list:
  613. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  614. continue
  615. for col in cols:
  616. w = col[0][0]
  617. if w - threshold <= line[0] <= w + threshold or w - threshold <= line[2] <= w + threshold:
  618. new_list.append([w, line[1] - 3, w, line[3] + 3])
  619. new_list += _straight_list
  620. # 去重
  621. new_list = [str(x) for x in new_list]
  622. new_list = list(set(new_list))
  623. new_list = [eval(x) for x in new_list]
  624. # 分列
  625. new_list.sort(key=lambda x: (x[0], x[1]))
  626. cols = []
  627. col = []
  628. current_w = new_list[0][0]
  629. for line in new_list:
  630. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  631. continue
  632. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold:
  633. col.append(line)
  634. else:
  635. if col:
  636. cols.append(col)
  637. col = [line]
  638. current_w = line[0]
  639. if col:
  640. cols.append(col)
  641. # 删除col
  642. for col1 in cols:
  643. for col2 in cols:
  644. if col1 == col2 or abs(col1[0][0] - col2[0][0]) > min_width:
  645. continue
  646. col1_len, col2_len = 0, 0
  647. for c in col1:
  648. col1_len += abs(c[1] - c[3])
  649. for c in col2:
  650. col2_len += abs(c[1] - c[3])
  651. if col1_len > col2_len * 3:
  652. for c in col2:
  653. if c in new_list:
  654. new_list.remove(c)
  655. if col2_len > col1_len * 3:
  656. for c in col1:
  657. if c in new_list:
  658. new_list.remove(c)
  659. return new_list
def merge_line(_line_list, threshold=2):
    """Merge fragmented segments into full table lines.

    Near-vertical segments are grouped into columns (same x within
    *threshold*) and consecutive members that touch or overlap in y
    (is_cross, or y-axis line_iou >= 0.1) are fused into one long
    segment; near-horizontal segments are grouped into rows and fused
    the same way along x.

    NOTE(review): when a new column starts, its first segment is appended
    by the `if not col` branch and then usually again by the cross/IoU
    branch right below (a segment trivially crosses itself), so a column
    may hold a duplicate of its first member; the fuse step renders this
    harmless. Left untouched to preserve behavior — confirm before changing.

    :param _line_list: segments [x0, y0, x1, y1]; sorted in place
    :param threshold: coordinate tolerance when grouping
    :return: list of merged segments
    """
    new_line_list = []
    # ---- group near-vertical segments into columns ----
    _line_list.sort(key=lambda x: (x[0], x[1]))
    cols = []
    col = []
    current_w = None
    for line in _line_list:
        # horizontal segments are handled in the row pass below
        if abs(line[0] - line[2]) > abs(line[1] - line[3]):
            continue
        if not col:
            col.append(line)
            current_w = line[0]
        # y-overlap between this segment and the column's first member
        _iou = line_iou([[0, line[1]], [0, line[3]]], [[0, col[0][1]], [0, col[0][3]]], axis=1)
        if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
                and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
            col.append(line)
        elif min(line[0], line[2]) - 2*threshold <= current_w <= max(line[0], line[2]) + 2*threshold \
                and _iou >= 0.1:
            # looser x tolerance when the segments clearly overlap in y
            col.append(line)
        else:
            if col:
                cols.append(col)
            col = [line]
            current_w = line[0]
    if col:
        cols.append(col)
    # ---- fuse consecutive members of each column ----
    for col in cols:
        temp_c = col[0]
        col_w = col[0][0]
        for i in range(len(col) - 1):
            c = col[i]
            next_c = col[i + 1]
            if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]) \
                    or line_iou([[0, c[1]], [0, c[3]]], [[0, next_c[1]], [0, next_c[3]]], axis=1) >= 0.1:
                # extend the accumulated segment to cover both members
                temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
                          max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
            else:
                new_line_list.append(temp_c)
                temp_c = next_c
        if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
            new_line_list.append(temp_c)
    # ---- group near-horizontal segments into rows ----
    _line_list.sort(key=lambda x: (x[1], x[0]))
    rows = []
    row = []
    current_h = None
    for line in _line_list:
        # vertical segments were handled above
        if abs(line[0] - line[2]) < abs(line[1] - line[3]):
            continue
        if not row:
            row = [line]
            current_h = line[1]
        if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
            row.append(line)
        else:
            if row:
                rows.append(row)
            row = [line]
            current_h = line[1]
    if row:
        rows.append(row)
    # ---- fuse consecutive members of each row ----
    for row in rows:
        temp_r = row[0]
        row_h = row[0][1]
        for i in range(len(row) - 1):
            r = row[i]
            next_r = row[i + 1]
            # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
            if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0) >= 0.1:
                temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
                          max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
            else:
                new_line_list.append(temp_r)
                temp_r = next_r
        if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
            new_line_list.append(temp_r)
    return new_line_list
def remove_outline_no_cross(_line_list):
    """Remove a nested outer frame: the left/right border columns are
    dropped when rows cross them only near the top and bottom while the
    middle section has rows that do not touch them.

    For the leftmost and rightmost vertical lines, count rows that cross
    them inside vs. outside the central 3/4 of the line's height. Both
    sides must show >= 2 extreme crossings, 0 central crossings and >= 2
    uncrossed central rows, with identical central counts, for the pair
    to be treated as an outline and removed.

    :param _line_list: segments [x0, y0, x1, y1]; modified in place
    :return: _line_list, possibly with the two outline columns removed
    """
    row_list = []
    col_list = []
    for line in _line_list:
        # collect all rows
        if abs(line[0] - line[2]) > abs(line[1] - line[3]):
            row_list.append(line)
        # collect all columns
        if abs(line[0] - line[2]) < abs(line[1] - line[3]):
            col_list.append(line)
    if not col_list:
        return _line_list
    # left and right border candidates
    col_list.sort(key=lambda x: (x[0], x[1]))
    left_col = col_list[0]
    right_col = col_list[-1]
    # count crossings at the extremes vs. inside the central area
    compare_list = []
    for col in [left_col, right_col]:
        # central area = the column's y span shrunk by 1/8 at each end
        # NOTE(review): assumes col[1] <= col[3] — confirm segment orientation.
        add_h = abs(col[1]-col[3]) / 8
        center_area = [col[1]+add_h, col[3]-add_h]
        cross_cnt = 0
        center_cross_cnt = 0
        center_row_cnt = 0
        for row in row_list:
            if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
                if center_area[0] <= row[1] <= center_area[1]:
                    center_cross_cnt += 1
                else:
                    cross_cnt += 1
            else:
                if center_area[0] <= row[1] <= center_area[1]:
                    center_row_cnt += 1
        compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
    # the outline condition must hold for BOTH sides
    _flag = True
    for c in compare_list:
        if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
            continue
        _flag = False
    print('compare_list', compare_list)  # debug output
    if _flag and compare_list[0][1] == compare_list[1][1] \
            and compare_list[0][2] == compare_list[1][2]:
        for col in [left_col, right_col]:
            if col in _line_list:
                _line_list.remove(col)
    return _line_list
  784. def cross_line_process(_cross_line_list, _bias_line_list):
  785. if show:
  786. print('cross_line_process len(_cross_line_list) -1', len(_cross_line_list))
  787. # 斜线校正
  788. if _cross_line_list:
  789. _cross_line_list = repair_bias_line(_cross_line_list)
  790. if show:
  791. print('cross_line_process len(_cross_line_list) 0', len(_cross_line_list))
  792. # 修复竖线
  793. if _bias_line_list:
  794. _cross_line_list = repair_col_line(_cross_line_list, _bias_line_list)
  795. if show:
  796. print('cross_line_process len(_cross_line_list) 1', len(_cross_line_list))
  797. # 根据是否有交点判断表格线
  798. _cross_line_list = get_cross_line(_cross_line_list, threshold=1, cross_times=1)
  799. if show:
  800. print('cross_line_process len(_cross_line_list) 2', len(_cross_line_list))
  801. # 合并线条
  802. if not _cross_line_list:
  803. return []
  804. _cross_line_list = merge_line(_cross_line_list)
  805. if show:
  806. print('cross_line_process len(_cross_line_list) 3', len(_cross_line_list))
  807. # 删除最外层嵌套边框
  808. _cross_line_list = remove_outline_no_cross(_cross_line_list)
  809. if show:
  810. print('cross_line_process len(_cross_line_list) 4', len(_cross_line_list))
  811. # 复用otr的部分后处理,补线
  812. _cross_line_list = table_line_pdf(_cross_line_list, page_w, page_h)
  813. if show:
  814. print('cross_line_process len(_cross_line_list) 5', len(_cross_line_list))
  815. return _cross_line_list
log('into get_page_lines')
page_h = layout.height
page_w = layout.width
element_list = []
line_list = []
bias_line_list = []
text_container_list = []
lt_rect_list = []
# first collect text containers and rectangles, tracking coordinate extremes
min_y = 10000
max_x, max_y = 0, 0
for element in layout:
    if isinstance(element, LTTextContainer):
        text_container_list.append(element)
    if isinstance(element, LTRect):
        lt_rect_list.append(element)
        if element.bbox[1] <= min_y:
            min_y = element.bbox[1]
        if element.bbox[3] <= min_y:
            min_y = element.bbox[3]
        if element.bbox[1] > max_y:
            max_y = element.bbox[1]
        if element.bbox[3] > max_y:
            max_y = element.bbox[3]
        if element.bbox[0] > max_x:
            max_x = element.bbox[0]
        if element.bbox[2] > max_x:
            max_x = element.bbox[2]
# widen the page when rects extend past the declared layout size
if max_y > page_h:
    page_h = max_y + 20
if max_x > page_w:
    page_w = max_x + 20
# some rects have negative y; clamp them to 10
if min_y < 0:
    for lt_rect in lt_rect_list:
        if lt_rect.y0 < 0 or lt_rect.y1 < 0:
            new_y0 = 10 if lt_rect.y0 < 0 else lt_rect.y0
            new_y1 = 10 if lt_rect.y1 < 0 else lt_rect.y1
            lt_rect.set_bbox((lt_rect.x0, new_y0, lt_rect.x1, new_y1))
_plot([x.bbox for x in lt_rect_list], 'get_page_lines start', mode=2)
# merge rectangles that almost coincide (IoU >= 0.9)
delete_lt_rect_list = []
for i in range(len(lt_rect_list)):
    lt_rect1 = lt_rect_list[i]
    b1 = lt_rect1.bbox
    if lt_rect1 in delete_lt_rect_list:
        continue
    for j in range(i+1, len(lt_rect_list)):
        lt_rect2 = lt_rect_list[j]
        b2 = lt_rect2.bbox
        if lt_rect2 in delete_lt_rect_list:
            continue
        if bbox_iou(b1, b2) >= 0.9:
            delete_lt_rect_list.append(lt_rect2)
# split rects into line-shaped (exactly one side <= 2pt) and area rects
line_rect_list = []
non_line_rect_list = []
for lt_rect in lt_rect_list:
    if (lt_rect.height <= 2) ^ (lt_rect.width <= 2):
        line_rect_list.append(lt_rect)
    if lt_rect.height > 2 and lt_rect.width > 2:
        non_line_rect_list.append(lt_rect)
# drop area rects whose centre sits close to a line-shaped rect
threshold = 5
for n_rect in non_line_rect_list:
    if n_rect in delete_lt_rect_list:
        continue
    middle_x = (n_rect.x0 + n_rect.x1) / 2
    middle_y = (n_rect.y0 + n_rect.y1) / 2
    for rect in line_rect_list:
        if rect in delete_lt_rect_list:
            continue
        if rect.height >= rect.width:
            if n_rect.width / 2 - threshold <= abs(rect.x0 - middle_x) <= n_rect.width / 2 + threshold:
                delete_lt_rect_list.append(n_rect)
        else:
            if n_rect.height / 2 - threshold <= abs(rect.y0 - middle_y) <= n_rect.height / 2 + threshold:
                delete_lt_rect_list.append(n_rect)
# find the smallest rect matching each text container (IoU >= 0.5)
text_lt_rect_list = []
for text_lt_rect in text_container_list:
    text_box = text_lt_rect.bbox
    min_area = 1000000
    min_lt_rect = None
    for lt_rect in lt_rect_list:
        if lt_rect in delete_lt_rect_list:
            continue
        if lt_rect in text_lt_rect_list:
            continue
        if lt_rect.height <= 5 or lt_rect.width <= 5:
            continue
        _bbox = lt_rect.bbox
        _area = abs(_bbox[2] - _bbox[0]) * abs(_bbox[3] - _bbox[1])
        _iou = bbox_iou(_bbox, text_box)
        if _iou >= 0.5 and _area < min_area:
            min_area = _area
            min_lt_rect = lt_rect
    # no match among big rects: retry among the small ones
    if min_lt_rect is None:
        for lt_rect in lt_rect_list:
            if lt_rect in delete_lt_rect_list:
                continue
            if lt_rect in text_lt_rect_list:
                continue
            if lt_rect.height <= 5 or lt_rect.width <= 5:
                _bbox = lt_rect.bbox
                _area = abs(_bbox[2] - _bbox[0]) * abs(_bbox[3] - _bbox[1])
                _iou = bbox_iou(_bbox, text_box)
                if _iou >= 0.5 and _area < min_area:
                    min_area = _area
                    min_lt_rect = lt_rect
    if min_lt_rect is not None:
        text_lt_rect_list.append(min_lt_rect)
# rects matched to text boxes must not contribute table lines
delete_lt_rect_list += text_lt_rect_list
# (a commented-out pass deleting mutually-contained rects previously lived here)
# extract table lines from the remaining rects / curves / lines
for element in layout:
    _b = element.bbox
    # only these element types contribute line segments
    if isinstance(element, LTRect):
        if element in delete_lt_rect_list:
            continue
        line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[0], _b[1], _b[2], _b[1]],
                      [_b[2], _b[1], _b[2], _b[3]], [_b[0], _b[3], _b[2], _b[3]]]
    elif isinstance(element, (LTCurve, LTLine)):
        if element.height > 10 or element.width > 10:
            # (slope-based bias-line filtering previously lived here, commented out)
            if element.height >= element.width:
                line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
            else:
                line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
if show:
    print('get_page_lines line_list', line_list)
    print('get_page_lines bias_line_list', bias_line_list)
    _plot(line_list+bias_line_list, 'line_list+bias_line_list', mode=2)
if not line_list and not bias_line_list:
    return []
# (generating tables from bias lines is disabled; the copy below keeps the
# original order for get_cross_line while line_list itself is de-duplicated)
line_list_copy = copy.deepcopy(line_list)
# de-duplicate
line_list = [str(x) for x in line_list]
line_list = list(set(line_list))
line_list = [eval(x) for x in line_list]
# keep only lines that cross another line
cross_line_list = get_cross_line(line_list_copy, threshold=2, cross_times=1)
if show:
    print('get_page_lines cross_line_list', cross_line_list)
if not cross_line_list:
    # NOTE(review): this retry repeats the identical call with identical
    # arguments, so it always returns the same (empty) result — the
    # comment suggests the lines were meant to be merged first. Confirm
    # intent before changing.
    cross_line_list = get_cross_line(line_list_copy, threshold=2, cross_times=1)
    if not cross_line_list:
        return []
_plot(line_list, 'get_cross_line', mode=2)
cross_line_list = cross_line_process(cross_line_list, bias_line_list)
# if processing emptied the list, rebuild from the raw lines and retry
if not cross_line_list:
    cross_line_list = get_cross_line(line_list_copy, threshold=2, cross_times=1)
    cross_line_list = cross_line_process(cross_line_list, bias_line_list)
if show:
    print('get_page_lines cross_line_list2', cross_line_list)
# show
if show:
    print('len(cross_line_list)', len(cross_line_list))
    _plot(cross_line_list, 'cross_line_process', mode=2)
# wrap the raw coordinates into pdfminer LTLine objects
lt_line_list = []
for line in cross_line_list:
    lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
                               (float(line[2]), float(line[3]))))
log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
return lt_line_list
  1014. def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
  1015. list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list, from_pdf=True)
  1016. self._page.in_table_objs = filter_objs
  1017. # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
  1018. for table in list_tables:
  1019. _table = _Table(table["table"], table["bbox"])
  1020. # self._page.children.append(_table)
  1021. self._page.add_child(_table)
  1022. list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
  1023. layout.bbox, page_no)
  1024. for sentence in list_sentences:
  1025. _sen = _Sentence(sentence.text, sentence.bbox)
  1026. self._page.add_child(_sen)
  1027. # pdf对象需反向排序
  1028. self._page.is_reverse = True
  1029. return list_tables
  1030. def is_text_legal(self, lt_text_list, page_no):
  1031. # 无法识别pdf字符编码,整页用ocr
  1032. text_temp = ""
  1033. for _t in lt_text_list:
  1034. text_temp += _t.get_text()
  1035. if re.search('[(]cid:[0-9]+[)]', text_temp):
  1036. log("text has cid! try pymupdf...")
  1037. page_image = self.get_page_image(page_no)
  1038. if judge_error_code(page_image):
  1039. self._page.error_code = page_image
  1040. else:
  1041. _image = _Image(page_image[1], page_image[0])
  1042. self._page.add_child(_image)
  1043. return False
  1044. match1 = re.findall(get_garble_code(), text_temp)
  1045. # match2 = re.search('[\u4e00-\u9fa5]', text_temp)
  1046. if len(match1) > 8 and len(text_temp) > 10:
  1047. log("pdf garbled code! try pymupdf... " + text_temp[:20])
  1048. page_image = self.get_page_image(page_no)
  1049. if judge_error_code(page_image):
  1050. self._page.error_code = page_image
  1051. else:
  1052. _image = _Image(page_image[1], page_image[0])
  1053. self._page.add_child(_image)
  1054. return False
  1055. return True
def judge_b_table(self, lt_text_list, table_list):
    """Heuristically decide whether the page likely holds a borderless table.

    Text boxes are grouped into rows by y position; a row looks
    table-like when it holds several boxes, or a single box whose text
    has a >=3-space gap flanked by Chinese runs. Three consecutive
    table-like rows (tolerating up to 2 misses) that lie outside every
    already-recognized bordered table trigger True.

    :param lt_text_list: pdfminer text boxes (must be non-empty; sorted in place)
    :param table_list: recognized table dicts, each with a 'bbox'
    :return: True when a borderless table is suspected
    """
    # y ranges of the bordered tables already found
    table_h_list = []
    for table in table_list:
        table_h_list.append([table.get('bbox')[1], table.get('bbox')[3]])
    # group text boxes into rows first
    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
    lt_text_row_list = []
    # NOTE(review): raises IndexError on an empty lt_text_list — callers
    # appear to guarantee at least one text box; confirm before reuse.
    current_h = lt_text_list[0].bbox[1]
    row = []
    threshold = 2
    for lt_text in lt_text_list:
        bbox = lt_text.bbox
        if current_h - threshold <= bbox[1] <= current_h + threshold:
            row.append(lt_text)
        else:
            if row:
                lt_text_row_list.append(row)
            row = [lt_text]
            current_h = lt_text.bbox[1]
    if row:
        lt_text_row_list.append(row)
    # look for wide gaps inside a box, or several boxes in one row
    is_b_table_flag = False
    is_b_table_cnt = 3   # consecutive table-like rows needed
    tolerate_cnt = 2     # misses tolerated without resetting the streak
    t_cnt = 0
    row_cnt = 0
    b_table_row_list = []
    for row in lt_text_row_list:
        # skip watermark rows (single one-character box)
        if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
            continue
        # skip table-of-contents rows (long dot leaders)
        continue_flag = False
        for r in row:
            if re.search('[.·]{7,}', r.get_text()):
                continue_flag = True
                break
        if continue_flag:
            continue
        if len(row) == 1:
            text = row[0].get_text()
            bbox = row[0].bbox
            match = re.search('[ ]{3,}', text)
            # single box counts when a wide gap separates two Chinese runs
            if match and re.search('[\u4e00-\u9fff]{2,}', text[:match.span()[0]]) \
                    and re.search('[\u4e00-\u9fff]{2,}', text[match.span()[1]:]):
                row_cnt += 1
                t_cnt = 0
                b_table_row_list += row
            else:
                # tolerate a few non-matching rows before resetting
                if t_cnt < tolerate_cnt:
                    t_cnt += 1
                    continue
                row_cnt = 0
                b_table_row_list = []
        else:
            row_cnt += 1
            t_cnt = 0
            b_table_row_list += row
        if row_cnt >= is_b_table_cnt:
            # check whether the streak overlaps a bordered table's y range
            # NOTE(review): the comparison assumes table_h[1] <= value <=
            # table_h[0], i.e. bbox[3] <= bbox[1] — verify the bbox
            # convention used by recognize_table before relying on this.
            in_flag = False
            for table_h in table_h_list:
                for b in b_table_row_list:
                    if table_h[1] <= b.bbox[1] <= table_h[0] or table_h[1] <= b.bbox[3] <= table_h[0]:
                        in_flag = True
                        break
                if in_flag:
                    break
            if in_flag:
                # inside a bordered table: reset and keep scanning
                is_b_table_flag = False
                t_cnt = 0
                row_cnt = 0
            else:
                is_b_table_flag = True
                break
    log('pdf is_b_table_flag ' + str(is_b_table_flag))
    return is_b_table_flag
def convert_page(self, page, page_no):
    """Convert one pdf page: classify layout objects, then either OCR the
    whole rendered page or extract images, lines, tables and sentences.

    :param page: pdfminer page object
    :param page_no: page index
    """
    layout = self.get_layout(page, page_no)
    if self._doc.error_code is not None:
        return
    if judge_error_code(layout):
        self._page.error_code = layout
        return
    # classify the page's layout objects and store them
    lt_text_list = []
    lt_image_list = []
    for x in layout:
        if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
            lt_text_list.append(x)
        if isinstance(x, LTFigure):
            for y in x:
                if isinstance(y, LTImage):
                    # ignore small images
                    if y.width <= 300 and y.height <= 300:
                        continue
                    # wider than the layout: very likely a watermark
                    if y.width > layout.width + 20:
                        continue
                    lt_image_list.append(y)
    # fix duplicated bold glyphs
    lt_text_list = self.delete_bold_text_duplicate(lt_text_list)
    # remove watermark glyphs
    lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
    log("convert_pdf page " + str(page_no))
    log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
    log('layout.width, layout.height ' + str(layout.width) + str(layout.height))
    # (a large commented-out "text-only fast path" previously lived here;
    # its steps — is_text_legal, judge_b_table, get_page_lines,
    # recognize_text — now always run in the else-branch below)
    # too many images, or no text at all: OCR the whole rendered page
    if len(lt_image_list) > 4 or len(lt_text_list) == 0:
        page_image = self.get_page_image(page_no)
        if judge_error_code(page_image):
            self._page.error_code = page_image
        else:
            _image = _Image(page_image[1], page_image[0])
            _image.is_from_pdf = True
            self._page.add_child(_image)
    # normal path: read the page's objects
    else:
        # image objects
        for image in lt_image_list:
            try:
                print("pdf2text LTImage size", page_no, image.width, image.height)
                image_stream = image.stream.get_data()
                # ignore small images
                if image.width <= 300 and image.height <= 300:
                    continue
                # very large embedded image: OCR the rendered page instead
                img_test = Image.open(io.BytesIO(image_stream))
                if image.height >= 1000 and image.width >= 1000:
                    page_image = self.get_page_image(page_no)
                    if judge_error_code(page_image):
                        self._page.error_code = page_image
                    else:
                        _image = _Image(page_image[1], page_image[0])
                        _image.is_from_pdf = True
                        self._page.add_child(_image)
                        image_md5 = get_md5_from_bytes(page_image[1])
                        self.md5_image_obj_list.append([image_md5, _image])
                    return
                # smaller image: save to disk and OCR it directly
                else:
                    temp_path = self.unique_type_dir + 'page' + str(page_no) \
                                + '_lt' + str(lt_image_list.index(image)) + '.jpg'
                    img_test.save(temp_path)
                    with open(temp_path, "rb") as ff:
                        image_stream = ff.read()
                    _image = _Image(image_stream, temp_path, image.bbox)
                    self._page.add_child(_image)
                    image_md5 = get_md5_from_bytes(image_stream)
                    self.md5_image_obj_list.append([image_md5, _image])
            except Exception:
                log("pdf2text pdfminer read image in page " + str(page_no) +
                    " fail! use pymupdf read image...")
                traceback.print_exc()
        # pdf objects need reverse ordering
        self._page.is_reverse = True
        self.init_package("pdfplumber")
        if not self.is_text_legal(lt_text_list, page_no):
            return
        try:
            lt_line_list = self.get_page_lines(layout, page_no)
        except:
            traceback.print_exc()
            lt_line_list = []
            self._page.error_code = [-13]
        table_list = self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
        # from the text layout, decide whether a borderless table may exist
        if self.judge_b_table(lt_text_list, table_list):
            page_image = self.get_page_image(page_no)
            if judge_error_code(page_image):
                self._page.error_code = page_image
            else:
                _image = _Image(page_image[1], page_image[0])
                _image.is_from_pdf = True
                _image.b_table_from_text = True
                _image.b_table_text_obj_list = lt_text_list
                _image.b_table_layout_size = (layout.width, layout.height)
                self._page.add_child(_image)
  1292. def get_layout(self, page, page_no):
  1293. log("get_layout")
  1294. if self.has_init_pdf[0] == 0:
  1295. self.init_package("pdfminer")
  1296. if self._doc.error_code is not None:
  1297. return
  1298. # 获取该页layout
  1299. start_time = time.time()
  1300. try:
  1301. if get_platform() == "Windows":
  1302. # origin_pdf_analyze = pdf_analyze.__wrapped__
  1303. # layout = origin_pdf_analyze(self.interpreter, page, self.device)
  1304. layout = pdf_analyze(self.interpreter, page, self.device, page_no)
  1305. else:
  1306. layout = pdf_analyze(self.interpreter, page, self.device, page_no)
  1307. except TimeoutError as e:
  1308. log("pdf2text pdfminer read pdf page " + str(page_no) + " time out! " + str(time.time() - start_time))
  1309. layout = [-4]
  1310. except Exception:
  1311. traceback.print_exc()
  1312. log("pdf2text pdfminer read pdf page " + str(page_no) + " error! continue...")
  1313. layout = [-3]
  1314. return layout
  1315. def get_page_image(self, page_no):
  1316. log("")
  1317. try:
  1318. if self.has_init_pdf[1] == 0:
  1319. self.init_package("PyMuPDF")
  1320. if self._doc.error_code is not None:
  1321. return
  1322. # save_dir = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
  1323. output = self.unique_type_dir + "page" + str(page_no) + ".png"
  1324. page = self.doc_pymupdf.loadPage(page_no)
  1325. rotate = int(0)
  1326. zoom_x = 2.
  1327. zoom_y = 2.
  1328. mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  1329. pix = page.getPixmap(matrix=mat, alpha=False)
  1330. pix.writePNG(output)
  1331. # 输出图片resize
  1332. self.resize_image(output)
  1333. with open(output, "rb") as f:
  1334. pdf_image = f.read()
  1335. return [output, pdf_image]
  1336. except ValueError as e:
  1337. traceback.print_exc()
  1338. if str(e) == "page not in document":
  1339. log("pdf2Image page not in document! continue... page " + str(page_no))
  1340. return [0]
  1341. elif "encrypted" in str(e):
  1342. log("pdf2Image document need password " + str(page_no))
  1343. return [-7]
  1344. except RuntimeError as e:
  1345. if "cannot find page" in str(e):
  1346. log("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
  1347. return [0]
  1348. else:
  1349. traceback.print_exc()
  1350. return [-3]
  1351. def get_all_page_image(self):
  1352. log("")
  1353. if self.has_init_pdf[1] == 0:
  1354. self.init_package("PyMuPDF")
  1355. if self._doc.error_code is not None:
  1356. return
  1357. page_count = self.doc_pymupdf.page_count
  1358. for page_no in range(page_count):
  1359. # 限制pdf页数,只取前10页后10页
  1360. if page_count > 20:
  1361. if 10 <= page_no < page_count - 10:
  1362. continue
  1363. self._page = _Page(None, page_no)
  1364. page_image = self.get_page_image(page_no)
  1365. if judge_error_code(page_image):
  1366. self._page.error_code = page_image
  1367. else:
  1368. _image = _Image(page_image[1], page_image[0])
  1369. self._page.add_child(_image)
  1370. # 报错继续读后面页面
  1371. if self._doc.error_code is None and self._page.error_code is not None:
  1372. continue
  1373. self._doc.add_child(self._page)
  1374. def connect_table(self, html_list):
  1375. if not html_list:
  1376. return html_list
  1377. # 判断初始条件1
  1378. # 0: 前一页最后一个表格为A,后一页第一个表格为B
  1379. # 1.1: A后无文本(除了页码),且B前无文本(除了页码)
  1380. # 1.2: B前有文字(可能是页眉,小于60字),且B的第一行前几个单元格为空,且第一行不为空的单元格有文字较多的格子
  1381. # 1.3: B前有文字(可能是页眉,小于60字),且B的第一行第一个单元格为空,且有文字的格子数量占所有格子的一半
  1382. # 1.4: B前有文字(可能是页眉,小于60字),且B的第一行第一个单元格为纯数字序号
  1383. # 1.5: A后有文字(除了页码还有页眉),且A的后面只有一行且中文不超过15个字
  1384. connect_flag_list = []
  1385. soup_list = []
  1386. for i, h in enumerate(html_list):
  1387. soup = BeautifulSoup(h, 'lxml')
  1388. soup_list.append(soup)
  1389. # 找最后一个表格
  1390. last_table_start, last_table_end = None, None
  1391. match = re.finditer('<table', h)
  1392. for m in match:
  1393. last_table_start = m.span()[0]
  1394. if last_table_start is not None:
  1395. match = re.finditer('</table>', h[last_table_start:])
  1396. for m in match:
  1397. last_table_end = m.span()[1] + last_table_start
  1398. # 最后一个表格后有无除了页码外的内容
  1399. connect_flag1 = False
  1400. if last_table_end is not None:
  1401. match = re.findall('[^-/第页0-9,,]', re.sub('<div>|</div>', '', h[last_table_end:]))
  1402. # print('match', match.group())
  1403. # if not match or match.group() == '':
  1404. if len(match) == 0:
  1405. connect_flag1 = True
  1406. # 有页脚
  1407. if not connect_flag1:
  1408. if len(re.findall('<div>', h[last_table_end:])) <= 1 \
  1409. and len(re.findall('[\u4e00-\u9fff]', h[last_table_end:])) <= 60:
  1410. connect_flag1 = True
  1411. # 找第一个表格
  1412. first_table_start, first_table_end = None, None
  1413. match = re.finditer('<table', h)
  1414. for m in match:
  1415. first_table_start = m.span()[0]
  1416. break
  1417. # 第一个表格前有无内容
  1418. connect_flag2 = False
  1419. if first_table_start is not None and first_table_start == 0:
  1420. connect_flag2 = True
  1421. # 有内容但是是页眉
  1422. if not connect_flag2:
  1423. tables = soup.findAll('table')
  1424. if tables:
  1425. first_table = tables[0]
  1426. rows = first_table.findAll('tr')
  1427. if rows:
  1428. first_row = rows[0]
  1429. col_text_len_list = [len(x.text) for x in first_row]
  1430. col_text_list = [x.text for x in first_row]
  1431. # 文字大于60且第一个为空
  1432. if not connect_flag2 and len(h[:first_table_start]) <= 60 and col_text_len_list[0] == 0 and max(col_text_len_list) >= 30:
  1433. connect_flag2 = True
  1434. # 有文字格子数占一半一下且第一个格子为空
  1435. if not connect_flag2 and col_text_len_list.count(0) >= len(col_text_len_list) / 2 and col_text_len_list[0] == 0:
  1436. connect_flag2 = True
  1437. # 表格前最多只有一行且第一个格子为纯数字
  1438. if not connect_flag2 and len(col_text_list) > 0 and \
  1439. len(re.findall('<div>', h[:first_table_start])) <= 0 and \
  1440. len(re.findall('\d', col_text_list[0])) == len(col_text_list[0]):
  1441. connect_flag2 = True
  1442. # if not connect_flag2 and len(re.findall('<div>', h[:first_table_start])) <= 0 and len(re.findall('[\u4e00-\u9fff]', h[:first_table_start])) <= 25:
  1443. # connect_flag2 = True
  1444. connect_flag_list.append([i, connect_flag2, connect_flag1])
  1445. print('connect_flag_list', connect_flag_list)
  1446. # 根据条件1合并需连接页码,形成组
  1447. connect_pages_list = []
  1448. if connect_flag_list:
  1449. temp_list = [connect_flag_list[0]]
  1450. for i in range(1, len(connect_flag_list)):
  1451. c = connect_flag_list[i]
  1452. if c[1] and temp_list[-1][2]:
  1453. temp_list.append(c)
  1454. else:
  1455. if temp_list:
  1456. connect_pages_list.append(temp_list)
  1457. temp_list = [c]
  1458. # connect_pages_list.append([c])
  1459. if temp_list:
  1460. connect_pages_list.append(temp_list)
  1461. print('connect_pages_list', connect_pages_list)
  1462. # 判断后续条件:判断组内列数是否相同
  1463. connect_pages_list2 = []
  1464. for c_list in connect_pages_list:
  1465. if len(c_list) == 1:
  1466. connect_pages_list2.append(c_list)
  1467. else:
  1468. col_cnt_list = []
  1469. # 单元格可能被复制了,相同的合并当做一列
  1470. merge_col_cnt_list = []
  1471. for c in c_list:
  1472. soup = soup_list[c[0]]
  1473. table1 = soup.findAll('table')[-1]
  1474. table2 = soup.findAll('table')[0]
  1475. tr1 = table1.findAll('tr')
  1476. tr2 = table2.findAll('tr')
  1477. td1 = tr1[-1].findAll('td')
  1478. td2 = tr2[0].findAll('td')
  1479. col_cnt_list.append([len(td2), len(td1)])
  1480. # # 计算合并重复文本格子后的列数
  1481. # last_text = td1[0].text
  1482. # merge_td1 = [last_text]
  1483. # for td in td1:
  1484. # if td.text == last_text:
  1485. # continue
  1486. # else:
  1487. # merge_td1.append(td.text)
  1488. # last_text = td.text
  1489. # last_text = td2[0].text
  1490. # merge_td2 = [last_text]
  1491. # for td in td2:
  1492. # if td.text == last_text:
  1493. # continue
  1494. # else:
  1495. # merge_td2.append(td.text)
  1496. # last_text = td.text
  1497. # merge_col_cnt_list.append([len(merge_td2), len(merge_td1)])
  1498. # 判断
  1499. new_c_list = [c_list[0]]
  1500. # print('col_cnt_list', col_cnt_list)
  1501. for i in range(len(col_cnt_list) - 1):
  1502. if col_cnt_list[i][1] != col_cnt_list[i + 1][0]:
  1503. # and merge_col_cnt_list[i][1] != merge_col_cnt_list[i + 1][0]:
  1504. connect_pages_list2.append(new_c_list)
  1505. new_c_list = [c_list[i + 1]]
  1506. else:
  1507. new_c_list.append(c_list[i + 1])
  1508. if new_c_list:
  1509. connect_pages_list2.append(new_c_list)
  1510. print('connect_pages_list2', connect_pages_list2)
  1511. # 符合连接条件的拼接表格
  1512. new_html_list = []
  1513. for c_list in connect_pages_list2:
  1514. if len(c_list) == 1:
  1515. new_html_list.append(html_list[c_list[0][0]])
  1516. continue
  1517. new_html = ''
  1518. for c in c_list:
  1519. match = re.finditer('</table>', new_html)
  1520. last_table_index = None
  1521. for m in match:
  1522. last_table_index = m.span()[0]
  1523. new_html += html_list[c[0]]
  1524. # print('html_list[c[0]]', html_list[c[0]])
  1525. if last_table_index is None:
  1526. continue
  1527. match = re.finditer('<table border="1">', new_html[last_table_index:])
  1528. first_table_index = None
  1529. for m in match:
  1530. first_table_index = last_table_index + m.span()[1]
  1531. break
  1532. if first_table_index is None:
  1533. continue
  1534. # print('re', re.findall('</table>.*?<table border="1">', new_html[last_table_index:first_table_index]))
  1535. # 非贪婪匹配
  1536. new_html_sub = re.sub('</table>.*?<table border="1">',
  1537. '<tr><td>#@#@#</td></tr>',
  1538. new_html[last_table_index:first_table_index])
  1539. new_html = new_html[:last_table_index] + new_html_sub + new_html[first_table_index:]
  1540. # print('new_html', new_html)
  1541. # new_html = new_html[:-5]
  1542. # ([-/第页0-9]|<div>|</div>)*
  1543. # 非贪婪匹配
  1544. # match = re.finditer('</table>.*?<table border="1">', new_html)
  1545. # for m in match:
  1546. # if '#@#@#' in m.group():
  1547. #
  1548. # new_html = re.sub('</table>.*#@#@#.*?<table border="1">',
  1549. # '<tr><td>#@#@#</td></tr>',
  1550. # new_html)
  1551. # print('new_html', new_html)
  1552. soup = BeautifulSoup(new_html, 'lxml')
  1553. trs = soup.findAll('tr')
  1554. decompose_trs = []
  1555. for i in range(len(trs)):
  1556. if trs[i].get_text() == '#@#@#':
  1557. td1 = trs[i - 1].findAll('td')
  1558. td2 = trs[i + 1].findAll('td')
  1559. if td2[0].get_text() == '':
  1560. # 解决连续多页是一行表格,该行会被去掉问题
  1561. find_father = False
  1562. for father, son in decompose_trs:
  1563. # print('son', son)
  1564. # print('td1', trs[i - 1])
  1565. if father != '' and son == trs[i - 1]:
  1566. td_father = father.findAll('td')
  1567. for j in range(len(td_father)):
  1568. # print('td_father[j].string3', td_father[j].string)
  1569. td_father[j].string = td_father[j].get_text() + td2[j].get_text()
  1570. # print('td_father[j].string4', td_father[j].string)
  1571. find_father = True
  1572. decompose_trs.append([father, trs[i + 1]])
  1573. break
  1574. if not find_father:
  1575. for j in range(len(td1)):
  1576. # print('td1[j].string1', td1[j].string)
  1577. td1[j].string = td1[j].get_text() + td2[j].get_text()
  1578. # print('td1[j].string2', td1[j].string)
  1579. decompose_trs.append([trs[i - 1], trs[i + 1]])
  1580. # print('trs[i + 1]', trs[i + 1])
  1581. # trs[i + 1].decompose()
  1582. # print('trs[i-1]', trs[i-1])
  1583. # trs[i].decompose()
  1584. decompose_trs.append(['', trs[i]])
  1585. # print('decompose_trs', decompose_trs)
  1586. # for father, son in decompose_trs:
  1587. # print('father', father)
  1588. # print('son', son)
  1589. # print('len(decompose_trs)', len(decompose_trs))
  1590. for father, son in decompose_trs:
  1591. for tr in trs:
  1592. if tr == son:
  1593. tr.decompose()
  1594. break
  1595. new_html = str(soup)
  1596. new_html_list.append(new_html)
  1597. html_str = ''
  1598. for h in new_html_list:
  1599. html_str += h
  1600. return [html_str]
  1601. def get_html(self):
  1602. if self._doc.error_code is not None:
  1603. return self._doc.error_code
  1604. self.convert()
  1605. if self._doc.error_code is not None:
  1606. return self._doc.error_code
  1607. html = self._doc.get_html(return_list=True)
  1608. # 表格连接
  1609. try:
  1610. html = self.connect_table(html)
  1611. except:
  1612. traceback.print_exc()
  1613. return [-12]
  1614. return html
  1615. def delete_water_mark(self, lt_text_list, page_bbox, times=5):
  1616. # 删除过多重复字句,为水印
  1617. duplicate_dict = {}
  1618. for _obj in lt_text_list:
  1619. t = _obj.get_text()
  1620. if t in duplicate_dict.keys():
  1621. duplicate_dict[t][0] += 1
  1622. duplicate_dict[t][1].append(_obj)
  1623. else:
  1624. duplicate_dict[t] = [1, [_obj]]
  1625. delete_text = []
  1626. for t in duplicate_dict.keys():
  1627. if duplicate_dict[t][0] >= times:
  1628. obj_list = duplicate_dict[t][1]
  1629. obj_list.sort(key=lambda x: x.bbox[3])
  1630. obj_distance_h = abs(obj_list[-1].bbox[3] - obj_list[0].bbox[1])
  1631. obj_list.sort(key=lambda x: x.bbox[2])
  1632. obj_distance_w = abs(obj_list[-1].bbox[2] - obj_list[0].bbox[0])
  1633. if obj_distance_h >= abs(page_bbox[1] - page_bbox[3]) * 0.7 \
  1634. and obj_distance_w >= abs(page_bbox[0] - page_bbox[2]) * 0.7:
  1635. delete_text.append(t)
  1636. temp_text_list = []
  1637. for _obj in lt_text_list:
  1638. t = _obj.get_text()
  1639. if t not in delete_text:
  1640. temp_text_list.append(_obj)
  1641. return temp_text_list
  1642. def resize_image(self, img_path, max_size=2000):
  1643. _img = cv2.imread(img_path)
  1644. if _img.shape[0] <= max_size or _img.shape[1] <= max_size:
  1645. return
  1646. else:
  1647. resize_axis = 0 if _img.shape[0] >= _img.shape[1] else 1
  1648. ratio = max_size / _img.shape[resize_axis]
  1649. new_shape = [0, 0]
  1650. new_shape[resize_axis] = max_size
  1651. new_shape[1 - resize_axis] = int(_img.shape[1 - resize_axis] * ratio)
  1652. _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
  1653. cv2.imwrite(img_path, _img)
  1654. def get_single_pdf(self, path, page_no):
  1655. log("into get_single_pdf")
  1656. try:
  1657. pdf_origin = copy.deepcopy(self.doc_pypdf2)
  1658. pdf_new = copy.deepcopy(self.doc_pypdf2_new)
  1659. pdf_new.addPage(pdf_origin.getPage(page_no))
  1660. path_new = path.split(".")[0] + "_split.pdf"
  1661. with open(path_new, "wb") as ff:
  1662. pdf_new.write(ff)
  1663. return path_new
  1664. except PyPDF2.utils.PdfReadError as e:
  1665. return [-3]
  1666. except Exception as e:
  1667. log("get_single_pdf error! page " + str(page_no))
  1668. return [-3]
def get_text_font():
    """Debug helper: dump per-span font info for the first page of a fixed
    local pdf and detect its underlined text lines (uses PyMuPDF/fitz)."""

    def flags_decomposer(flags):
        """Make font flags human readable."""
        l = []
        if flags & 2 ** 0:
            l.append("superscript")
        if flags & 2 ** 1:
            l.append("italic")
        if flags & 2 ** 2:
            l.append("serifed")
        else:
            l.append("sans")
        if flags & 2 ** 3:
            l.append("monospaced")
        else:
            l.append("proportional")
        if flags & 2 ** 4:
            l.append("bold")
        return ", ".join(l)

    def get_underlined_textLines(page):
        """
        Collect all underlined text on one pdf page.
        :param page: one fitz page
        :return: list of tuples, one per complete underlined span:
                 [(underlined text, block_no, line_no), ...]
        """
        paths = page.get_drawings()  # get drawings on the current page
        # Collect every very flat drawing on the page: underlines are mostly
        # thin rectangles or true horizontal line segments.
        # subselect things we may regard as lines
        lines = []
        for p in paths:
            for item in p["items"]:
                if item[0] == "l":  # an actual line
                    p1, p2 = item[1:]
                    if p1.y == p2.y:
                        lines.append((p1, p2))
                elif item[0] == "re":  # a rectangle: check if height is small
                    r = item[1]
                    if r.width > r.height and r.height <= 2:
                        lines.append((r.tl, r.tr))  # take top left / right points

        # Compute this page's `max_lineheight`, used as distance tolerance below.
        blocks = page.get_text("dict", flags=11)["blocks"]
        max_lineheight = 0
        for b in blocks:
            for l in b["lines"]:
                bbox = fitz.Rect(l["bbox"])
                if bbox.height > max_lineheight:
                    max_lineheight = bbox.height

        underlined_res = []
        # Now match words against the collected underline segments.
        # make a list of words
        words = page.get_text("words")
        # if underlined, the bottom left / right of a word
        # should not be too far away from left / right end of some line:
        for wdx, w in enumerate(words):  # w[4] is the actual word string
            r = fitz.Rect(w[:4])  # first 4 items are the word bbox
            for p1, p2 in lines:  # check distances for start / end points
                if abs(r.bl - p1) <= max_lineheight:  # word's bottom-left matches the underline's left end
                    if abs(r.br - p2) <= max_lineheight:  # word's bottom-right matches the right end (single word, no spaces)
                        print(f"Word '{w[4]}' is underlined! Its block-line number is {w[-3], w[-2]}")
                        underlined_res.append((w[4], w[-3], w[-2]))  # (underlined word, block_no, line_no)
                        break  # don't check more lines
                    else:  # scan same-line words to the right: one underline may cover several space-separated words
                        curr_line_num = w[-2]  # line number
                        for right_wdx in range(wdx + 1, len(words), 1):
                            _next_w = words[right_wdx]
                            if _next_w[-2] != curr_line_num:  # candidate word is no longer on the current line (crossing lines is not allowed)
                                break
                            _r_right = fitz.Rect(_next_w[:4])  # bbox corners of the same-line word to the right
                            if abs(_r_right.br - p2) <= max_lineheight:  # its bottom-right must be within max_lineheight of p2 (the underline's right end)
                                print(
                                    f"Word '{' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]])}' is underlined! " +
                                    f"Its block-line number is {w[-3], w[-2]}")
                                underlined_res.append(
                                    (' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]]),
                                     w[-3], w[-2])
                                )  # (underlined words, block_no, line_no)
                                break  # don't check more lines
        return underlined_res

    # Hard-coded local test file for manual debugging.
    _p = r'C:\Users\Administrator\Desktop\test_pdf\error2-2.pdf'
    doc_pymupdf = read_pymupdf(_p)
    page = doc_pymupdf[0]
    blocks = page.get_text("dict", flags=11)["blocks"]
    for b in blocks:  # iterate through the text blocks
        for l in b["lines"]:  # iterate through the text lines
            for s in l["spans"]:  # iterate through the text spans
                print("")
                font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                    s["font"],  # font name
                    flags_decomposer(s["flags"]),  # readable font flags
                    s["size"],  # font size
                    s["color"],  # font color
                )
                print(s)
                print("Text: '%s'" % s["text"])  # simple print of text
                print(font_properties)
    get_underlined_textLines(page)
# 以下为现成pdf单页解析接口 (the following is a ready-made single-page pdf parsing interface)
  1766. class ParseSentence:
  1767. def __init__(self, bbox, fontname, fontsize, _text, _title, title_text, _pattern, title_degree, is_outline,
  1768. outline_location, page_no):
  1769. (x0, y0, x1, y1) = bbox
  1770. self.x0 = x0
  1771. self.y0 = y0
  1772. self.x1 = x1
  1773. self.y1 = y1
  1774. self.bbox = bbox
  1775. self.fontname = fontname
  1776. self.fontsize = fontsize
  1777. self.text = _text
  1778. self.title = _title
  1779. self.title_text = title_text
  1780. self.groups = _pattern
  1781. self.title_degree = title_degree
  1782. self.is_outline = is_outline
  1783. self.outline_location = outline_location
  1784. self.page_no = page_no
  1785. def __repr__(self):
  1786. return "%s,%s,%s,%d,%s" % (self.text, self.title, self.is_outline, self.outline_location, str(self.bbox))
class ParseUtils:
    """Static helpers for sentence and title recognition on pdfminer layout objects."""

    @staticmethod
    def getFontinfo(_page):
        """Copy the font name/size of the first LTChar found inside each
        horizontal/vertical textbox onto the textbox itself
        (as .fontname / .fontsize attributes)."""
        for _obj in _page._objs:
            if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                for textline in _obj._objs:
                    done = False
                    for lchar in textline._objs:
                        if isinstance(lchar, (LTChar)):
                            _obj.fontname = lchar.fontname
                            _obj.fontsize = lchar.size
                            done = True
                            break
                    if done:
                        break

    @staticmethod
    def recognize_sentences(list_textbox, filter_objs, page_bbox, page_no,
                            remove_space=True, sourceP_LB=True):
        """Cluster textboxes into visual lines and build ParseSentence objects.

        :param list_textbox: pdfminer textboxes of one page (sorted in place)
        :param filter_objs: textboxes to skip (e.g. detected watermarks)
        :param page_bbox: page bounding box, for centering / page-number checks
        :param page_no: page number stored on each resulting sentence
        :param remove_space: NOTE(review): currently unused — confirm intent
        :param sourceP_LB: True when the coordinate origin is bottom-left (pdfminer)
        :return: list of ParseSentence
        """
        list_textbox.sort(key=lambda x: x.bbox[0])
        list_textbox.sort(key=lambda x: x.bbox[3], reverse=sourceP_LB)
        # Group textboxes whose y origins are within 5 units into one line.
        cluster_textbox = []
        for _textbox in list_textbox:
            if _textbox in filter_objs:
                continue
            _find = False
            for _ct in cluster_textbox:
                if abs(_ct["y"] - _textbox.bbox[1]) < 5:
                    _find = True
                    _ct["textbox"].append(_textbox)
            if not _find:
                cluster_textbox.append({"y": _textbox.bbox[1], "textbox": [_textbox]})
        cluster_textbox.sort(key=lambda x: x["y"], reverse=sourceP_LB)
        list_sentences = []
        for _line in cluster_textbox:
            _textboxs = _line["textbox"]
            _textboxs.sort(key=lambda x: x.bbox[0])
            _linetext = _textboxs[0].get_text()
            for _i in range(1, len(_textboxs)):
                # A horizontal gap > 60 units marks a column/tab break; insert
                # the "=,=" placeholder unless punctuation already ends the text.
                if abs(_textboxs[_i].bbox[0] - _textboxs[_i - 1].bbox[2]) > 60:
                    if _linetext and _linetext[-1] not in (",", ",", "。", ".", "、", ";"):
                        _linetext += "=,="
                _linetext += _textboxs[_i].get_text()
            _linetext = re.sub("[\s\r\n]", "", _linetext)
            _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
                     _textboxs[-1].bbox[2], _textboxs[-1].bbox[3])
            _title = None
            _pattern_groups = None
            title_text = ""
            # Try title patterns on the first textbox, then on the whole line,
            # then fall back to the "centered text" heuristic.
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_linetext)
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _title = ParseUtils.rec_incenter(_bbox, page_bbox)
            title_degree = 2
            if not _title:
                _linetext = _linetext.replace("=,=", ",")
            else:
                _linetext = _linetext.replace("=,=", "")
                # Title group names look like "title_<degree>[...]".
                title_degree = int(_title.split("_")[1])
            # 页码 — skip page-number lines (centered, purely numeric).
            if ParseUtils.rec_incenter(_bbox, page_bbox) and re.search("^\d+$", _linetext) is not None:
                continue
            if _linetext == "" or re.search("^,+$", _linetext) is not None:
                continue
            # Outline (TOC) entries look like "chapter ......... 12".
            is_outline = False
            outline_location = -1
            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$", _linetext)
            if _search is not None:
                is_outline = True
                _linetext = _search.group("text")
                outline_location = int(_search.group("nums"))
            list_sentences.append(
                ParseSentence(_bbox, _textboxs[-1].__dict__.get("fontname"), _textboxs[-1].__dict__.get("fontsize"),
                              _linetext, _title, title_text, _pattern_groups, title_degree, is_outline,
                              outline_location, page_no))
        # for _sen in list_sentences:
        #     print(_sen.__dict__)
        return list_sentences

    @staticmethod
    def find_title_by_pattern(_text,
                              _pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
                                       "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
                                       "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
                                       "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\..、\s\-]))|" \
                                       "(?P<title_15>^(?P<title_15_index_0_0>(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
                                       "(?P<title_17>^(?P<title_17_index_0_0>(?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>)))|"
                                       "(?P<title_19>^(?P<title_19_index_0_0>(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))|" \
                              ):
        """Match _text against the numbered / Chinese-numeral title patterns.

        :return: the matched (group_name, group_value) pairs sorted by group
                 name, or None when nothing matched.
        NOTE(review): the '(?)' fragments in title_15/17/19 are not valid
        regex syntax and the pattern ends with a trailing '|'; re.search on
        this default pattern may raise re.error — confirm whether these
        alternatives were ever exercised by callers.
        """
        _se = re.search(_pattern, _text)
        groups = []
        if _se is not None:
            _gd = _se.groupdict()
            for k, v in _gd.items():
                if v is not None:
                    groups.append((k, v))
        if len(groups):
            groups.sort(key=lambda x: x[0])
            return groups
        return None

    @staticmethod
    def rec_incenter(o_bbox, p_bbox):
        """Return "title_2" when o_bbox is horizontally centered inside p_bbox
        (left/right margins within 10% of each other, left margin > 20% of the
        page width); returns None implicitly otherwise."""
        p_width = p_bbox[2] - p_bbox[0]
        l_space = (o_bbox[0] - p_bbox[0]) / p_width
        r_space = (p_bbox[2] - o_bbox[2]) / p_width
        if abs((l_space - r_space)) < 0.1 and l_space > 0.2:
            return "title_2"

    @staticmethod
    def is_first_title(_title):
        """Return True when _title is the first item of its numbering scheme:
        "1", "一", "a", "A" or "Ⅰ"; False otherwise (including None)."""
        if _title is None:
            return False
        if re.search("^\d+$", _title) is not None:
            if int(_title) == 1:
                return True
            return False
        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
            if _title == "一":
                return True
            return False
        if re.search("^[a-z]+$", _title) is not None:
            if _title == "a":
                return True
            return False
        if re.search("^[A-Z]+$", _title) is not None:
            if _title == "A":
                return True
            return False
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
            if _title == "Ⅰ":
                return True
            return False
        return False

    @staticmethod
    def get_next_title(_title):
        """Return the successor of _title within its numbering scheme
        (arabic digits, Chinese numerals, a-z, A-Z or Roman numerals Ⅰ-Ⅻ),
        or None when the scheme is unrecognized or exhausted."""
        if re.search("^\d+$", _title) is not None:
            return str(int(_title) + 1)
        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
            # Strip positional markers (十/百), increment the bare digits,
            # then re-insert the markers by digit count.
            _next_title = ParseUtils.make_increase(['一', '二', '三', '四', '五', '六', '七', '八', '九', '十'],
                                                   re.sub("[十百]", '', _title))
            _next_title = list(_next_title)
            _next_title.reverse()
            if _next_title[-1] != "十":
                if len(_next_title) >= 2:
                    _next_title.insert(-1, '十')
                if len(_next_title) >= 4:
                    _next_title.insert(-3, '百')
            if _title[0] == "十":
                # NOTE(review): _next_title is a list here, so comparing it to
                # the string "十" can never be True — probably meant ["十"];
                # as written e.g. "十九" advances to "十十". Confirm.
                if _next_title == "十":
                    _next_title = ["二", "十"]
                _next_title.insert(0, "十")
            _next_title = "".join(_next_title)
            return _next_title
        if re.search("^[a-z]+$", _title) is not None:
            _next_title = ParseUtils.make_increase([chr(i + ord('a')) for i in range(26)], _title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[A-Z]+$", _title) is not None:
            _next_title = ParseUtils.make_increase([chr(i + ord('A')) for i in range(26)], _title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
            _sort = ["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ"]
            _index = _sort.index(_title)
            if _index < len(_sort) - 1:
                return _sort[_index + 1]
        return None

    @staticmethod
    def make_increase(_sort, _title, _add=1):
        """Add _add to the last "digit" of _title within the digit alphabet
        _sort, recursing leftwards to propagate the carry; the result string
        is in REVERSED digit order (callers reverse it back).

        NOTE(review): the carry is set whenever the current digit is the last
        element of _sort even when _add was 0, so e.g. an inner 'z'/'十' digit
        triggers an unexpected carry — confirm against expected sequences.
        """
        if len(_title) == 0 and _add == 0:
            return ""
        if len(_title) == 0 and _add == 1:
            return _sort[0]
        _index = _sort.index(_title[-1])
        next_index = (_index + _add) % len(_sort)
        next_chr = _sort[next_index]
        # Decide whether to carry into the next (more significant) digit.
        if _index == len(_sort) - 1:
            _add = 1
        else:
            _add = 0
        return next_chr + ParseUtils.make_increase(_sort, _title[:-1], _add)

    @staticmethod
    def rec_serial(_text, o_bbox, p_bbox, fontname, _pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
                                                             "(?P<title_2>^\d+[\.、\s])|" \
                                                             "(?P<title_3>^\d+\.\d+[\.、\s])|" \
                                                             "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
                                                             "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
        """Return the group name of the first serial-number pattern matching
        _text, or None. o_bbox / p_bbox / fontname are currently unused."""
        # todo :recog the serial of the sentence
        _se = re.search(_pattern, _text)
        if _se is not None:
            _gd = _se.groupdict()
            for k, v in _gd.items():
                if v is not None:
                    return k
        return None
if __name__ == '__main__':
    # get_text_font()
    # Manual smoke test: convert one local pdf and emit its html.
    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
    # print(b'\x10')
  1997. # print(b'\x10')