convert_pdf.py 96 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303
  1. import shutil
  2. import zlib
  3. from glob import glob
  4. import copy
  5. import io
  6. import os
  7. import re
  8. import sys
  9. from bs4 import BeautifulSoup
  10. sys.path.append(os.path.dirname(__file__) + "/../")
  11. from pdfplumber import PDF
  12. from pdfplumber.table import TableFinder
  13. from pdfplumber.page import Page as pdfPage
  14. from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table, TextBox
  15. import time
  16. from PIL import Image
  17. import traceback
  18. import cv2
  19. import PyPDF2
  20. from PyPDF2 import PdfFileReader, PdfFileWriter
  21. from pdfminer.pdfparser import PDFParser
  22. from pdfminer.pdfdocument import PDFDocument
  23. from pdfminer.pdfpage import PDFPage
  24. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  25. from pdfminer.converter import PDFPageAggregator
  26. from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
  27. LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
  28. from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
  29. memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, \
  30. get_traditional_chinese, ascii85_decode
  31. import fitz
  32. from format_convert.wrapt_timeout_decorator import timeout
  33. from otr.table_line_pdf import table_line_pdf
  34. from botr.extract_table import get_b_table_by_blank_colon
  35. @memory_decorator
  36. def pdf2text(path, unique_type_dir):
  37. return
  38. @timeout(10, timeout_exception=TimeoutError)
  39. def pdf_analyze(interpreter, page, device, page_no):
  40. pdf_time = time.time()
  41. interpreter.process_page(page)
  42. # print('interpreter.process_page time', time.time()-pdf_time)
  43. layout = device.get_result()
  44. log("page_no: " + str(page_no) + " pdf_analyze cost: " + str(time.time() - pdf_time))
  45. return layout
  46. @timeout(25, timeout_exception=TimeoutError)
  47. def read_pdfminer(path, laparams):
  48. fp = open(path, 'rb')
  49. parser = PDFParser(fp)
  50. doc_pdfminer = PDFDocument(parser)
  51. rsrcmgr = PDFResourceManager()
  52. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  53. interpreter = PDFPageInterpreter(rsrcmgr, device)
  54. return doc_pdfminer, device, interpreter
  55. @timeout(15, timeout_exception=TimeoutError)
  56. def read_pymupdf(path):
  57. return fitz.open(path)
  58. @timeout(15, timeout_exception=TimeoutError)
  59. def read_pypdf2(path):
  60. doc_pypdf2 = PdfFileReader(path, strict=False)
  61. doc_pypdf2_new = PdfFileWriter()
  62. return doc_pypdf2, doc_pypdf2_new
  63. @timeout(25, timeout_exception=TimeoutError, use_signals=False)
  64. def read_pdfplumber(path, laparams):
  65. fp = open(path, 'rb')
  66. lt = LineTable()
  67. doc_top = 0
  68. doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
  69. return lt, doc_top, doc_pdfplumber
  70. class PDFConvert:
  71. def __init__(self, path, unique_type_dir, need_page_no, page_need_to_image_dict=None):
  72. self._doc = _Document(path)
  73. self.path = path
  74. self.unique_type_dir = unique_type_dir
  75. if not os.path.exists(self.unique_type_dir):
  76. os.mkdir(self.unique_type_dir)
  77. # 指定提取的页码范围
  78. self.need_page_no = need_page_no
  79. self.start_page_no = None
  80. self.end_page_no = None
  81. # 默认使用limit_page_cnt控制,前10页后10页
  82. if self.need_page_no is None:
  83. self.limit_page_cnt = 50
  84. else:
  85. # 使用start_page_no,end_page_no范围控制,例如2,5
  86. ss = self.need_page_no.split(',')
  87. if len(ss) != 2:
  88. self._doc.error_code = [-14]
  89. else:
  90. self.start_page_no = int(ss[0])
  91. self.end_page_no = int(ss[-1])
  92. if self.end_page_no == -1:
  93. self.end_page_no = 1000000
  94. self.start_page_no -= 1
  95. self.end_page_no -= 1
  96. if self.end_page_no <= self.start_page_no or self.start_page_no < 0 or self.end_page_no < -1:
  97. self._doc.error_code = [-14]
  98. self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
  99. self.has_init_pdf = [0] * len(self.packages)
  100. # 记录图片对象的md5,用于去除大量重复图片
  101. self.md5_image_obj_list = []
  102. # 记录该页是不是纯文本
  103. self.only_text_list = []
  104. # 是否提取特殊页
  105. self.convert_specific_page = 1
  106. # 初始化_page
  107. self._page = _Page(None, 0)
  108. # 需要直接转成image来识别的页面
  109. if type(page_need_to_image_dict) is not dict:
  110. self.page_need_to_image_dict = {}
  111. else:
  112. self.page_need_to_image_dict = page_need_to_image_dict
  113. @memory_decorator
  114. def init_package(self, package_name):
  115. # 各个包初始化
  116. try:
  117. laparams = LAParams(line_overlap=0.01,
  118. char_margin=0.3,
  119. line_margin=0.01,
  120. word_margin=0.01,
  121. # boxes_flow=0.1,
  122. boxes_flow=None,
  123. )
  124. if package_name == self.packages[0]:
  125. self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
  126. self.has_init_pdf[0] = 1
  127. elif package_name == self.packages[1]:
  128. self.doc_pymupdf = read_pymupdf(self.path)
  129. self.has_init_pdf[1] = 1
  130. elif package_name == self.packages[2]:
  131. self.doc_pypdf2, self.doc_pypdf2_new = read_pypdf2(self.path)
  132. self.has_init_pdf[2] = 1
  133. elif package_name == self.packages[3]:
  134. self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
  135. self.has_init_pdf[3] = 1
  136. else:
  137. log("Only Support Packages " + str(self.packages))
  138. raise Exception
  139. except Exception as e:
  140. log(package_name + " cannot open pdf!")
  141. traceback.print_exc()
  142. self._doc.error_code = [-3]
    @memory_decorator
    def convert(self, limit_page_cnt=50):
        """Convert the pdf into the _Document tree.

        Reads page layouts with pdfminer (falling back to whole-page images if
        pdfminer cannot open or read the file), builds one _Page per page,
        strips headers/footers and duplicated images, and — when no explicit
        page range was requested — re-scans pages beyond *limit_page_cnt*
        looking for procurement-list keyword + table sections to append.

        :param limit_page_cnt: cap on pages parsed when no page range is set
        """
        if self.has_init_pdf[0] == 0:
            self.init_package("pdfminer")
            if self._doc.error_code is not None:
                self._doc.error_code = None
                # pdfminer cannot open it: fall back to image + ocr per page
                self.get_all_page_image()
                return
        # probe whether the pdf is actually readable
        try:
            pages = PDFPage.create_pages(self.doc_pdfminer)
            for page in pages:
                break
            pages = list(pages)
        # except pdfminer.psparser.PSEOF as e:
        except:
            # pdfminer cannot read blank-page objects; ocr pymupdf images instead
            log("pdfminer read failed! read by pymupdf!")
            traceback.print_exc()
            try:
                self.get_all_page_image()
                return
            except:
                traceback.print_exc()
                log("use pymupdf read failed!")
                self._doc.error_code = [-3]
                return
        # process page by page
        pages = PDFPage.create_pages(self.doc_pdfminer)
        pages = list(pages)
        page_count = len(pages)
        self.only_text_list = [-1] * len(pages)
        page_no = 0
        layout_list = []
        all_text_box_list = []
        for page in pages:
            # explicit page range requested
            if self.start_page_no is not None and self.end_page_no is not None:
                if page_count < self.end_page_no:
                    self.end_page_no = page_count
                if page_no < self.start_page_no or page_no >= self.end_page_no:
                    page_no += 1
                    continue
            # otherwise cap the number of pages parsed
            else:
                # if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
                #         limit_page_cnt / 2):
                #     page_no += 1
                #     continue
                if page_count > limit_page_cnt and page_no >= limit_page_cnt:
                    page_no += 1
                    continue
            # read the page layout
            start_time = time.time()
            layout, layout_obj_list, max_y = self.read_layout(page, page_no)
            # keep only the needed object types
            layout_obj_list = self.get_need_objs(layout_obj_list, max_y)
            all_text_box_list += layout_obj_list[1]
            layout_list.append([layout, layout_obj_list, max_y, page_no])
            log('read_layout page_no: ' + str(page_no) + ' cost: ' + str(time.time() - start_time))
            page_no += 1
        # remove cross-page watermarks
        # NOTE(review): `layout` is whatever the last parsed page left behind;
        # if every page was skipped above this raises NameError — confirm the
        # range validation upstream always leaves at least one page.
        _, delete_water_mark_list = self.delete_water_mark(all_text_box_list, layout.bbox, times=int(len(layout_list)/2))
        # print('delete_water_mark_list', delete_water_mark_list)
        # NOTE(review): the computed watermark list is discarded here, so
        # watermark deletion is effectively disabled — confirm intentional.
        delete_water_mark_list = []
        for layout, layout_obj_list, max_y, page_no in layout_list:
            # for obj in layout_obj_list:
            #     print('obj', obj)
            # parse a single page
            start_time = time.time()
            self._page = _Page(None, page_no)
            self._page.is_pdf = 1
            self.convert_page(layout, layout_obj_list, max_y, page_no, delete_water_mark_list)
            self._page.children.sort(key=lambda x: x.y)
            log('convert_page page_no: ' + str(page_no) + ' cost: ' + str(time.time() - start_time))
            if self._doc.error_code is None and self._page.error_code is not None:
                # -4/-3/0 are page-local failures: drop the page, keep the doc
                if self._page.error_code[0] in [-4, -3, 0]:
                    continue
                else:
                    self._doc.error_code = self._page.error_code
                    break
            self._doc.add_child(self._page)
        self._doc.children, detete_header_footer_list = self.delete_header_footer(self._doc.children)
        if self.convert_specific_page and self.need_page_no is None:
            # additionally extract specific pages beyond the cap
            if self.only_text_list.count(0) == 0:
                ratio = 0
            else:
                ratio = self.only_text_list.count(0) / (page_count - self.only_text_list.count(-1))
            if page_count > limit_page_cnt and ratio <= 0.2:
                page_no = 0
                find_flag = 0
                add_page_list = []
                for page in pages:
                    # if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
                    #     page_no += 1
                    #     continue
                    if not (page_no >= limit_page_cnt):
                        page_no += 1
                        continue
                    # parse a single page
                    start_time = time.time()
                    self._page = _Page(page, page_no)
                    layout, layout_obj_list, max_y = self.read_layout(page, page_no)
                    layout_obj_list = self.get_need_objs(layout_obj_list, max_y)
                    self.convert_page(layout, layout_obj_list, max_y, page_no, delete_water_mark_list, skip_image=1)
                    self._page.children.sort(key=lambda x: x.y)
                    log('convert_page add page_no: ' + str(page_no) + ' cost: ' + str(time.time() - start_time))
                    # strip the headers/footers found earlier
                    # NOTE(review): this rebinds `pages` inside its own loop;
                    # iteration continues over the original list object, but
                    # the name no longer refers to the page list afterwards.
                    pages, _ = self.delete_header_footer([self._page], detete_header_footer_list)
                    self._page = pages[0]
                    # extract the special section: keyword + following table
                    re_str = '采购清单|采购需求|需求概况'
                    if find_flag and len(self._page.children) > 0 and type(self._page.children[0]) == _Table:
                        log('add page1 ' + str(page_no))
                        add_page_list.append(self._page)
                        if len(self._page.children) - 1 > 3:
                            find_flag = 0
                    for index in range(len(self._page.children)):
                        obj = self._page.children[index]
                        if not (type(obj) == _Sentence and re.search(re_str, obj.content)):
                            continue
                        next_obj = None
                        if index + 1 < len(self._page.children):
                            # look at most 4 objects ahead for a table
                            for j in range(index + 1, min(len(self._page.children), index + 5)):
                                if type(self._page.children[j]) == _Table:
                                    next_obj = self._page.children[j]
                                    break
                        if next_obj:
                            if self._page not in add_page_list:
                                add_page_list.append(self._page)
                                log('add page2 ' + str(page_no))
                            # few objects after the keyword: table may continue
                            # onto the next page, keep the flag raised
                            if len(self._page.children) - index - 1 > 3:
                                find_flag = 0
                            else:
                                find_flag = 1
                    page_no += 1
                if add_page_list:
                    # self._doc.children = self._doc.children[:int(limit_page_cnt / 2)] \
                    #                      + add_page_list \
                    #                      + self._doc.children[int(limit_page_cnt / 2):]
                    self._doc.children = self._doc.children[:limit_page_cnt] \
                                         + add_page_list
        self.delete_same_image()
        # self.delete_bold_text_duplicate()
  289. def delete_same_image(self, show=0):
  290. # 剔除大量重复图片
  291. md5_dict = {}
  292. for _md5, image_obj in self.md5_image_obj_list:
  293. if _md5 in md5_dict.keys():
  294. md5_dict[_md5] += [image_obj]
  295. else:
  296. md5_dict[_md5] = [image_obj]
  297. cnt_threshold = 10
  298. delete_obj_list = []
  299. for _md5 in md5_dict.keys():
  300. img_list = md5_dict.get(_md5)
  301. # print('len(md5_dict.get(_md5))', _md5, len(img_list))
  302. if len(img_list) >= cnt_threshold:
  303. if show:
  304. img_np = bytes2np(img_list[0].content)
  305. cv2.namedWindow('delete same img_np', cv2.WINDOW_NORMAL)
  306. cv2.imshow('delete same img_np', img_np)
  307. cv2.waitKey(0)
  308. delete_obj_list += img_list
  309. for page in self._doc.children:
  310. for obj in delete_obj_list:
  311. if obj in page.children:
  312. page.children.remove(obj)
  313. if show:
  314. for page in self._doc.children:
  315. for obj in page.children:
  316. if isinstance(obj, _Image):
  317. img_np = bytes2np(obj.content)
  318. cv2.imshow('page img_np', img_np)
  319. cv2.waitKey(0)
    def delete_header_footer(self, pages, delete_list=[]):
        """Remove repeated header/footer sentences from *pages*.

        Sentences are keyed by (content, rounded y); a key seen on at least
        1/3 of max(10, len(pages)) pages is treated as a header/footer and its
        sentence objects are removed from every page. Keys listed in
        *delete_list* are force-flagged by inflating their count.

        :param pages: list of _Page objects (modified in place)
        :param delete_list: keys from a previous call to delete unconditionally
            (NOTE(review): mutable default argument — safe here because it is
            only read, never mutated)
        :return: (pages, list of deleted header/footer keys)
        """
        sen_dict = {}
        for page in pages:
            for obj in page.children:
                if isinstance(obj, _Sentence):
                    # same text at (almost) the same height ⇒ same key
                    key = str(obj.content) + ' ' + str(int(obj.y))
                    # print('key', key)
                    if key in sen_dict.keys():
                        sen_dict[key] += [obj]
                    else:
                        sen_dict[key] = [obj]
        # force-flag keys the caller already knows are headers/footers
        # print('delete_list', delete_list)
        for key in delete_list:
            if key in sen_dict:
                sen_dict[key] = sen_dict.get(key) * 10
        # print('sen_dict', sen_dict)
        delete_footer_header_list = []
        for key in sen_dict.keys():
            l = sen_dict.get(key)
            if len(l) >= 1 / 3 * max(10, len(pages)):
                delete_footer_header_list.append(key)
                # rebuild each page's children without the flagged sentences
                for page in pages:
                    new_children = []
                    for obj in page.children:
                        if isinstance(obj, _Sentence):
                            if obj not in l:
                                new_children.append(obj)
                        else:
                            new_children.append(obj)
                    page.children = new_children
                # print('len(l)', len(l), len(pages))
                # print('delete_header_footer l[0]', l[0].content, l[0].y)
        return pages, delete_footer_header_list
  354. @memory_decorator
  355. def delete_bold_text_duplicate(self, lt_text_box_list):
  356. # 拿出所有LTChar
  357. lt_char_list = []
  358. for lt_text_box in lt_text_box_list:
  359. if '.......' in lt_text_box.get_text():
  360. # print('....... lt_text_box continue')
  361. continue
  362. for lt_text_line in lt_text_box:
  363. for lt_char in lt_text_line:
  364. if isinstance(lt_char, LTChar):
  365. lt_char_list.append(lt_char)
  366. # 找出需剔除的
  367. lt_char_list.sort(key=lambda x: (int(x.bbox[1]), x.bbox[0]))
  368. delete_list = []
  369. for i in range(len(lt_char_list)):
  370. lt_char1 = lt_char_list[i]
  371. bbox1 = lt_char1.bbox
  372. if lt_char1 in delete_list:
  373. continue
  374. for j in range(i + 1, len(lt_char_list)):
  375. lt_char2 = lt_char_list[j]
  376. bbox2 = lt_char2.bbox
  377. if lt_char2 in delete_list:
  378. continue
  379. if lt_char1.get_text() == lt_char2.get_text() and bbox_iou(bbox1, bbox2) >= 0.3 \
  380. and re.search('[\u4e00-\u9fff():、,。]', lt_char1.get_text()):
  381. delete_list.append(lt_char2)
  382. # 重新组装
  383. new_lt_text_box_list = []
  384. for lt_text_box in lt_text_box_list:
  385. new_lt_text_box = LTTextBoxHorizontal()
  386. for lt_text_line in lt_text_box:
  387. new_lt_text_line = LTTextLine(0.01)
  388. for lt_char in lt_text_line:
  389. if lt_char in delete_list:
  390. continue
  391. if isinstance(lt_char, LTChar):
  392. new_lt_text_line.add(lt_char)
  393. new_lt_text_box.add(new_lt_text_line)
  394. new_lt_text_box_list.append(new_lt_text_box)
  395. return new_lt_text_box_list
  396. def clean_text(self, _text):
  397. return re.sub("\s", "", _text)
  398. def get_text_lines(self, page, page_no):
  399. lt_line_list = []
  400. page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
  401. self.doc_top += page_plumber.height
  402. table_finder = TableFinder(page_plumber)
  403. all_width_zero = True
  404. for _edge in table_finder.get_edges():
  405. if _edge.get('linewidth') and _edge.get('linewidth') > 0:
  406. all_width_zero = False
  407. break
  408. for _edge in table_finder.get_edges():
  409. # print(_edge)
  410. if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
  411. lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
  412. (float(_edge["x1"]), float(_edge["y1"]))))
  413. log("pdf page_no %s has %s lines" % (str(page_no), str(len(lt_line_list))))
  414. return lt_line_list
  415. @memory_decorator
  416. def get_page_lines(self, lt_line_list, layout, page_no):
  417. lt_line_list = table_line_pdf(lt_line_list, layout, page_no)
  418. return lt_line_list
  419. @memory_decorator
  420. def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
  421. list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
  422. from_pdf=True, is_reverse=False)
  423. # self._page.in_table_objs = filter_objs
  424. # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
  425. table_list = []
  426. for table in list_tables:
  427. _table = _Table(table["table"], table["bbox"])
  428. # self._page.children.append(_table)
  429. self._page.add_child(_table)
  430. table_list.append(_table)
  431. list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
  432. layout.bbox, page_no)
  433. for sentence in list_sentences:
  434. # print('sentence.text', sentence.text)
  435. _sen = _Sentence(sentence.text, sentence.bbox)
  436. self._page.add_child(_sen)
  437. # pdf对象需反向排序
  438. # self._page.is_reverse = True
  439. return table_list
  440. def is_text_legal(self, lt_text_list, page_no):
  441. # 无法识别pdf字符编码,整页用ocr
  442. text_temp = ""
  443. for _t in lt_text_list:
  444. text_temp += _t.get_text()
  445. if re.search('[(]cid:[0-9]+[)]', text_temp):
  446. log("page_no: " + str(page_no) + " text has cid! try pymupdf...")
  447. page_image = self.get_page_image(page_no)
  448. if judge_error_code(page_image):
  449. self._page.error_code = page_image
  450. else:
  451. _image = _Image(page_image[1], page_image[0])
  452. self._page.add_child(_image)
  453. return False
  454. match1 = re.findall(get_garble_code(), text_temp)
  455. # match2 = re.search('[\u4e00-\u9fa5]', text_temp)
  456. if len(match1) > 8 and len(text_temp) > 10:
  457. log("page_no: " + str(page_no) + " garbled code! try pymupdf... " + text_temp[:20])
  458. page_image = self.get_page_image(page_no)
  459. if judge_error_code(page_image):
  460. self._page.error_code = page_image
  461. else:
  462. _image = _Image(page_image[1], page_image[0])
  463. self._page.add_child(_image)
  464. return False
  465. return True
    @memory_decorator
    def judge_b_table(self, lt_text_list, table_list, page_no):
        """Heuristically decide whether the page contains a borderless table.

        Groups text objects into visual rows, accumulates consecutive rows that
        look tabular (multiple cells, or one cell with a wide inner gap between
        CJK runs), rejects candidates that are mostly two-column layouts or
        overlap an already-recognised bordered table, and returns the verdict.

        :param lt_text_list: non-empty list of text objects with .bbox/.get_text()
        :param table_list: already recognised bordered tables (for overlap test)
        :return: bool — True when a borderless-table candidate survives all filters
        """
        table_h_list = []
        for table in table_list:
            table_h_list.append([table.bbox[1], table.bbox[3]])
        # group text objects into rows by (approximate) y coordinate
        lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
        lt_text_row_list = []
        current_h = lt_text_list[0].bbox[1]
        row = []
        threshold = 2
        for lt_text in lt_text_list:
            bbox = lt_text.bbox
            if current_h - threshold <= bbox[1] <= current_h + threshold:
                row.append(lt_text)
            else:
                if row:
                    lt_text_row_list.append(row)
                row = [lt_text]
                current_h = lt_text.bbox[1]
        if row:
            lt_text_row_list.append(row)
        # detect runs of rows where the text has inner gaps, or a row holds
        # several separate cells
        is_b_table_cnt = 3          # min consecutive tabular rows to accept
        tolerate_cnt = 2            # non-tabular rows tolerated inside a run
        t_cnt = 0
        row_cnt = 0
        b_table_row_list = []
        all_b_table = []
        row_col_list = []
        all_row_col_list = []
        for row in lt_text_row_list:
            # skip watermark rows (single one-character cell)
            if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
                continue
            # skip table-of-contents rows (runs of dots)
            continue_flag = False
            for r in row:
                if re.search('[.·]{7,}', r.get_text()):
                    continue_flag = True
                    # NOTE(review): this also clears all previously accepted
                    # candidates' column stats — confirm that is intentional
                    all_row_col_list = []
                    break
            if continue_flag:
                continue
            if len(row) == 1:
                # one cell: tabular only if a wide space splits two CJK runs
                text = row[0].get_text()
                bbox = row[0].bbox
                match = re.search('[ ]{3,}', text)
                if match and re.search('[\u4e00-\u9fff]{2,}', text[:match.span()[0]]) \
                        and re.search('[\u4e00-\u9fff]{2,}', text[match.span()[1]:]):
                    row_cnt += 1
                    t_cnt = 0
                    b_table_row_list += row
                    row_col_list += [row]
                else:
                    # tolerate a couple of plain rows before closing the run
                    if t_cnt < tolerate_cnt:
                        t_cnt += 1
                        continue
                    if b_table_row_list and row_cnt >= is_b_table_cnt:
                        all_b_table.append(b_table_row_list)
                        all_row_col_list.append(row_col_list)
                    row_cnt = 0
                    b_table_row_list = []
                    row_col_list = []
            else:
                # several cells on one visual row: counts as tabular
                row_cnt += 1
                t_cnt = 0
                b_table_row_list += row
                row_col_list += [row]
        if b_table_row_list and row_cnt >= is_b_table_cnt:
            all_b_table.append(b_table_row_list)
            all_row_col_list.append(row_col_list)
        # print('b_table_row_list', b_table_row_list)
        # reject mostly-two-column candidates: two-column borderless tables are
        # recognised separately elsewhere
        # print('len(all_row_col_list)', len(all_row_col_list))
        row_cnt = 0
        col_2_cnt = 0
        for row_col_list in all_row_col_list:
            for col_list in row_col_list:
                row_cnt += 1
                if len(col_list) == 2:
                    col_2_cnt += 1
                    # print('col_list', col_list)
        # print('row_cnt, col_2_cnt', row_cnt, col_2_cnt)
        if row_cnt == 0 or col_2_cnt / row_cnt >= 0.5:
            log("page_no: " + str(page_no) + ' is_b_table_flag False')
            return False
        # finally check each candidate against bordered tables' y-ranges
        is_b_table_flag = False
        for b_table in all_b_table:
            in_flag = False
            for table_h in table_h_list:
                for b in b_table:
                    if min(table_h) <= b.bbox[1] <= max(table_h) or min(table_h) <= b.bbox[3] <= max(table_h):
                        in_flag = True
                        break
                if in_flag:
                    break
            if in_flag:
                is_b_table_flag = False
            else:
                is_b_table_flag = True
                # print('is_b_table_flag True ', [[x.get_text(), x.bbox] for x in b_table])
                # print('table_h_list', table_h_list)
                break
        log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
        # save pdfs judged True, for offline inspection
        # if is_b_table_flag:
        #     self.save_b_table_pdf(page_no)
        return is_b_table_flag
  578. def save_b_table_pdf(self, page_no):
  579. # save_dir = r"D:\Project\format_conversion_maxcompute\save_b_table_pdf"
  580. save_dir = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_pdf'
  581. max_index = 200
  582. if os.path.exists(save_dir):
  583. file_list = glob(save_dir + '/*')
  584. if file_list:
  585. file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
  586. file_index_list.sort(key=lambda x: x)
  587. index = file_index_list[-1] + 1
  588. else:
  589. index = 0
  590. if index > max_index:
  591. return
  592. else:
  593. return
  594. save_path = f'{save_dir}/{index}-{page_no}.pdf'
  595. try:
  596. shutil.copy(self.path, save_path)
  597. print("文件复制成功!")
  598. except Exception as e:
  599. print(f"文件复制失败:{e}")
    def char_to_text_box(self, char_list):
        """Assemble loose chars (e.g. from LTFigure) into LTTextBox objects.

        Splits *char_list* into visual rows by y, then splits each row into
        column segments wherever the x-gap exceeds the average glyph width,
        and wraps each segment into an LTTextLine inside an LTTextBoxHorizontal.

        :param char_list: char-like objects with .bbox and .get_text()
        :return: (list of LTTextBoxHorizontal,
                  dict mapping "text+bbox" key -> the segment's char list)
        """
        lt_text_box_list = []
        # split chars into rows by y 20241028
        new_text_line_list = []
        new_text_line = []
        lt_char_y = None
        for y in char_list:
            if lt_char_y is None:
                lt_char_y = y.bbox[1]
            # a vertical jump of >=1 starts a new row
            if abs(lt_char_y - y.bbox[1]) >= 1:
                # new_lt_text_box.add(new_lt_text_line)
                # lt_text_box_list.append(new_lt_text_box)
                new_text_line_list.append(new_text_line)
                # print('new_text_line', lt_char_y, y.bbox[1], new_text_line)
                new_text_line = [y]
                lt_char_y = y.bbox[1]
            else:
                new_text_line.append(y)
        if new_text_line:
            new_text_line_list.append(new_text_line)
        # split rows into column segments by x gap
        temp_list = []
        for new_text_line in new_text_line_list:
            new_text_line.sort(key=lambda x: x.bbox[0])
            lt_char_x = new_text_line[0].bbox[2]
            split_text_line = []
            for y in new_text_line:
                # gap wider than the average glyph width ⇒ new segment
                # NOTE(review): on the row's first char this can append an
                # empty segment to temp_list — confirm downstream tolerates it
                if abs(lt_char_x - y.bbox[0]) > abs(y.bbox[2] - y.bbox[0]) / len(y.get_text()):
                    temp_list.append(split_text_line)
                    split_text_line = [y]
                else:
                    split_text_line.append(y)
                lt_char_x = y.bbox[2]
            if split_text_line:
                temp_list.append(split_text_line)
        new_text_line_list = temp_list
        # LTFigure also holds LTChar; wrap into LTTextLine/LTTextBox 20241012
        text_box_char_dict = {}
        for new_text_line in new_text_line_list:
            new_text_line.sort(key=lambda x: x.bbox[0])
            new_lt_text_box = LTTextBoxHorizontal()
            new_lt_text_line = LTTextLine(0.01)
            for y in new_text_line:
                new_lt_text_line.add(y)
            new_lt_text_box.add(new_lt_text_line)
            lt_text_box_list.append(new_lt_text_box)
            key = new_lt_text_line.get_text() + str(new_lt_text_line.bbox)
            text_box_char_dict[key] = new_text_line
        # print('text_box_char_dict', text_box_char_dict)
        return lt_text_box_list, text_box_char_dict
  650. @memory_decorator
  651. def get_need_objs(self, obj_list, max_y):
  652. # 文字
  653. lt_char_list = []
  654. lt_text_box_list = []
  655. # 图像
  656. lt_image_list = []
  657. # 嵌套图表
  658. lt_figure_list = []
  659. # 线
  660. lt_line_list = []
  661. # text_box中单个字符映射
  662. text_box_char_dict = {}
  663. for x in obj_list:
  664. # 重置bbox
  665. x.set_bbox((x.x0, round(max_y - max(x.y0, x.y1), 1), x.x1, round(max_y - min(x.y0, x.y1), 1)))
  666. # 需重置内部LTChar
  667. if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
  668. for lt_text_line in x:
  669. new_lt_char_list = []
  670. new_lt_char_text = ''
  671. for lt_char in lt_text_line:
  672. if isinstance(lt_char, LTChar):
  673. lt_char.set_bbox((lt_char.x0, round(max_y - max(lt_char.y0, lt_char.y1), 1), lt_char.x1,
  674. round(max_y - min(lt_char.y0, lt_char.y1), 1)))
  675. # 填充颜色、描边颜色都为白色
  676. if lt_char.graphicstate.scolor == [1, 1, 1] and lt_char.graphicstate.ncolor == [1, 1, 1]:
  677. continue
  678. new_lt_char_list.append(lt_char)
  679. new_lt_char_text += lt_char.get_text()
  680. text_box_char_dict[new_lt_char_text + str(x.bbox)] = new_lt_char_list
  681. lt_text_line._objs = new_lt_char_list
  682. # 分类
  683. if isinstance(x, LTChar):
  684. lt_char_list.append(x)
  685. elif isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
  686. lt_text_box_list.append(x)
  687. elif isinstance(x, LTImage):
  688. lt_image_list.append(x)
  689. elif isinstance(x, LTFigure):
  690. lt_figure_list.append(x)
  691. elif isinstance(x, (LTTextContainer, LTRect, LTLine, LTCurve)):
  692. lt_line_list.append(x)
  693. # print('len(obj_list)', len(obj_list))
  694. # print('len(lt_char_list)', len(lt_char_list))
  695. # print('len(lt_text_box_list)', len(lt_text_box_list))
  696. # if len(lt_text_box_list) >= 200:
  697. # for lt_text in lt_text_box_list:
  698. # print('>= 200 lt_text', lt_text.get_text())
  699. # print('len(lt_image_list)', len(lt_image_list))
  700. if lt_figure_list:
  701. temp_figure_list = []
  702. for sub_figure in lt_figure_list:
  703. sub_lt_char_list, sub_lt_text_box_list, sub_lt_image_list, sub_lt_figure_list, \
  704. sub_lt_line_list, sub_text_box_char_dict = self.get_need_objs(sub_figure, max_y)
  705. lt_char_list += sub_lt_char_list
  706. lt_text_box_list += sub_lt_text_box_list
  707. lt_image_list += sub_lt_image_list
  708. temp_figure_list += sub_lt_figure_list
  709. lt_line_list += sub_lt_line_list
  710. text_box_char_dict = {**text_box_char_dict, **sub_text_box_char_dict}
  711. lt_figure_list = temp_figure_list
  712. # LTChar拼成LTTextBox
  713. lt_char_list.sort(key=lambda z: (z.bbox[1], z.bbox[0]))
  714. add_lt_text_box_list, add_text_box_char_dict = self.char_to_text_box(lt_char_list)
  715. for lt_text_box in add_lt_text_box_list:
  716. if lt_text_box in lt_text_box_list:
  717. continue
  718. lt_text_box_list += add_lt_text_box_list
  719. lt_char_list = []
  720. text_box_char_dict = {**text_box_char_dict, **add_text_box_char_dict}
  721. lt_text_box_list = self.delete_water_mark_by_location(lt_text_box_list)
  722. # 分行后过滤
  723. temp_list = []
  724. for lt_text_box in lt_text_box_list:
  725. if lt_text_box.get_text() in ['', ' ', '\t', '\n', '\r']:
  726. continue
  727. temp_list.append(lt_text_box)
  728. if len(lt_text_box_list) != len(temp_list):
  729. log('filter lt_text_box_list ' + str(len(lt_text_box_list)) + ' -> ' + str(len(temp_list)))
  730. lt_text_box_list = temp_list
  731. return lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, lt_line_list, text_box_char_dict
  732. @memory_decorator
  733. def read_layout(self, page, page_no):
  734. layout = self.get_layout(page, page_no)
  735. if self._doc.error_code is not None:
  736. return
  737. if judge_error_code(layout):
  738. self._page.error_code = layout
  739. return
  740. # 翻转pdf中所有对象的y坐标
  741. # max_y, min_y = 0, 10000
  742. layout_obj_list = []
  743. max_y = layout.height
  744. for x in layout:
  745. layout_obj_list.append(x)
  746. return layout, layout_obj_list, max_y
    def split_text_box_by_lines(self, lt_line_list, lt_text_box_list):
        """
        Split text boxes that are crossed by vertical table lines.

        No per-char position information is available here, so the cut index
        inside the text is estimated from the box width and the string length
        (a CJK char is assumed to be twice as wide as any other char).
        :param lt_line_list: layout objects treated as table lines
        :param lt_text_box_list: text boxes to examine
        :return: new text box list with the split boxes replaced
        """
        # keep only the (near-)vertical lines
        col_lines = []
        for line in lt_line_list:
            bbox = line.bbox
            if abs(bbox[1] - bbox[3]) > abs(bbox[0] - bbox[2]):
                col_lines.append(line)
        # matches CJK punctuation and common chinese chars
        reg = '[\u3000-\u303f]|[\u4e00-\u9fa5]'
        delete_text_box_list = []
        add_text_box_list = []
        for text_box in lt_text_box_list:
            text = text_box.get_text()
            bbox = text_box.bbox
            if len(text) == 0:
                continue
            # x positions of vertical lines cutting through this box
            split_x_list = []
            for line in col_lines:
                if bbox[0] < line.bbox[0] < bbox[2] and line.bbox[1] <= bbox[1] <= line.bbox[3]:
                    split_x_list.append(line.bbox[0])
            if not split_x_list:
                continue
            # estimate the width of a single char; a chinese char counts as two units
            chinese_text = ''.join(re.findall(reg, text))
            chinese_len = len(chinese_text)
            other_len = len(text) - chinese_len
            one_char_len = abs(bbox[2] - bbox[0]) / (chinese_len * 2 + other_len)
            # normalise: replace every CJK char with one reference char ('我') so
            # each costs exactly two width units during counting below
            temp_text = re.sub(reg, '我', text)
            start_x = bbox[0]
            start_cnt = 0
            split_text_box_list = []
            split_x_list.append(bbox[2])
            split_x_list.sort(key=lambda z: z)
            for x in split_x_list:
                # how many width units fit between the previous cut and this one
                distance = x - start_x
                char_cnt = int(distance / one_char_len)
                add_char_cnt = 0
                real_char_cnt = 0
                # NOTE(review): this count always restarts at the head of
                # temp_text rather than at start_cnt, so for the second and
                # later segments the widths counted are those of the leading
                # chars — verify this is intended for multi-segment splits.
                for c in temp_text:
                    if add_char_cnt >= char_cnt:
                        break
                    if c == '我':
                        add_char_cnt += 2
                    else:
                        add_char_cnt += 1
                    real_char_cnt += 1
                real_text = text[start_cnt:start_cnt+real_char_cnt]
                split_text_box_list.append([real_text, start_x, x])
                start_x = x
                start_cnt = start_cnt+real_char_cnt
            # wrap the split pieces into new text box objects
            if split_text_box_list:
                delete_text_box_list.append(text_box)
                # first char of the box is cloned as a template for the pieces
                lt_chars = [y for x in text_box for y in x]
                lt_char = lt_chars[0]
                for text, start_x, x in split_text_box_list:
                    new_lt_char = copy.deepcopy(lt_char)
                    new_lt_char._text = text
                    new_lt_char.set_bbox([start_x, lt_char.bbox[1], x, lt_char.bbox[3]])
                    new_lt_text_box = LTTextBoxHorizontal()
                    new_lt_text_line = LTTextLine(0.01)
                    new_lt_text_line.add(new_lt_char)
                    new_lt_text_box.add(new_lt_text_line)
                    add_text_box_list.append(new_lt_text_box)
        # drop the originals that were split, keep the rest, add the pieces
        temp_list = []
        for text_box in lt_text_box_list:
            if text_box in delete_text_box_list:
                continue
            temp_list.append(text_box)
        lt_text_box_list = temp_list
        lt_text_box_list += add_text_box_list
        return lt_text_box_list
    @memory_decorator
    def split_text_box_by_lines2(self, lt_line_list, lt_text_box_list, text_box_char_dict):
        """
        Split text boxes crossed by vertical table lines, using per-char positions.

        Unlike split_text_box_by_lines(), the x position of every char is
        known here (via text_box_char_dict), so the cut index is found by
        merging the char x positions with the line x positions.
        :param lt_line_list: layout objects treated as table lines
        :param lt_text_box_list: text boxes to examine
        :param text_box_char_dict: maps "text + str(bbox)" of a box to its chars
        :return: new text box list with the split boxes replaced
        """
        # keep only the (near-)vertical lines
        col_lines = []
        for line in lt_line_list:
            bbox = line.bbox
            if abs(bbox[1] - bbox[3]) > abs(bbox[0] - bbox[2]):
                col_lines.append(line)
        delete_text_box_list = []
        add_text_box_list = []
        for text_box in lt_text_box_list:
            text = text_box.get_text()
            bbox = text_box.bbox
            if len(text) == 0:
                continue
            # x positions of vertical lines cutting through this box
            split_x_list = []
            for line in col_lines:
                if bbox[0] < line.bbox[0] < bbox[2] and line.bbox[1] <= bbox[1] <= line.bbox[3]:
                    split_x_list.append(line.bbox[0])
            if not split_x_list:
                continue
            split_x_list.append(bbox[2])
            # look up the chars of this box
            key = text + str(bbox)
            char_list = text_box_char_dict.get(key)
            if not char_list or len(char_list) <= 1:
                continue
            # merge the char x0 positions with the cut positions and sort
            char_x_list = [x.bbox[0] for x in char_list]
            char_x_list += split_x_list
            char_x_list.sort(key=lambda x: x)
            split_text_box_list = []
            start_index = 0
            start_x = char_x_list[0]
            for x in split_x_list:
                # NOTE(review): `index` is a position in the MERGED x list, yet
                # it is used to slice `text`; earlier cut markers in the list
                # shift the slice by their count — presumably acceptable for
                # this heuristic, verify against callers.
                index = char_x_list.index(x)
                sub_text = text[start_index:index]
                if len(sub_text) == 0:
                    continue
                end_x = char_x_list[index-1]
                # avoid producing a zero-width box
                if start_x == end_x:
                    end_x += 1
                split_text_box_list.append([sub_text, start_x, end_x])
                start_index = index
                if index + 1 >= len(char_x_list):
                    break
                start_x = char_x_list[index+1]
            # wrap the split pieces into new text box objects
            if split_text_box_list:
                delete_text_box_list.append(text_box)
                # first char of the box is cloned as a template for the pieces
                lt_chars = [y for x in text_box for y in x]
                lt_char = lt_chars[0]
                for text, start_x, x in split_text_box_list:
                    new_lt_char = copy.deepcopy(lt_char)
                    new_lt_char._text = text
                    new_lt_char.set_bbox([start_x, lt_char.bbox[1], x, lt_char.bbox[3]])
                    new_lt_text_box = LTTextBoxHorizontal()
                    new_lt_text_line = LTTextLine(0.01)
                    new_lt_text_line.add(new_lt_char)
                    new_lt_text_box.add(new_lt_text_line)
                    add_text_box_list.append(new_lt_text_box)
        # drop the originals that were split, keep the rest, add the pieces
        temp_list = []
        for text_box in lt_text_box_list:
            if text_box in delete_text_box_list:
                continue
            temp_list.append(text_box)
        lt_text_box_list = temp_list
        lt_text_box_list += add_text_box_list
        return lt_text_box_list
    @memory_decorator
    def convert_page(self, layout, layout_obj_list, max_y, page_no, delete_water_mark_list, skip_image=0):
        """Convert one page's layout objects into _Image / _Table children of self._page.

        Falls back to whole-page OCR (a page screenshot added as an _Image)
        when the page is flagged in page_need_to_image_dict, contains too many
        images, has no usable text, or looks like it holds a borderless table.

        :param layout: pdfminer page layout (used for bbox / size)
        :param layout_obj_list: tuple produced by get_need_objs()
        :param max_y: page height (0 signals the page had no objects at all)
        :param page_no: page index
        :param delete_water_mark_list: texts known to be watermarks of this document
        :param skip_image: truthy -> ignore embedded images
        """
        # page without any usable object: left for whole-page ocr later (20240820)
        if max_y == 0 and len(layout_obj_list) > 0:
            return
        # page explicitly flagged for whole-page ocr
        if self.page_need_to_image_dict.get(page_no) is True:
            page_image = self.get_page_image(page_no)
            if judge_error_code(page_image):
                self._page.error_code = page_image
            else:
                _image = _Image(page_image[1], page_image[0])
                _image.is_from_pdf = True
                _image.is_reverse = False
                self._page.add_child(_image)
            return
        lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, \
            lt_line_list, text_box_char_dict = layout_obj_list
        # drop known watermark texts
        for water_mark in delete_water_mark_list:
            temp_list = []
            for lt_text_box in lt_text_box_list:
                if water_mark == lt_text_box.get_text():
                    continue
                temp_list.append(lt_text_box)
            lt_text_box_list = temp_list
        # record whether this page is text-only (1) or not (0)
        if len(lt_image_list) == 0 and len(lt_text_box_list) == 0:
            self.only_text_list[page_no] = 0
        elif len(lt_image_list) == 0:
            self.only_text_list[page_no] = 1
        else:
            self.only_text_list[page_no] = 0
        # optionally ignore embedded images
        if skip_image:
            lt_image_list = []
        # embedded text looks garbled but the page has an image: drop the text
        # so the image gets ocr'ed instead
        all_text = ''.join([x.get_text() for x in lt_text_box_list])
        all_text = re.sub('[\s\d]', '', all_text)
        if len(re.findall(get_garble_code2(), all_text)) >= 3 and len(lt_image_list) >= 1:
            log('嵌入的文字是乱码1: ' + str(all_text[:10]))
            lt_text_box_list = []
        if 3 <= len(re.findall(get_traditional_chinese(), all_text)) <= len(all_text) / 2 and len(lt_image_list) >= 1:
            log('嵌入的文字是乱码2: ' + str(all_text[:10]))
            lt_text_box_list = []
        # remove duplicated bold glyphs
        lt_text_box_list = self.delete_bold_text_duplicate(lt_text_box_list)
        # remove per-page watermark texts
        lt_text_box_list, _ = self.delete_water_mark(lt_text_box_list, layout.bbox, 15)
        log("page_no: " + str(page_no) + " len(lt_image_list), len(lt_text_box_list) " +
            str(len(lt_image_list)) + " " + str(len(lt_text_box_list)))
        # watermark removal by font color (disabled)
        # lt_text_box_list = self.delete_water_mark_by_color(lt_text_box_list)
        # watermark removal by glyph rotation angle
        lt_text_box_list = self.delete_water_mark_by_rotate(lt_text_box_list)
        # too many images or no text at all: ocr the whole page instead
        if len(lt_image_list) > 4 or len(lt_text_box_list) == 0:
            page_image = self.get_page_image(page_no)
            if judge_error_code(page_image):
                self._page.error_code = page_image
            else:
                _image = _Image(page_image[1], page_image[0])
                _image.is_from_pdf = True
                _image.is_reverse = False
                self._page.add_child(_image)
        # otherwise read the page objects normally
        else:
            # NOTE: a legacy pdfminer-based per-image extraction path used to
            # live here (commented out); images are now read through the
            # helper cascade below, each step a fallback for the previous one.
            status = self.pdfminer_read_page_images(lt_image_list, page_no)
            if not status:
                log('pymupdf 提取页面中图片 page_no: ' + str(page_no))
                status = self.pymupdf_read_page_images(page_no)
            if not status:
                log('pymupdf 整页转化为图片 page_no: ' + str(page_no))
                status = self.pymupdf_get_whole_page_image(page_no)
            if self.has_init_pdf[3] == 0:
                self.init_package("pdfplumber")
            if not self.is_text_legal(lt_text_box_list, page_no):
                return
            try:
                # detect table lines on the page
                lt_line_list = self.get_page_lines(lt_line_list, layout, page_no)
            except:
                traceback.print_exc()
                lt_line_list = []
                self._page.error_code = [-13]
            # split text boxes crossed by table lines
            lt_text_box_list = self.split_text_box_by_lines2(lt_line_list, lt_text_box_list, text_box_char_dict)
            # fill the text into recognised tables
            table_list = self.recognize_text(layout, page_no, lt_text_box_list, lt_line_list)
            # heuristics: borderless table built from "key: value" text layout
            try:
                b_table_list, _ = get_b_table_by_blank_colon(lt_text_box_list, table_list, layout.bbox, None)
            except:
                traceback.print_exc()
                b_table_list = []
                self._page.error_code = [-23]
            if b_table_list:
                for table in b_table_list:
                    _table = _Table(table[0], table[1])
                    table_list += [_table]
                    self._page.add_child(_table)
            for t in table_list:
                self._page.table_bbox_list.append(t.bbox)
            # the page may still hold a borderless table: ocr it from a screenshot
            if self.judge_b_table(lt_text_box_list, table_list, page_no):
                page_image = self.get_page_image(page_no)
                if judge_error_code(page_image):
                    self._page.error_code = page_image
                else:
                    # placeholder bbox far outside the page area
                    _image = _Image(page_image[1], page_image[0],
                                    bbox=(10000, 10000, 10001, 10001))
                    _image.is_from_pdf = True
                    _image.b_table_from_text = True
                    _image.b_table_text_obj_list = lt_text_box_list
                    _image.b_table_layout_size = (layout.width, layout.height)
                    self._page.add_child(_image)
  1067. @memory_decorator
  1068. def get_layout(self, page, page_no):
  1069. if self.has_init_pdf[0] == 0:
  1070. self.init_package("pdfminer")
  1071. if self._doc.error_code is not None:
  1072. return
  1073. # 获取该页layout
  1074. start_time = time.time()
  1075. try:
  1076. if get_platform() == "Windows":
  1077. layout = pdf_analyze(self.interpreter, page, self.device, page_no)
  1078. else:
  1079. layout = pdf_analyze(self.interpreter, page, self.device, page_no)
  1080. except TimeoutError as e:
  1081. log("page_no: " + str(page_no) + " pdfminer read page time out! " + str(time.time() - start_time))
  1082. layout = [-4]
  1083. except Exception:
  1084. traceback.print_exc()
  1085. log("page_no: " + str(page_no) + " pdfminer read page error! continue...")
  1086. layout = [-3]
  1087. log("page_no: " + str(page_no) + " get_layout cost: " + str(time.time() - start_time))
  1088. return layout
  1089. @memory_decorator
  1090. def get_page_image(self, page_no):
  1091. start_time = time.time()
  1092. try:
  1093. if self.has_init_pdf[1] == 0:
  1094. self.init_package("PyMuPDF")
  1095. if self._doc.error_code is not None:
  1096. return
  1097. # save_dir = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
  1098. output = self.unique_type_dir + "page" + str(page_no) + ".png"
  1099. page = self.doc_pymupdf.loadPage(page_no)
  1100. rotate = int(0)
  1101. zoom_x = 2.
  1102. zoom_y = 2.
  1103. mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  1104. pix = page.getPixmap(matrix=mat, alpha=False)
  1105. pix.writePNG(output)
  1106. # 输出图片resize
  1107. self.resize_image(output)
  1108. with open(output, "rb") as f:
  1109. pdf_image = f.read()
  1110. log("page_no: " + str(page_no) + ' get_page_image cost: ' + str(time.time() - start_time))
  1111. return [output, pdf_image]
  1112. except ValueError as e:
  1113. traceback.print_exc()
  1114. if str(e) == "page not in document":
  1115. log("page_no: " + str(page_no) + " page not in document! continue...")
  1116. return [0]
  1117. elif "encrypted" in str(e):
  1118. log("page_no: " + str(page_no) + " document need password")
  1119. return [-7]
  1120. except RuntimeError as e:
  1121. if "cannot find page" in str(e):
  1122. log("page_no: " + str(page_no) + " page cannot find in document! continue...")
  1123. return [0]
  1124. else:
  1125. traceback.print_exc()
  1126. return [-3]
  1127. def get_all_page_image(self):
  1128. start_time = time.time()
  1129. if self.has_init_pdf[1] == 0:
  1130. self.init_package("PyMuPDF")
  1131. if self._doc.error_code is not None:
  1132. return
  1133. page_count = self.doc_pymupdf.page_count
  1134. for page_no in range(page_count):
  1135. # 限制pdf页数,只取前10页后10页
  1136. if page_count > 20:
  1137. if 10 <= page_no < page_count - 10:
  1138. continue
  1139. self._page = _Page(None, page_no)
  1140. page_image = self.get_page_image(page_no)
  1141. if judge_error_code(page_image):
  1142. self._page.error_code = page_image
  1143. else:
  1144. _image = _Image(page_image[1], page_image[0])
  1145. self._page.add_child(_image)
  1146. # 报错继续读后面页面
  1147. if self._doc.error_code is None and self._page.error_code is not None:
  1148. continue
  1149. self._doc.add_child(self._page)
  1150. log('get_all_page_image cost: ' + str(time.time() - start_time))
    @memory_decorator
    def connect_table(self, html_list, show=0):
        """Join tables that continue across page boundaries.

        :param html_list: one html string per page
        :param show: truthy -> print intermediate grouping state
        :return: single-element list with the merged html of all pages
        """
        if not html_list:
            return html_list
        # Initial grouping conditions (A = last table of page i,
        # B = first table of page i+1):
        # 1.1: nothing but a page number after A, and nothing before B
        # 1.2: text before B (likely a header, < 60 chars), B's first row starts
        #      with empty cells and contains at least one long (>= 30 chars) cell
        # 1.3: text before B, B's first row's first cell is empty and at least
        #      half of the row's cells are empty
        # 1.4: text before B, B's first row's first cell is a pure number
        # 1.5: text after A (footer besides the page number): at most one line
        #      with few CJK chars
        connect_flag_list = []
        soup_list = []
        connect_rule_dict = {}
        for i, h in enumerate(html_list):
            soup = BeautifulSoup(h, 'lxml')
            soup_list.append(soup)
            # locate the last table of the page
            last_table_start, last_table_end = None, None
            match = re.finditer('<table', h)
            for m in match:
                last_table_start = m.span()[0]
            if last_table_start is not None:
                match = re.finditer('</table>', h[last_table_start:])
                for m in match:
                    last_table_end = m.span()[1] + last_table_start
            # rule entry carries the table html itself
            rule_a = [0, h[last_table_start:last_table_end]]
            # is there anything except a page number after the last table?
            connect_flag1 = False
            if last_table_end is not None:
                match = re.findall('[^-/第页0-9,,]', re.sub('<div>|</div>', '', h[last_table_end:]))
                if len(match) == 0:
                    connect_flag1 = True
            # a short footer is tolerated
            if not connect_flag1:
                if len(re.findall('<div>', h[last_table_end:])) <= 1 \
                        and len(re.findall('[\u4e00-\u9fff]', h[last_table_end:])) <= 60:
                    connect_flag1 = True
            # locate the first table of the page
            first_table_start, first_table_end = None, None
            match = re.finditer('<table', h)
            for m in match:
                first_table_start = m.span()[0]
                break
            if first_table_start is not None:
                match = re.finditer('</table>', h[first_table_start:])
                for m in match:
                    first_table_end = m.span()[1] + first_table_start
            # rule entry carries the table html itself
            rule_b = [0, h[first_table_start:first_table_end]]
            # is there anything before the first table?
            connect_flag2 = False
            if first_table_start is not None and first_table_start == 0:
                connect_flag2 = True
            # there is content before, but it may just be a page header
            if not connect_flag2:
                tables = soup.findAll('table')
                if tables:
                    first_table = tables[0]
                    rows = first_table.findAll('tr')
                    if rows:
                        first_row = rows[0]
                        col_text_len_list = [len(x.text) for x in first_row]
                        col_text_list = [x.text for x in first_row]
                        # <= 60 chars before the table, first cell empty,
                        # and at least one long cell in the first row
                        if not connect_flag2 and len(h[:first_table_start]) <= 60 and col_text_len_list[0] == 0 and max(
                                col_text_len_list) >= 30:
                            connect_flag2 = True
                            rule_b[0] = 1
                        # at least half of the cells empty and first cell empty
                        if not connect_flag2 and col_text_len_list.count(0) >= len(col_text_len_list) / 2 and \
                                col_text_len_list[0] == 0:
                            connect_flag2 = True
                        # nothing before the table and first cell is a pure number
                        if not connect_flag2 and len(col_text_list) > 0 and \
                                len(re.findall('<div>', h[:first_table_start])) <= 0 and \
                                len(re.findall('\d', col_text_list[0])) == len(col_text_list[0]):
                            connect_flag2 = True
            connect_flag_list.append([i, connect_flag2, connect_flag1])
            connect_rule_dict[i] = [rule_b, rule_a]
        if show:
            print('connect_flag_list', connect_flag_list)
            print('connect_rule_dict', connect_rule_dict)
        # group consecutive pages whose flags allow connecting (condition 1)
        connect_pages_list = []
        if connect_flag_list:
            temp_list = [connect_flag_list[0]]
            for i in range(1, len(connect_flag_list)):
                c = connect_flag_list[i]
                # this page connects upward and the previous page connects downward
                if c[1] and temp_list[-1][2]:
                    temp_list.append(c)
                else:
                    if temp_list:
                        connect_pages_list.append(temp_list)
                    temp_list = [c]
            if temp_list:
                connect_pages_list.append(temp_list)
        if show:
            print('connect_pages_list', connect_pages_list)
        # secondary condition: column counts must match within a group
        connect_pages_list2 = []
        for c_list in connect_pages_list:
            if len(c_list) == 1:
                connect_pages_list2.append(c_list)
            else:
                col_cnt_list = []
                # duplicated cells could be merged and counted as one column
                # (currently unused)
                merge_col_cnt_list = []
                for c in c_list:
                    soup = soup_list[c[0]]
                    table1 = soup.findAll('table')[-1]
                    table2 = soup.findAll('table')[0]
                    tr1 = table1.findAll('tr')
                    tr2 = table2.findAll('tr')
                    td1 = tr1[-1].findAll('td')
                    td2 = tr2[0].findAll('td')
                    # [first-table first-row cols, last-table last-row cols]
                    col_cnt_list.append([len(td2), len(td1)])
                # split the group wherever the column counts disagree
                new_c_list = [c_list[0]]
                for i in range(len(col_cnt_list) - 1):
                    if col_cnt_list[i][1] != col_cnt_list[i + 1][0]:
                        connect_pages_list2.append(new_c_list)
                        new_c_list = [c_list[i + 1]]
                    else:
                        new_c_list.append(c_list[i + 1])
                if new_c_list:
                    connect_pages_list2.append(new_c_list)
        if show:
            print('connect_pages_list2', connect_pages_list2)
        # fill empty leading cells of a continued table from the previous
        # table's last row
        for c_list in connect_pages_list2:
            for i in range(len(c_list) - 1):
                page_index1 = c_list[i][0]
                page_index2 = c_list[i + 1][0]
                html2 = html_list[page_index2]
                soup2 = soup_list[page_index2]
                rule1 = connect_rule_dict.get(page_index1)[1]
                rule2 = connect_rule_dict.get(page_index2)[0]
                table1 = BeautifulSoup(rule1[1], 'lxml').findAll('table')[0]
                table2 = BeautifulSoup(rule2[1], 'lxml').findAll('table')[0]
                add_td_value = []
                # collect the texts of the previous table's last row
                for tr in table1.findAll('tr')[::-1]:
                    temp_list = []
                    for td in tr.findAll('td'):
                        temp_list.append(td.get_text())
                    add_td_value = temp_list
                    break
                tr_index = 0
                for tr in table2.findAll('tr'):
                    # 0/1 mask of which cells hold text
                    temp_list = []
                    for td in tr.findAll('td'):
                        if len(td.get_text()) < 1:
                            temp_list.append(0)
                        else:
                            temp_list.append(1)
                    # same column count, leading cells empty: copy values down
                    if temp_list and add_td_value and len(temp_list) == len(add_td_value) \
                            and 1 in temp_list and temp_list[0] != 1 \
                            and 1 not in temp_list[:temp_list.index(1)]:
                        for j in range(len(temp_list)):
                            if temp_list[j] == 0:
                                tr.findAll('td')[j].string = add_td_value[j]
                    tr_index += 1
                soup2.findAll('table')[0].replace_with(table2)
                html_list[page_index2] = str(soup2)
        # splice the html of each group, marking every page seam with a
        # sentinel row so the partial rows can be merged afterwards
        new_html_list = []
        for c_list in connect_pages_list2:
            if len(c_list) == 1:
                new_html_list.append(html_list[c_list[0][0]])
                continue
            new_html = ''
            for c in c_list:
                match = re.finditer('</table>', new_html)
                last_table_index = None
                for m in match:
                    last_table_index = m.span()[0]
                new_html += html_list[c[0]]
                if last_table_index is None:
                    continue
                match = re.finditer('<table border="1">', new_html[last_table_index:])
                first_table_index = None
                for m in match:
                    first_table_index = last_table_index + m.span()[1]
                    break
                if first_table_index is None:
                    continue
                # non-greedy: replace the seam between the two tables with a
                # sentinel row
                new_html_sub = re.sub('</table>.*?<table border="1">',
                                      '<tr><td>#@#@#</td></tr>',
                                      new_html[last_table_index:first_table_index])
                new_html = new_html[:last_table_index] + new_html_sub + new_html[first_table_index:]
            soup = BeautifulSoup(new_html, 'lxml')
            trs = soup.findAll('tr')
            # pairs of [row-that-absorbed-text, row-to-remove]
            decompose_trs = []
            for i in range(len(trs)):
                if trs[i].get_text() == '#@#@#':
                    td1 = trs[i - 1].findAll('td')
                    td2 = trs[i + 1].findAll('td')
                    if td2[0].get_text() == '':
                        # a single row may span several consecutive pages; its
                        # text must then accumulate on the original ("father") row
                        find_father = False
                        for father, son in decompose_trs:
                            if father != '' and son == trs[i - 1]:
                                td_father = father.findAll('td')
                                for j in range(len(td_father)):
                                    td_father[j].string = td_father[j].get_text() + td2[j].get_text()
                                find_father = True
                                decompose_trs.append([father, trs[i + 1]])
                                break
                        if not find_father:
                            for j in range(len(td1)):
                                td1[j].string = td1[j].get_text() + td2[j].get_text()
                            decompose_trs.append([trs[i - 1], trs[i + 1]])
                    # the sentinel row itself is always removed
                    decompose_trs.append(['', trs[i]])
            for father, son in decompose_trs:
                for tr in trs:
                    if tr == son:
                        tr.decompose()
                        break
            new_html = str(soup)
            new_html_list.append(new_html)
        html_str = ''
        for h in new_html_list:
            html_str += h
        return [html_str]
  1441. def get_html(self):
  1442. if self._doc.error_code is not None:
  1443. return self._doc.error_code
  1444. self.convert()
  1445. if self._doc.error_code is not None:
  1446. return self._doc.error_code
  1447. html = self._doc.get_html(return_list=True)
  1448. if self._doc.error_code is not None:
  1449. return self._doc.error_code
  1450. # 表格连接
  1451. try:
  1452. html = self.connect_table(html)
  1453. except:
  1454. traceback.print_exc()
  1455. return [-12]
  1456. return html
  1457. @memory_decorator
  1458. def delete_water_mark(self, lt_text_list, page_bbox, times=5):
  1459. # 删除过多重复字句,为水印
  1460. duplicate_dict = {}
  1461. for _obj in lt_text_list:
  1462. t = _obj.get_text()
  1463. if t in duplicate_dict.keys():
  1464. duplicate_dict[t][0] += 1
  1465. duplicate_dict[t][1].append(_obj)
  1466. else:
  1467. duplicate_dict[t] = [1, [_obj]]
  1468. delete_text = []
  1469. for t in duplicate_dict.keys():
  1470. if duplicate_dict[t][0] >= times:
  1471. obj_list = duplicate_dict[t][1]
  1472. obj_list.sort(key=lambda x: x.bbox[3])
  1473. obj_distance_h = abs(obj_list[-1].bbox[3] - obj_list[0].bbox[1])
  1474. obj_list.sort(key=lambda x: x.bbox[2])
  1475. obj_distance_w = abs(obj_list[-1].bbox[2] - obj_list[0].bbox[0])
  1476. if obj_distance_h >= abs(page_bbox[1] - page_bbox[3]) * 0.7 \
  1477. and obj_distance_w >= abs(page_bbox[0] - page_bbox[2]) * 0.7:
  1478. delete_text.append(t)
  1479. # 排除字符少的
  1480. temp_list = []
  1481. for t in delete_text:
  1482. if len(t) <= 5:
  1483. continue
  1484. temp_list.append(t)
  1485. delete_text = temp_list
  1486. temp_text_list = []
  1487. for _obj in lt_text_list:
  1488. t = _obj.get_text()
  1489. if t not in delete_text:
  1490. temp_text_list.append(_obj)
  1491. return temp_text_list, delete_text
  1492. @memory_decorator
  1493. def delete_water_mark_by_location(self, lt_text_box_list):
  1494. x_text_box_dict = {}
  1495. # 水印,x坐标相同,且长度为1
  1496. for lt_text_box in lt_text_box_list:
  1497. x1, y1, x2, y2 = lt_text_box.bbox
  1498. text = lt_text_box.get_text()
  1499. if len(text) != 1:
  1500. continue
  1501. key = f'{x1}-{x2}-{text}'
  1502. if key in x_text_box_dict:
  1503. x_text_box_dict[key] += [lt_text_box]
  1504. else:
  1505. x_text_box_dict[key] = [lt_text_box]
  1506. len1 = len(lt_text_box_list)
  1507. for key, box_list in x_text_box_dict.items():
  1508. if len(box_list) >= 3:
  1509. for box in box_list:
  1510. if box in lt_text_box_list:
  1511. lt_text_box_list.remove(box)
  1512. len2 = len(lt_text_box_list)
  1513. if len1 != len2:
  1514. log('delete_water_mark_by_location box num ' + str(len1) + ' -> ' + str(len2))
  1515. return lt_text_box_list
  1516. def delete_water_mark_by_color(self, lt_text_list):
  1517. # 删除浅色字体,大概率为水印
  1518. # 1. 单个char颜色透明度0.8以上
  1519. # 2. 整个text_box中所有char颜色透明度0.4以上
  1520. water_mark_text_box_list = []
  1521. threshold1 = 0.8
  1522. threshold2 = 0.4
  1523. for lt_text_box in lt_text_list:
  1524. # print('lt_text_box11', lt_text_box.get_text())
  1525. for lt_text_line in lt_text_box:
  1526. water_mark_char_cnt = 0
  1527. lt_text_line_len = 0
  1528. for lt_char in lt_text_line:
  1529. lt_text_line_len += 1
  1530. color = lt_char.graphicstate.ncolor
  1531. print('color', lt_char.get_text(), color, lt_char.matrix)
  1532. find_flag = 0
  1533. if color is None:
  1534. continue
  1535. elif isinstance(color, (tuple, list)):
  1536. r, g, b = color
  1537. if r >= threshold2 and g >= threshold2 and b >= threshold2:
  1538. water_mark_char_cnt += 1
  1539. if r >= threshold1 and g >= threshold1 and b >= threshold1:
  1540. find_flag = 1
  1541. else:
  1542. if color >= threshold2:
  1543. water_mark_char_cnt += 1
  1544. if color >= threshold1:
  1545. find_flag = 1
  1546. if find_flag:
  1547. print('water mark char', lt_char.get_text(), color)
  1548. water_mark_text_box_list.append(lt_text_box)
  1549. lt_char._text = ''
  1550. if lt_text_line_len == water_mark_char_cnt:
  1551. for lt_char in lt_text_line:
  1552. lt_char._text = ''
  1553. water_mark_text_box_list.append(lt_text_box)
  1554. return lt_text_list
  1555. def delete_water_mark_by_rotate(self, lt_text_list):
  1556. water_mark_text_box_list = []
  1557. sin_range = [0.3, 0.94]
  1558. for lt_text_box in lt_text_list:
  1559. if '.......' in lt_text_box.get_text():
  1560. # print('....... lt_text_box continue')
  1561. continue
  1562. for lt_text_line in lt_text_box:
  1563. for lt_char in lt_text_line:
  1564. matrix = lt_char.matrix
  1565. # print('matrix', lt_char.get_text(), matrix)
  1566. if matrix is None:
  1567. continue
  1568. _, b, c, _, _, _ = matrix
  1569. if abs(b) == abs(c) and b != c \
  1570. and sin_range[0] <= abs(b) <= sin_range[1] \
  1571. and sin_range[0] <= abs(c) <= sin_range[1]:
  1572. # print('water mark char', lt_char.get_text(), matrix)
  1573. water_mark_text_box_list.append(lt_text_box)
  1574. lt_char._text = ''
  1575. return lt_text_list
  1576. def resize_image(self, img_path, max_size=2000):
  1577. _img = cv2.imread(img_path)
  1578. if _img.shape[0] <= max_size or _img.shape[1] <= max_size:
  1579. return
  1580. else:
  1581. resize_axis = 0 if _img.shape[0] >= _img.shape[1] else 1
  1582. ratio = max_size / _img.shape[resize_axis]
  1583. new_shape = [0, 0]
  1584. new_shape[resize_axis] = max_size
  1585. new_shape[1 - resize_axis] = int(_img.shape[1 - resize_axis] * ratio)
  1586. _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
  1587. cv2.imwrite(img_path, _img)
  1588. def get_single_pdf(self, path, page_no):
  1589. start_time = time.time()
  1590. try:
  1591. pdf_origin = copy.deepcopy(self.doc_pypdf2)
  1592. pdf_new = copy.deepcopy(self.doc_pypdf2_new)
  1593. pdf_new.addPage(pdf_origin.getPage(page_no))
  1594. path_new = path.split(".")[0] + "_split.pdf"
  1595. with open(path_new, "wb") as ff:
  1596. pdf_new.write(ff)
  1597. log("page_no: " + str(page_no) + " get_single_pdf cost: " + str(time.time() - start_time))
  1598. return path_new
  1599. except PyPDF2.utils.PdfReadError as e:
  1600. return [-3]
  1601. except Exception as e:
  1602. log("page_no: " + str(page_no) + " get_single_pdf error!")
  1603. return [-3]
def pymupdf_read_page_images(self, page_no):
    """Extract the embedded images of page ``page_no`` via PyMuPDF, save
    them to disk, and register each (image, md5) pair on the current page.

    :return: True on success, False on any exception.
    """
    try:
        self.init_package("PyMuPDF")
        # Load the requested page
        page = self.doc_pymupdf.load_page(page_no)
        # All images referenced by this page
        image_list = page.get_images(full=True)
        # Collected per-image info dicts
        extracted_images = []
        for img_index, img_info in enumerate(image_list):
            xref = img_info[0]  # xref number of the image object
            base_image = self.doc_pymupdf.extract_image(xref)
            image_bytes = base_image["image"]  # raw image bytes
            image_ext = base_image["ext"]  # image file extension
            # NOTE(review): get_images() tuples are
            # (xref, smask, width, height, bpc, ...) — the slots used
            # below are NOT page coordinates, so this "bbox" and the
            # width/height arithmetic look wrong; confirm against the
            # PyMuPDF docs (page.get_image_bbox) before relying on them.
            bbox = img_info[0:4]  # x0, y0, x1, y1
            # print('img_info', img_info)
            width = img_info[2] - img_info[0]  # supposed width
            height = img_info[3] - img_info[1]  # supposed height
            # Per-image info dict
            img_data = {
                "xref": xref,
                "width": width,
                "height": height,
                "image": image_bytes,
                "ext": image_ext,
                "bbox": bbox
            }
            extracted_images.append(img_data)
        image_obj_list = []
        for index, d in enumerate(extracted_images):
            temp_path = self.unique_type_dir + 'page' + str(page_no) \
                        + '_lt2_' + str(index) + '.jpg'
            image_bytes = d.get("image")
            bbox = d.get('bbox')
            # Persist the raw bytes; OCR reads them back from disk later.
            with open(temp_path, 'wb') as f:
                f.write(image_bytes)
            _image = _Image(image_bytes, temp_path, bbox)
            image_md5 = get_md5_from_bytes(image_bytes)
            image_obj_list.append([_image, image_md5])
    except:
        traceback.print_exc()
        return False
    # Only register the images once the whole page extracted cleanly.
    for _image, image_md5 in image_obj_list:
        self._page.add_child(_image)
        self.md5_image_obj_list.append([image_md5, _image])
    return True
  1652. def pymupdf_get_whole_page_image(self, page_no):
  1653. image_obj_list = []
  1654. page_image = self.get_page_image(page_no)
  1655. if judge_error_code(page_image):
  1656. self._page.error_code = page_image
  1657. return False
  1658. else:
  1659. _image = _Image(page_image[1], page_image[0])
  1660. _image.is_from_pdf = True
  1661. _image.is_reverse = False
  1662. image_md5 = get_md5_from_bytes(page_image[1])
  1663. image_obj_list.append([_image, image_md5])
  1664. for _image, image_md5 in image_obj_list:
  1665. self._page.add_child(_image)
  1666. self.md5_image_obj_list.append([image_md5, _image])
  1667. return True
  1668. def pdfminer_read_page_images(self, lt_image_list, page_no):
  1669. # 图表对象
  1670. image_obj_list = []
  1671. for image in lt_image_list:
  1672. try:
  1673. # print("pdf2text LTImage size", page_no, image.width, image.height)
  1674. # image_stream = image.stream.get_data()
  1675. # print('image.stream.get_filters()', image.stream.get_filters())
  1676. image_stream = image.stream.get_data()
  1677. # 小的图忽略
  1678. if image.width <= 300 and image.height <= 300:
  1679. continue
  1680. # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
  1681. img_test = Image.open(io.BytesIO(image_stream))
  1682. # img_test = self.pdfminer_stream_to_image(image)
  1683. # if image.height >= 1000 and image.width >= 1000:
  1684. # page_image = self.get_page_image(page_no)
  1685. # if judge_error_code(page_image):
  1686. # self._page.error_code = page_image
  1687. # else:
  1688. # _image = _Image(page_image[1], page_image[0])
  1689. # _image.is_from_pdf = True
  1690. # _image.is_reverse = False
  1691. # image_md5 = get_md5_from_bytes(page_image[1])
  1692. # image_obj_list.append([_image, image_md5])
  1693. # # 比较小的图则直接保存用ocr识别
  1694. # else:
  1695. temp_path = self.unique_type_dir + 'page' + str(page_no) \
  1696. + '_lt_' + str(lt_image_list.index(image)) + '.jpg'
  1697. img_test.save(temp_path)
  1698. with open(temp_path, "rb") as ff:
  1699. image_stream = ff.read()
  1700. _image = _Image(image_stream, temp_path, image.bbox)
  1701. self._page.add_child(_image)
  1702. image_md5 = get_md5_from_bytes(image_stream)
  1703. self.md5_image_obj_list.append([image_md5, _image])
  1704. except Exception:
  1705. log("page_no: " + str(page_no) + " pdfminer read image fail!")
  1706. traceback.print_exc()
  1707. return False
  1708. for _image, image_md5 in image_obj_list:
  1709. self._page.add_child(_image)
  1710. self.md5_image_obj_list.append([image_md5, _image])
  1711. return True
def get_text_font():
    """Debug helper: dump font info for every text span of the first page
    of a hard-coded test PDF, then detect underlined text lines on it.
    Not used by the conversion pipeline.
    """
    def flags_decomposer(flags):
        """Make font flags human readable."""
        l = []
        if flags & 2 ** 0:
            l.append("superscript")
        if flags & 2 ** 1:
            l.append("italic")
        if flags & 2 ** 2:
            l.append("serifed")
        else:
            l.append("sans")
        if flags & 2 ** 3:
            l.append("monospaced")
        else:
            l.append("proportional")
        if flags & 2 ** 4:
            l.append("bold")
        return ", ".join(l)

    def get_underlined_textLines(page):
        """
        Collect all underlined text on one pdf page.
        :param page: a fitz page
        :return: list of tuples, one per complete underlined span:
            [(underlined text, block_no, line_no), ...]
        """
        paths = page.get_drawings()  # get drawings on the current page
        # Collect every very flat bbox on the page — underlines are mostly
        # such thin rectangles (or true horizontal line segments).
        # subselect things we may regard as lines
        lines = []
        for p in paths:
            for item in p["items"]:
                if item[0] == "l":  # an actual line
                    p1, p2 = item[1:]
                    if p1.y == p2.y:
                        lines.append((p1, p2))
                elif item[0] == "re":  # a rectangle: check if height is small
                    r = item[1]
                    if r.width > r.height and r.height <= 2:
                        lines.append((r.tl, r.tr))  # take top left / right points
        # Compute the page's max line height, used below as the distance
        # threshold when matching words to underline segments.
        blocks = page.get_text("dict", flags=11)["blocks"]
        max_lineheight = 0
        for b in blocks:
            for l in b["lines"]:
                bbox = fitz.Rect(l["bbox"])
                if bbox.height > max_lineheight:
                    max_lineheight = bbox.height
        underlined_res = []
        # Now match words against the collected underline segments.
        # make a list of words
        words = page.get_text("words")
        # if underlined, the bottom left / right of a word
        # should not be too far away from left / right end of some line:
        for wdx, w in enumerate(words):  # w[4] is the actual word string
            r = fitz.Rect(w[:4])  # first 4 items are the word bbox
            for p1, p2 in lines:  # check distances for start / end points
                if abs(r.bl - p1) <= max_lineheight:  # word bottom-left near underline start
                    if abs(r.br - p2) <= max_lineheight:  # word bottom-right near underline end (single word, no spaces)
                        print(f"Word '{w[4]}' is underlined! Its block-line number is {w[-3], w[-2]}")
                        underlined_res.append((w[4], w[-3], w[-2]))  # (word, block_no, line_no)
                        break  # don't check more lines
                    else:  # keep scanning rightwards on the same line: one underline may cover several space-separated words
                        curr_line_num = w[-2]  # line nunmber
                        for right_wdx in range(wdx + 1, len(words), 1):
                            _next_w = words[right_wdx]
                            if _next_w[-2] != curr_line_num:  # next word already belongs to another line (no crossing lines)
                                break
                            _r_right = fitz.Rect(_next_w[:4])  # bbox of that right-hand word
                            if abs(_r_right.br - p2) <= max_lineheight:  # its bottom-right must be close to the underline's right end
                                print(
                                    f"Word '{' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]])}' is underlined! " +
                                    f"Its block-line number is {w[-3], w[-2]}")
                                underlined_res.append(
                                    (' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]]),
                                     w[-3], w[-2])
                                )  # (underlined words, block_no, line_no)
                                break  # don't check more lines
        return underlined_res

    # Hard-coded sample file used for manual debugging only.
    _p = r'C:\Users\Administrator\Desktop\test_pdf\error2-2.pdf'
    doc_pymupdf = read_pymupdf(_p)
    page = doc_pymupdf[0]
    blocks = page.get_text("dict", flags=11)["blocks"]
    for b in blocks:  # iterate through the text blocks
        for l in b["lines"]:  # iterate through the text lines
            for s in l["spans"]:  # iterate through the text spans
                print("")
                font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                    s["font"],  # font name
                    flags_decomposer(s["flags"]),  # readable font flags
                    s["size"],  # font size
                    s["color"],  # font color
                )
                print(s)
                print("Text: '%s'" % s["text"])  # simple print of text
                print(font_properties)
    get_underlined_textLines(page)
# The following is a ready-made single-page pdf parsing interface
  1809. class ParseSentence:
  1810. def __init__(self, bbox, fontname, fontsize, _text, _title, title_text, _pattern, title_degree, is_outline,
  1811. outline_location, page_no):
  1812. (x0, y0, x1, y1) = bbox
  1813. self.x0 = x0
  1814. self.y0 = y0
  1815. self.x1 = x1
  1816. self.y1 = y1
  1817. self.bbox = bbox
  1818. self.fontname = fontname
  1819. self.fontsize = fontsize
  1820. self.text = _text
  1821. self.title = _title
  1822. self.title_text = title_text
  1823. self.groups = _pattern
  1824. self.title_degree = title_degree
  1825. self.is_outline = is_outline
  1826. self.outline_location = outline_location
  1827. self.page_no = page_no
  1828. def __repr__(self):
  1829. return "%s,%s,%s,%d,%s" % (self.text, self.title, self.is_outline, self.outline_location, str(self.bbox))
class ParseUtils:
    """Static helpers for the single-page parsing interface: font
    extraction, line clustering and title/serial-number recognition."""

    @staticmethod
    def getFontinfo(_page):
        # Copy the font name/size of the first LTChar found onto each
        # horizontal/vertical text box of the page.
        for _obj in _page._objs:
            if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                for textline in _obj._objs:
                    done = False
                    for lchar in textline._objs:
                        if isinstance(lchar, (LTChar)):
                            _obj.fontname = lchar.fontname
                            _obj.fontsize = lchar.size
                            done = True
                            break
                    if done:
                        break

    @staticmethod
    def recognize_sentences(list_textbox, filter_objs, page_bbox, page_no,
                            remove_space=True, sourceP_LB=True):
        """Cluster text boxes into visual lines and turn each line into a
        ParseSentence with title/outline metadata.

        :param list_textbox: pdfminer text boxes of one page
        :param filter_objs: boxes to skip entirely
        :param page_bbox: page bounding box (for the centering heuristic)
        :param page_no: page number stored on every sentence
        :param remove_space: currently unused in this implementation
        :param sourceP_LB: True when the page origin is bottom-left
        :return: list of ParseSentence
        """
        list_textbox.sort(key=lambda x: x.bbox[0])
        list_textbox.sort(key=lambda x: x.bbox[3], reverse=sourceP_LB)
        cluster_textbox = []
        # Boxes whose bottom y coordinates differ by < 5pt form one line.
        for _textbox in list_textbox:
            if _textbox in filter_objs:
                continue
            _find = False
            for _ct in cluster_textbox:
                if abs(_ct["y"] - _textbox.bbox[1]) < 5:
                    _find = True
                    _ct["textbox"].append(_textbox)
            if not _find:
                cluster_textbox.append({"y": _textbox.bbox[1], "textbox": [_textbox]})
        cluster_textbox.sort(key=lambda x: x["y"], reverse=sourceP_LB)
        list_sentences = []
        for _line in cluster_textbox:
            _textboxs = _line["textbox"]
            _textboxs.sort(key=lambda x: x.bbox[0])
            _linetext = _textboxs[0].get_text()
            for _i in range(1, len(_textboxs)):
                # A wide horizontal gap gets a "=,=" placeholder that later
                # becomes a comma (or is dropped inside titles).
                if abs(_textboxs[_i].bbox[0] - _textboxs[_i - 1].bbox[2]) > 60:
                    if _linetext and _linetext[-1] not in (",", ",", "。", ".", "、", ";"):
                        _linetext += "=,="
                _linetext += _textboxs[_i].get_text()
            _linetext = re.sub("[\s\r\n]", "", _linetext)
            _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
                     _textboxs[-1].bbox[2], _textboxs[-1].bbox[3])
            _title = None
            _pattern_groups = None
            title_text = ""
            # Title detection: first box's own text, then the whole line,
            # then the "centered text" heuristic.
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_linetext)
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _title = ParseUtils.rec_incenter(_bbox, page_bbox)
            title_degree = 2
            if not _title:
                _linetext = _linetext.replace("=,=", ",")
            else:
                _linetext = _linetext.replace("=,=", "")
                title_degree = int(_title.split("_")[1])
            # Page number: a centered, digits-only line is dropped.
            if ParseUtils.rec_incenter(_bbox, page_bbox) and re.search("^\d+$", _linetext) is not None:
                continue
            if _linetext == "" or re.search("^,+$", _linetext) is not None:
                continue
            # Outline (TOC) entries look like "some title.....123".
            is_outline = False
            outline_location = -1
            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$", _linetext)
            if _search is not None:
                is_outline = True
                _linetext = _search.group("text")
                outline_location = int(_search.group("nums"))
            list_sentences.append(
                ParseSentence(_bbox, _textboxs[-1].__dict__.get("fontname"), _textboxs[-1].__dict__.get("fontsize"),
                              _linetext, _title, title_text, _pattern_groups, title_degree, is_outline,
                              outline_location, page_no))
        # for _sen in list_sentences:
        #     print(_sen.__dict__)
        return list_sentences

    @staticmethod
    def find_title_by_pattern(_text,
                              _pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
                                       "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
                                       "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
                                       "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\..、\s\-]))|" \
                                       "(?P<title_15>^(?P<title_15_index_0_0>(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
                                       "(?P<title_17>^(?P<title_17_index_0_0>(?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>)))|"
                                       "(?P<title_19>^(?P<title_19_index_0_0>(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))|" \
                              ):
        """Return [(group_name, matched_text), ...] sorted by group name for
        the first title pattern matching ``_text``, or None.

        NOTE(review): the default pattern contains ``(?)`` in the
        title_15/17/19 branches — the ``re`` module rejects that syntax —
        and it ends with a trailing ``|`` (an empty alternative that
        matches anything). Confirm whether these branches were ever
        intended to work.
        """
        _se = re.search(_pattern, _text)
        groups = []
        if _se is not None:
            _gd = _se.groupdict()
            for k, v in _gd.items():
                if v is not None:
                    groups.append((k, v))
        if len(groups):
            groups.sort(key=lambda x: x[0])
            return groups
        return None

    @staticmethod
    def rec_incenter(o_bbox, p_bbox):
        # "title_2" when the box is horizontally centered with wide,
        # roughly symmetric margins; implicitly None otherwise.
        p_width = p_bbox[2] - p_bbox[0]
        l_space = (o_bbox[0] - p_bbox[0]) / p_width
        r_space = (p_bbox[2] - o_bbox[2]) / p_width
        if abs((l_space - r_space)) < 0.1 and l_space > 0.2:
            return "title_2"

    @staticmethod
    def is_first_title(_title):
        # True when the serial is the first of its numbering scheme
        # (1 / 一 / a / A / Ⅰ).
        if _title is None:
            return False
        if re.search("^\d+$", _title) is not None:
            if int(_title) == 1:
                return True
            return False
        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
            if _title == "一":
                return True
            return False
        if re.search("^[a-z]+$", _title) is not None:
            if _title == "a":
                return True
            return False
        if re.search("^[A-Z]+$", _title) is not None:
            if _title == "A":
                return True
            return False
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
            if _title == "Ⅰ":
                return True
            return False
        return False

    @staticmethod
    def get_next_title(_title):
        """Return the successor of a serial title (e.g. "2"→"3", "一"→"二",
        "b"→"c", "Ⅰ"→"Ⅱ"), or None when no successor exists."""
        if re.search("^\d+$", _title) is not None:
            return str(int(_title) + 1)
        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
            # Increment the Chinese numeral digit-wise, then re-insert the
            # positional characters 十/百.
            _next_title = ParseUtils.make_increase(['一', '二', '三', '四', '五', '六', '七', '八', '九', '十'],
                                                   re.sub("[十百]", '', _title))
            _next_title = list(_next_title)
            _next_title.reverse()
            if _next_title[-1] != "十":
                if len(_next_title) >= 2:
                    _next_title.insert(-1, '十')
                if len(_next_title) >= 4:
                    _next_title.insert(-3, '百')
            if _title[0] == "十":
                if _next_title == "十":
                    _next_title = ["二", "十"]
                _next_title.insert(0, "十")
            _next_title = "".join(_next_title)
            return _next_title
        if re.search("^[a-z]+$", _title) is not None:
            _next_title = ParseUtils.make_increase([chr(i + ord('a')) for i in range(26)], _title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[A-Z]+$", _title) is not None:
            _next_title = ParseUtils.make_increase([chr(i + ord('A')) for i in range(26)], _title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
            _sort = ["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ"]
            _index = _sort.index(_title)
            if _index < len(_sort) - 1:
                return _sort[_index + 1]
        return None

    @staticmethod
    def make_increase(_sort, _title, _add=1):
        """Add ``_add`` to ``_title`` interpreted as a right-to-left number
        over the digit alphabet ``_sort``; returns the REVERSED result
        (callers reverse it back)."""
        if len(_title) == 0 and _add == 0:
            return ""
        if len(_title) == 0 and _add == 1:
            return _sort[0]
        _index = _sort.index(_title[-1])
        next_index = (_index + _add) % len(_sort)
        next_chr = _sort[next_index]
        # Carry into the next position when the last digit wrapped around.
        if _index == len(_sort) - 1:
            _add = 1
        else:
            _add = 0
        return next_chr + ParseUtils.make_increase(_sort, _title[:-1], _add)

    @staticmethod
    def rec_serial(_text, o_bbox, p_bbox, fontname, _pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
                                                             "(?P<title_2>^\d+[\.、\s])|" \
                                                             "(?P<title_3>^\d+\.\d+[\.、\s])|" \
                                                             "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
                                                             "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
        # todo :recog the serial of the sentence
        # Returns the name of the first matching serial group, or None.
        _se = re.search(_pattern, _text)
        if _se is not None:
            _gd = _se.groupdict()
            for k, v in _gd.items():
                if v is not None:
                    return k
        return None
  2037. if __name__ == '__main__':
  2038. _pp = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-116.pdf'
  2039. # _pp = r'C:\Users\Administrator\Downloads\1746582280828.pdf'
  2040. _html = PDFConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp", None).get_html()
  2041. with open('../result.html', 'w', encoding='utf-8') as f:
  2042. f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])