# convert_pdf.py

import copy
import io
import os
import re
import sys
from bs4 import BeautifulSoup
sys.path.append(os.path.dirname(__file__) + "/../")
from pdfplumber import PDF
from pdfplumber.table import TableFinder
from pdfplumber.page import Page as pdfPage
from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table, TextBox
import time
from PIL import Image
import traceback
import cv2
import PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
    LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, get_traditional_chinese
import fitz
from format_convert.wrapt_timeout_decorator import timeout
from otr.table_line_pdf import table_line_pdf
# import jieba


@memory_decorator
def pdf2text(path, unique_type_dir):
    return


@timeout(10, timeout_exception=TimeoutError)
def pdf_analyze(interpreter, page, device, page_no):
    pdf_time = time.time()
    interpreter.process_page(page)
    layout = device.get_result()
    log("page_no: " + str(page_no) + " pdf_analyze cost: " + str(time.time() - pdf_time))
    return layout


@timeout(25, timeout_exception=TimeoutError)
def read_pdfminer(path, laparams):
    fp = open(path, 'rb')
    parser = PDFParser(fp)
    doc_pdfminer = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc_pdfminer, device, interpreter


@timeout(15, timeout_exception=TimeoutError)
def read_pymupdf(path):
    return fitz.open(path)


@timeout(15, timeout_exception=TimeoutError)
def read_pypdf2(path):
    doc_pypdf2 = PdfFileReader(path, strict=False)
    doc_pypdf2_new = PdfFileWriter()
    return doc_pypdf2, doc_pypdf2_new


@timeout(25, timeout_exception=TimeoutError, use_signals=False)
def read_pdfplumber(path, laparams):
    fp = open(path, 'rb')
    lt = LineTable()
    doc_top = 0
    doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
    return lt, doc_top, doc_pdfplumber

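
# The readers above are wrapped in the project's timeout decorator, so a stuck
# read raises TimeoutError instead of hanging the service. A minimal calling
# sketch (hypothetical path; mirrors how the class below falls back to rendering
# pages as images whenever a reader fails):
#
#   try:
#       doc, device, interpreter = read_pdfminer('/tmp/sample.pdf', LAParams())
#   except TimeoutError:
#       pass  # render pages with pymupdf and ocr them instead
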

class PDFConvert:
    def __init__(self, path, unique_type_dir, need_page_no):
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        if not os.path.exists(self.unique_type_dir):
            os.mkdir(self.unique_type_dir)

        # The page range to extract
        self.need_page_no = need_page_no
        self.start_page_no = None
        self.end_page_no = None
        # By default limit_page_cnt controls extraction: the first 10 and last 10 pages
        if self.need_page_no is None:
            self.limit_page_cnt = 20
        else:
            # Control the range with start_page_no,end_page_no, e.g. "2,5"
            ss = self.need_page_no.split(',')
            if len(ss) != 2:
                self._doc.error_code = [-14]
            else:
                self.start_page_no = int(ss[0])
                self.end_page_no = int(ss[-1])
                if self.end_page_no == -1:
                    self.end_page_no = 1000000
                self.start_page_no -= 1
                self.end_page_no -= 1
                if self.end_page_no <= self.start_page_no or self.start_page_no < 0 or self.end_page_no < -1:
                    self._doc.error_code = [-14]

        self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
        self.has_init_pdf = [0] * len(self.packages)

        # md5 of every image object, used to drop images that repeat many times
        self.md5_image_obj_list = []
        # Whether each page is text-only
        self.only_text_list = []
        # Whether to additionally extract specific pages
        self.convert_specific_page = 1

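    # A minimal sketch of the page-range semantics implemented above (hypothetical
    # paths; both bounds are given 1-based and converted to a 0-based, end-exclusive
    # range, so "2,5" keeps 0-based pages 1..3, i.e. printed pages 2 to 4):
    #
    #   convertor = PDFConvert('/tmp/sample.pdf', '/tmp/sample_dir/', '2,5')
    #   convertor.start_page_no, convertor.end_page_no  # -> (1, 4)
    #
    # Passing need_page_no=None falls back to limit_page_cnt (first/last 10 pages).
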
    @memory_decorator
    def init_package(self, package_name):
        # Initialize each underlying pdf package
        try:
            laparams = LAParams(line_overlap=0.01,
                                char_margin=0.3,
                                line_margin=0.01,
                                word_margin=0.01,
                                boxes_flow=0.1, )
            if package_name == self.packages[0]:
                self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
                self.has_init_pdf[0] = 1
            elif package_name == self.packages[1]:
                self.doc_pymupdf = read_pymupdf(self.path)
                self.has_init_pdf[1] = 1
            elif package_name == self.packages[2]:
                self.doc_pypdf2, self.doc_pypdf2_new = read_pypdf2(self.path)
                self.has_init_pdf[2] = 1
            elif package_name == self.packages[3]:
                self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
                self.has_init_pdf[3] = 1
            else:
                log("Only Support Packages " + str(self.packages))
                raise Exception
        except Exception as e:
            log(package_name + " cannot open pdf!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    @memory_decorator
    def convert(self, limit_page_cnt=20):
        if self.has_init_pdf[0] == 0:
            self.init_package("pdfminer")
        if self._doc.error_code is not None:
            self._doc.error_code = None
            # pdfminer cannot read it; render the pages as images and recognize those
            self.get_all_page_image()
            return

        # Probe the first page to check whether the pdf is readable at all
        try:
            pages = PDFPage.create_pages(self.doc_pdfminer)
            for page in pages:
                break
            pages = list(pages)
        # except pdfminer.psparser.PSEOF as e:
        except:
            # pdfminer cannot read blank-page objects; ocr the images rendered by pymupdf instead
            log("pdfminer read failed! read by pymupdf!")
            traceback.print_exc()
            try:
                self.get_all_page_image()
                return
            except:
                traceback.print_exc()
                log("use pymupdf read failed!")
                self._doc.error_code = [-3]
                return

        # Process page by page
        pages = PDFPage.create_pages(self.doc_pdfminer)
        pages = list(pages)
        page_count = len(pages)
        self.only_text_list = [-1] * len(pages)
        page_no = 0
        for page in pages:
            # An explicit pdf page range was given
            if self.start_page_no is not None and self.end_page_no is not None:
                if page_count < self.end_page_no:
                    self.end_page_no = page_count
                if page_no < self.start_page_no or page_no >= self.end_page_no:
                    page_no += 1
                    continue
            # Otherwise limit the page count: only the first and last limit_page_cnt/2 pages
            else:
                if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
                    page_no += 1
                    continue

            # Parse a single page
            start_time = time.time()
            self._page = _Page(page, page_no)
            self.convert_page(page, page_no)
            log('convert_page page_no: ' + str(page_no) + ' cost: ' + str(time.time() - start_time))
            if self._doc.error_code is None and self._page.error_code is not None:
                if self._page.error_code[0] in [-4, -3, 0]:
                    page_no += 1
                    continue
                else:
                    self._doc.error_code = self._page.error_code
                    break
            self._doc.add_child(self._page)
            page_no += 1

        self._doc.children, delete_header_footer_list = self.delete_header_footer(self._doc.children)

        if self.convert_specific_page and self.need_page_no is None:
            # Additionally extract specific pages from the skipped middle section
            # print('self.only_text_list', self.only_text_list)
            if self.only_text_list.count(0) == 0:
                ratio = 0
            else:
                ratio = self.only_text_list.count(0) / (page_count - self.only_text_list.count(-1))
            # print('ratio', ratio)
            if page_count > limit_page_cnt and ratio <= 0.2:
                page_no = 0
                find_flag = 0
                add_page_list = []
                for page in pages:
                    if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
                        page_no += 1
                        continue
                    # Parse a single page
                    start_time = time.time()
                    self._page = _Page(page, page_no)
                    self.convert_page(page, page_no, skip_image=1)
                    log('convert_page add page_no: ' + str(page_no) + ' cost: ' + str(time.time() - start_time))
                    # Delete headers and footers
                    filtered_pages, _ = self.delete_header_footer([self._page], delete_header_footer_list)
                    self._page = filtered_pages[0]

                    # Extract the special section; the pattern '采购清单' means "procurement list"
                    re_str = '采购清单'
                    # Coordinates are upside down; put them back upright
                    self._page.children.sort(key=lambda x: x.y, reverse=True)
                    # print('find_flag', find_flag, type(self._page.children[-1]))
                    if find_flag and type(self._page.children[0]) == _Table:
                        add_page_list.append(self._page)
                        if len(self._page.children) - 1 > 3:
                            find_flag = 0
                    for index in range(len(self._page.children)):
                        obj = self._page.children[index]
                        next_obj = None
                        if index + 1 < len(self._page.children):
                            next_obj = self._page.children[index + 1]
                        # print('采购清单', type(obj) == _Sentence, re.search(re_str, str(obj.content)), str(obj.content)[:20])
                        if type(obj) == _Sentence and re.search(re_str, obj.content) \
                                and next_obj and type(next_obj) == _Table:
                            add_page_list.append(self._page)
                            # print('add_page_list', page_no)
                            if len(self._page.children) - index - 1 > 3:
                                find_flag = 0
                            else:
                                find_flag = 1
                    page_no += 1
                # print('add_page_list', add_page_list)
                if add_page_list:
                    self._doc.children = self._doc.children[:int(limit_page_cnt / 2)] + add_page_list + self._doc.children[int(limit_page_cnt / 2):]

        self.delete_same_image()
        # self.delete_bold_text_duplicate()

    def delete_same_image(self, show=0):
        # Drop images that repeat many times
        md5_dict = {}
        for _md5, image_obj in self.md5_image_obj_list:
            if _md5 in md5_dict.keys():
                md5_dict[_md5] += [image_obj]
            else:
                md5_dict[_md5] = [image_obj]

        cnt_threshold = 10
        delete_obj_list = []
        for _md5 in md5_dict.keys():
            img_list = md5_dict.get(_md5)
            # print('len(md5_dict.get(_md5))', _md5, len(img_list))
            if len(img_list) >= cnt_threshold:
                if show:
                    img_np = bytes2np(img_list[0].content)
                    cv2.namedWindow('delete same img_np', cv2.WINDOW_NORMAL)
                    cv2.imshow('delete same img_np', img_np)
                    cv2.waitKey(0)
                delete_obj_list += img_list

        for page in self._doc.children:
            for obj in delete_obj_list:
                if obj in page.children:
                    page.children.remove(obj)

        if show:
            for page in self._doc.children:
                for obj in page.children:
                    if isinstance(obj, _Image):
                        img_np = bytes2np(obj.content)
                        cv2.imshow('page img_np', img_np)
                        cv2.waitKey(0)

    def delete_header_footer(self, pages, delete_list=None):
        # A sentence repeated at the same height on enough pages is a header/footer
        if delete_list is None:
            delete_list = []
        sen_dict = {}
        for page in pages:
            for obj in page.children:
                if isinstance(obj, _Sentence):
                    key = str(obj.content) + ' ' + str(int(obj.y))
                    # print('key', key)
                    if key in sen_dict.keys():
                        sen_dict[key] += [obj]
                    else:
                        sen_dict[key] = [obj]

        # Force-delete the keys passed in by boosting their count
        # print('delete_list', delete_list)
        for key in delete_list:
            if key in sen_dict:
                sen_dict[key] = sen_dict.get(key) * 10
        # print('sen_dict', sen_dict)

        delete_footer_header_list = []
        for key in sen_dict.keys():
            l = sen_dict.get(key)
            if len(l) >= 1 / 3 * max(10, len(pages)):
                delete_footer_header_list.append(key)
                for page in pages:
                    new_children = []
                    for obj in page.children:
                        if isinstance(obj, _Sentence):
                            if obj not in l:
                                new_children.append(obj)
                        else:
                            new_children.append(obj)
                    page.children = new_children
                # print('len(l)', len(l), len(pages))
                # print('delete_header_footer l[0]', l[0].content, l[0].y)
        return pages, delete_footer_header_list

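    # A minimal sketch of the frequency heuristic above (hypothetical values):
    # a sentence is keyed by its text plus its integer y coordinate, so
    # '第1页 802' and '第2页 802' are *different* keys and survive, while a fixed
    # header like 'XX公司招标文件 802' recurring on at least max(10, len(pages)) / 3
    # pages is collected into delete_footer_header_list and removed everywhere.
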
    def delete_bold_text_duplicate(self, lt_text_box_list):
        # Collect every LTChar
        lt_char_list = []
        for lt_text_box in lt_text_box_list:
            for lt_text_line in lt_text_box:
                for lt_char in lt_text_line:
                    if isinstance(lt_char, LTChar):
                        lt_char_list.append(lt_char)

        # Find the chars to drop: bold text is often emitted twice at nearly the same bbox
        lt_char_list.sort(key=lambda x: (int(x.bbox[1]), x.bbox[0]))
        delete_list = []
        for i in range(len(lt_char_list)):
            lt_char1 = lt_char_list[i]
            bbox1 = lt_char1.bbox
            # lt_char2 = lt_char_list[i+1]
            # bbox2 = lt_char2.bbox
            if lt_char1 in delete_list:
                continue
            # if lt_char2 in delete_list:
            #     continue
            # if lt_char1.get_text() == lt_char2.get_text() and bbox1[0] <= bbox2[0] <= bbox1[2] <= bbox2[2] \
            #         and int(bbox1[1]) == int(bbox2[1]) and int(bbox1[3]) == int(bbox2[3]) \
            #         and re.search('[\u4e00-\u9fff():、,。]', lt_char1.get_text()):
            for j in range(i + 1, len(lt_char_list)):
                lt_char2 = lt_char_list[j]
                bbox2 = lt_char2.bbox
                if lt_char2 in delete_list:
                    continue
                if lt_char1.get_text() == lt_char2.get_text() and bbox_iou(bbox1, bbox2) >= 0.3 \
                        and re.search('[\u4e00-\u9fff():、,。]', lt_char1.get_text()):
                    delete_list.append(lt_char2)

        # Reassemble the text boxes without the duplicates
        new_lt_text_box_list = []
        for lt_text_box in lt_text_box_list:
            new_lt_text_box = LTTextBoxHorizontal()
            for lt_text_line in lt_text_box:
                new_lt_text_line = LTTextLine(0.01)
                for lt_char in lt_text_line:
                    if lt_char in delete_list:
                        continue
                    if isinstance(lt_char, LTChar):
                        new_lt_text_line.add(lt_char)
                new_lt_text_box.add(new_lt_text_line)
            new_lt_text_box_list.append(new_lt_text_box)
        return new_lt_text_box_list

    def clean_text(self, _text):
        return re.sub("\s", "", _text)

    def get_text_lines(self, page, page_no):
        lt_line_list = []
        page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
        self.doc_top += page_plumber.height
        table_finder = TableFinder(page_plumber)
        all_width_zero = True
        for _edge in table_finder.get_edges():
            if _edge.get('linewidth') and _edge.get('linewidth') > 0:
                all_width_zero = False
                break
        for _edge in table_finder.get_edges():
            # print(_edge)
            if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
                lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
                                           (float(_edge["x1"]), float(_edge["y1"]))))
        log("pdf page_no %s has %s lines" % (str(page_no), str(len(lt_line_list))))
        return lt_line_list

    @memory_decorator
    def get_page_lines(self, layout, page_no, show=0):
        lt_line_list = table_line_pdf(layout, page_no, show)
        return lt_line_list

    @memory_decorator
    def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
        list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
                                                                                    from_pdf=True, is_reverse=False)
        self._page.in_table_objs = filter_objs
        # print("=======text_len:%d:filter_len:%d" % (len(lt_text_list), len(filter_objs)))
        for table in list_tables:
            _table = _Table(table["table"], table["bbox"])
            # self._page.children.append(_table)
            self._page.add_child(_table)
        list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
                                                        layout.bbox, page_no)
        for sentence in list_sentences:
            # print('sentence.text', sentence.text)
            _sen = _Sentence(sentence.text, sentence.bbox)
            self._page.add_child(_sen)
        # pdf objects would need reverse ordering
        # self._page.is_reverse = True
        return list_tables

    def is_text_legal(self, lt_text_list, page_no):
        # The pdf character encoding cannot be recognized; ocr the whole page instead
        text_temp = ""
        for _t in lt_text_list:
            text_temp += _t.get_text()
        if re.search('[(]cid:[0-9]+[)]', text_temp):
            log("page_no: " + str(page_no) + " text has cid! try pymupdf...")
            page_image = self.get_page_image(page_no)
            if judge_error_code(page_image):
                self._page.error_code = page_image
            else:
                _image = _Image(page_image[1], page_image[0])
                self._page.add_child(_image)
            return False

        match1 = re.findall(get_garble_code(), text_temp)
        # match2 = re.search('[\u4e00-\u9fa5]', text_temp)
        if len(match1) > 8 and len(text_temp) > 10:
            log("page_no: " + str(page_no) + " garbled code! try pymupdf... " + text_temp[:20])
            page_image = self.get_page_image(page_no)
            if judge_error_code(page_image):
                self._page.error_code = page_image
            else:
                _image = _Image(page_image[1], page_image[0])
                self._page.add_child(_image)
            return False
        return True

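    # Note on the cid check above: pdfminer emits '(cid:NNN)' for glyphs it
    # cannot map back to unicode, so the presence of such tokens means the
    # embedded text is unusable and the page is routed to ocr on a pymupdf render.
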
    def judge_b_table(self, lt_text_list, table_list, page_no):
        table_h_list = []
        for table in table_list:
            table_h_list.append([table.get('bbox')[1], table.get('bbox')[3]])

        # First group the text boxes into rows
        lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
        lt_text_row_list = []
        current_h = lt_text_list[0].bbox[1]
        row = []
        threshold = 2
        for lt_text in lt_text_list:
            bbox = lt_text.bbox
            if current_h - threshold <= bbox[1] <= current_h + threshold:
                row.append(lt_text)
            else:
                if row:
                    lt_text_row_list.append(row)
                row = [lt_text]
                current_h = lt_text.bbox[1]
        if row:
            lt_text_row_list.append(row)

        # A row looks tabular if its text has a wide gap in the middle,
        # or the row is made up of several separate text boxes
        is_b_table_cnt = 3
        tolerate_cnt = 2
        t_cnt = 0
        row_cnt = 0
        b_table_row_list = []
        all_b_table = []
        for row in lt_text_row_list:
            # Skip watermark rows
            if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
                continue
            # Skip table-of-contents rows
            continue_flag = False
            for r in row:
                if re.search('[.·]{7,}', r.get_text()):
                    continue_flag = True
                    break
            if continue_flag:
                continue
            if len(row) == 1:
                text = row[0].get_text()
                bbox = row[0].bbox
                match = re.search('[ ]{3,}', text)
                if match and re.search('[\u4e00-\u9fff]{2,}', text[:match.span()[0]]) \
                        and re.search('[\u4e00-\u9fff]{2,}', text[match.span()[1]:]):
                    row_cnt += 1
                    t_cnt = 0
                    b_table_row_list += row
                else:
                    # Tolerate a few non-matching rows before closing the candidate
                    if t_cnt < tolerate_cnt:
                        t_cnt += 1
                        continue
                    if b_table_row_list and row_cnt >= is_b_table_cnt:
                        all_b_table.append(b_table_row_list)
                    row_cnt = 0
                    b_table_row_list = []
            else:
                row_cnt += 1
                t_cnt = 0
                b_table_row_list += row
        if b_table_row_list and row_cnt >= is_b_table_cnt:
            all_b_table.append(b_table_row_list)

        # For each candidate borderless table, check whether it intersects a bordered table
        is_b_table_flag = False
        for b_table in all_b_table:
            # Is it inside the vertical range of any bordered table?
            in_flag = False
            for table_h in table_h_list:
                for b in b_table:
                    if min(table_h) <= b.bbox[1] <= max(table_h) or min(table_h) <= b.bbox[3] <= max(table_h):
                        in_flag = True
                        break
                if in_flag:
                    break
            if in_flag:
                is_b_table_flag = False
            else:
                is_b_table_flag = True
                # print('is_b_table_flag True ', [[x.get_text(), x.bbox] for x in b_table])
                # print('table_h_list', table_h_list)
                break
        log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
        return is_b_table_flag

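    # A minimal sketch of the gap heuristic above (hypothetical text): a single
    # text box reading '标段名称   综合楼工程' matches '[ ]{3,}' with CJK text on
    # both sides of the gap, so it counts as a borderless-table row; three or more
    # such consecutive rows (allowing tolerate_cnt misses) that lie outside every
    # bordered table's y-range make judge_b_table return True.
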
    @memory_decorator
    def convert_page(self, page, page_no, skip_image=0):
        layout = self.get_layout(page, page_no)
        if self._doc.error_code is not None:
            return
        if judge_error_code(layout):
            self._page.error_code = layout
            return

        # Flip the y coordinate of every object on the page
        max_y, min_y = 0, 10000
        for x in layout:
            min_y = min(min_y, x.y0, x.y1)
            max_y = max(max_y, x.y0, x.y1)
        if max_y == 0:
            return
        for x in layout:
            # bbox of the outer object
            x.set_bbox((x.x0, round(max_y - max(x.y0, x.y1), 1), x.x1, round(max_y - min(x.y0, x.y1), 1)))
            # bbox of every inner character
            if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
                for lt_text_line in x:
                    for lt_char in lt_text_line:
                        if isinstance(lt_char, LTChar):
                            lt_char.set_bbox((lt_char.x0, round(max_y - max(lt_char.y0, lt_char.y1), 1),
                                              lt_char.x1, round(max_y - min(lt_char.y0, lt_char.y1), 1)))

        # Sort the page's objects by type and store them
        lt_text_list = []
        lt_image_list = []
        for x in layout:
            if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
                lt_text_list.append(x)
            if isinstance(x, LTFigure):
                for y in x:
                    if isinstance(y, LTImage):
                        # Ignore small images
                        if y.width <= 300 and y.height <= 300:
                            continue
                        # An image wider than the layout is very likely a watermark
                        if y.width > layout.width + 20:
                            continue
                        lt_image_list.append(y)

        # Record whether the page is text-only
        if len(lt_image_list) == 0 and len(lt_text_list) == 0:
            self.only_text_list[page_no] = 0
        elif len(lt_image_list) == 0:
            self.only_text_list[page_no] = 1
        else:
            self.only_text_list[page_no] = 0

        # Skip images on demand
        if skip_image:
            lt_image_list = []

        # If the extracted text is garbled but the page has images, recognize the images directly
        all_text = ''.join([x.get_text() for x in lt_text_list])
        all_text = re.sub('[\s\d]', '', all_text)
        if len(re.findall(get_garble_code2(), all_text)) >= 3 and len(lt_image_list) >= 1:
            log('embedded text is garbled 1: ' + str(all_text[:10]))
            lt_text_list = []
        # print('11111', re.findall(get_traditional_chinese(), all_text))
        if 3 <= len(re.findall(get_traditional_chinese(), all_text)) <= len(all_text) / 2 and len(lt_image_list) >= 1:
            log('embedded text is garbled 2: ' + str(all_text[:10]))
            lt_text_list = []

        # Remove duplicated bold characters
        lt_text_list = self.delete_bold_text_duplicate(lt_text_list)
        # Remove watermark characters
        lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
        log("page_no: " + str(page_no) + " len(lt_image_list), len(lt_text_list) "
            + str(len(lt_image_list)) + " " + str(len(lt_text_list)))

        # If the page has too many images or no text, ocr the whole page instead
        if len(lt_image_list) > 4 or len(lt_text_list) == 0:
            page_image = self.get_page_image(page_no)
            if judge_error_code(page_image):
                self._page.error_code = page_image
            else:
                _image = _Image(page_image[1], page_image[0])
                _image.is_from_pdf = True
                _image.is_reverse = False
                self._page.add_child(_image)
        # Otherwise read the page's objects normally
        else:
            # Image objects
            for image in lt_image_list:
                try:
                    # print("pdf2text LTImage size", page_no, image.width, image.height)
                    image_stream = image.stream.get_data()

                    # Ignore small images
                    if image.width <= 300 and image.height <= 300:
                        continue

                    # Check the extracted image's size; if it is large, ocr the page rendered by pymupdf instead
                    img_test = Image.open(io.BytesIO(image_stream))
                    if image.height >= 1000 and image.width >= 1000:
                        page_image = self.get_page_image(page_no)
                        if judge_error_code(page_image):
                            self._page.error_code = page_image
                        else:
                            _image = _Image(page_image[1], page_image[0])
                            _image.is_from_pdf = True
                            _image.is_reverse = False
                            self._page.add_child(_image)
                            image_md5 = get_md5_from_bytes(page_image[1])
                            self.md5_image_obj_list.append([image_md5, _image])
                        return
                    # Smaller images are saved directly for ocr
                    else:
                        temp_path = self.unique_type_dir + 'page' + str(page_no) \
                                    + '_lt' + str(lt_image_list.index(image)) + '.jpg'
                        img_test.save(temp_path)
                        with open(temp_path, "rb") as ff:
                            image_stream = ff.read()
                        _image = _Image(image_stream, temp_path, image.bbox)
                        self._page.add_child(_image)
                        image_md5 = get_md5_from_bytes(image_stream)
                        self.md5_image_obj_list.append([image_md5, _image])
                except Exception:
                    log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
                    traceback.print_exc()

            # pdf objects would need reverse ordering
            # self._page.is_reverse = True
            if self.has_init_pdf[3] == 0:
                self.init_package("pdfplumber")
            if not self.is_text_legal(lt_text_list, page_no):
                return

            try:
                lt_line_list = self.get_page_lines(layout, page_no)
            except:
                traceback.print_exc()
                lt_line_list = []
                self._page.error_code = [-13]
            table_list = self.recognize_text(layout, page_no, lt_text_list, lt_line_list)

            # From the text layout, judge whether the page may hold a borderless table
            if self.judge_b_table(lt_text_list, table_list, page_no):
                page_image = self.get_page_image(page_no)
                if judge_error_code(page_image):
                    self._page.error_code = page_image
                else:
                    _image = _Image(page_image[1], page_image[0])
                    _image.is_from_pdf = True
                    # _image.is_reverse = True
                    _image.b_table_from_text = True
                    _image.b_table_text_obj_list = lt_text_list
                    _image.b_table_layout_size = (layout.width, layout.height)
                    self._page.add_child(_image)

    def get_layout(self, page, page_no):
        if self.has_init_pdf[0] == 0:
            self.init_package("pdfminer")
        if self._doc.error_code is not None:
            return

        # Get this page's layout; the Windows and Linux paths were identical,
        # both going through pdf_analyze with its timeout
        start_time = time.time()
        try:
            layout = pdf_analyze(self.interpreter, page, self.device, page_no)
        except TimeoutError as e:
            log("page_no: " + str(page_no) + " pdfminer read page time out! " + str(time.time() - start_time))
            layout = [-4]
        except Exception:
            traceback.print_exc()
            log("page_no: " + str(page_no) + " pdfminer read page error! continue...")
            layout = [-3]
        log("page_no: " + str(page_no) + " get_layout cost: " + str(time.time() - start_time))
        return layout

    def get_page_image(self, page_no):
        start_time = time.time()
        try:
            if self.has_init_pdf[1] == 0:
                self.init_package("PyMuPDF")
            if self._doc.error_code is not None:
                return
            # save_dir = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
            output = self.unique_type_dir + "page" + str(page_no) + ".png"
            page = self.doc_pymupdf.loadPage(page_no)
            rotate = int(0)
            zoom_x = 2.
            zoom_y = 2.
            mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
            pix = page.getPixmap(matrix=mat, alpha=False)
            pix.writePNG(output)
            # Resize the rendered image
            self.resize_image(output)
            with open(output, "rb") as f:
                pdf_image = f.read()
            log("page_no: " + str(page_no) + ' get_page_image cost: ' + str(time.time() - start_time))
            return [output, pdf_image]
        except ValueError as e:
            traceback.print_exc()
            if str(e) == "page not in document":
                log("page_no: " + str(page_no) + " page not in document! continue...")
                return [0]
            elif "encrypted" in str(e):
                log("page_no: " + str(page_no) + " document need password")
                return [-7]
        except RuntimeError as e:
            if "cannot find page" in str(e):
                log("page_no: " + str(page_no) + " page cannot find in document! continue...")
                return [0]
            else:
                traceback.print_exc()
                return [-3]

    def get_all_page_image(self):
        start_time = time.time()
        if self.has_init_pdf[1] == 0:
            self.init_package("PyMuPDF")
        if self._doc.error_code is not None:
            return
        page_count = self.doc_pymupdf.page_count
        for page_no in range(page_count):
            # Limit the page count: only the first 10 and last 10 pages
            if page_count > 20:
                if 10 <= page_no < page_count - 10:
                    continue
            self._page = _Page(None, page_no)
            page_image = self.get_page_image(page_no)
            if judge_error_code(page_image):
                self._page.error_code = page_image
            else:
                _image = _Image(page_image[1], page_image[0])
                self._page.add_child(_image)
            # On error keep reading the following pages
            if self._doc.error_code is None and self._page.error_code is not None:
                continue
            self._doc.add_child(self._page)
        log('get_all_page_image cost: ' + str(time.time() - start_time))

    @memory_decorator
    def connect_table(self, html_list, show=0):
        if not html_list:
            return html_list

        # Initial conditions
        # 0:   let A be the last table of the previous page and B the first table of the next page
        # 1.1: no text after A (except a page number) and no text before B (except a page number)
        # 1.2: text before B (probably a header, under 60 chars), the first few cells of B's first
        #      row are empty, and the non-empty first-row cells include one with plenty of text
        # 1.3: text before B (probably a header, under 60 chars), B's first-row first cell is empty,
        #      and cells with text make up no more than half of all cells
        # 1.4: text before B (probably a header, under 60 chars) and B's first-row first cell is a pure numeric index
        # 1.5: text after A (a header besides the page number), with only one line after A holding at most 15 Chinese chars
        connect_flag_list = []
        soup_list = []
        connect_rule_dict = {}
        for i, h in enumerate(html_list):
            soup = BeautifulSoup(h, 'lxml')
            soup_list.append(soup)

            # Find the last table
            last_table_start, last_table_end = None, None
            match = re.finditer('<table', h)
            for m in match:
                last_table_start = m.span()[0]
            if last_table_start is not None:
                match = re.finditer('</table>', h[last_table_start:])
                for m in match:
                    last_table_end = m.span()[1] + last_table_start
            # Extra rule data: carry the table html itself
            rule_a = [0, h[last_table_start:last_table_end]]

            # Is there anything but a page number after the last table?
            connect_flag1 = False
            if last_table_end is not None:
                match = re.findall('[^-/第页0-9,,]', re.sub('<div>|</div>', '', h[last_table_end:]))
                # print('match', match.group())
                # if not match or match.group() == '':
                if len(match) == 0:
                    connect_flag1 = True
                # There is a footer
                if not connect_flag1:
                    if len(re.findall('<div>', h[last_table_end:])) <= 1 \
                            and len(re.findall('[\u4e00-\u9fff]', h[last_table_end:])) <= 60:
                        connect_flag1 = True

            # Find the first table
            first_table_start, first_table_end = None, None
            match = re.finditer('<table', h)
            for m in match:
                first_table_start = m.span()[0]
                break
            if first_table_start is not None:
                match = re.finditer('</table>', h[first_table_start:])
                for m in match:
                    first_table_end = m.span()[1] + first_table_start
            # Extra rule data: carry the table html itself
            rule_b = [0, h[first_table_start:first_table_end]]

            # Is there anything before the first table?
            connect_flag2 = False
            if first_table_start is not None and first_table_start == 0:
                connect_flag2 = True
            # There is content, but it is a header
            if not connect_flag2:
                tables = soup.findAll('table')
                if tables:
                    first_table = tables[0]
                    rows = first_table.findAll('tr')
                    if rows:
                        first_row = rows[0]
                        col_text_len_list = [len(x.text) for x in first_row]
                        col_text_list = [x.text for x in first_row]
                        # At most 60 chars before the table, first cell empty, and one cell with plenty of text
                        if not connect_flag2 and len(h[:first_table_start]) <= 60 and col_text_len_list[0] == 0 and max(col_text_len_list) >= 30:
                            connect_flag2 = True
                            rule_b[0] = 1
                        # At least half of the cells are empty and the first cell is empty
                        if not connect_flag2 and col_text_len_list.count(0) >= len(col_text_len_list) / 2 and col_text_len_list[0] == 0:
                            connect_flag2 = True
                        # At most one line before the table and the first cell is purely numeric
                        if not connect_flag2 and len(col_text_list) > 0 and \
                                len(re.findall('<div>', h[:first_table_start])) <= 0 and \
                                len(re.findall('\d', col_text_list[0])) == len(col_text_list[0]):
                            connect_flag2 = True
                        # if not connect_flag2 and len(re.findall('<div>', h[:first_table_start])) <= 0 and len(re.findall('[\u4e00-\u9fff]', h[:first_table_start])) <= 25:
                        #     connect_flag2 = True
            connect_flag_list.append([i, connect_flag2, connect_flag1])
            connect_rule_dict[i] = [rule_b, rule_a]
        if show:
            print('connect_flag_list', connect_flag_list)
            print('connect_rule_dict', connect_rule_dict)

        # Merge the pages to connect according to condition 1, forming groups
        connect_pages_list = []
        if connect_flag_list:
            temp_list = [connect_flag_list[0]]
            for i in range(1, len(connect_flag_list)):
                c = connect_flag_list[i]
                if c[1] and temp_list[-1][2]:
                    temp_list.append(c)
                else:
                    if temp_list:
                        connect_pages_list.append(temp_list)
                    temp_list = [c]
                    # connect_pages_list.append([c])
            if temp_list:
                connect_pages_list.append(temp_list)
        if show:
            print('connect_pages_list', connect_pages_list)

        # Follow-up condition: the column counts inside a group must match
        connect_pages_list2 = []
        for c_list in connect_pages_list:
            if len(c_list) == 1:
                connect_pages_list2.append(c_list)
            else:
                col_cnt_list = []
                # Cells may have been duplicated; identical neighbours would merge into one column
                merge_col_cnt_list = []
                for c in c_list:
                    soup = soup_list[c[0]]
                    table1 = soup.findAll('table')[-1]
                    table2 = soup.findAll('table')[0]
                    tr1 = table1.findAll('tr')
                    tr2 = table2.findAll('tr')
                    td1 = tr1[-1].findAll('td')
                    td2 = tr2[0].findAll('td')
                    col_cnt_list.append([len(td2), len(td1)])
                    # # Column count after merging cells with duplicated text
                    # last_text = td1[0].text
                    # merge_td1 = [last_text]
                    # for td in td1:
                    #     if td.text == last_text:
                    #         continue
                    #     else:
                    #         merge_td1.append(td.text)
                    #         last_text = td.text
                    # last_text = td2[0].text
                    # merge_td2 = [last_text]
                    # for td in td2:
                    #     if td.text == last_text:
                    #         continue
                    #     else:
                    #         merge_td2.append(td.text)
                    #         last_text = td.text
                    # merge_col_cnt_list.append([len(merge_td2), len(merge_td1)])

                # Split the group wherever neighbouring column counts disagree
                new_c_list = [c_list[0]]
                # print('col_cnt_list', col_cnt_list)
                for i in range(len(col_cnt_list) - 1):
                    if col_cnt_list[i][1] != col_cnt_list[i + 1][0]:
                        # and merge_col_cnt_list[i][1] != merge_col_cnt_list[i + 1][0]:
                        connect_pages_list2.append(new_c_list)
                        new_c_list = [c_list[i + 1]]
                    else:
                        new_c_list.append(c_list[i + 1])
                if new_c_list:
                    connect_pages_list2.append(new_c_list)
        if show:
            print('connect_pages_list2', connect_pages_list2)

        # Check whether the two connected tables need cell contents filled in
        for c_list in connect_pages_list2:
            for i in range(len(c_list) - 1):
                page_index1 = c_list[i][0]
                page_index2 = c_list[i + 1][0]
                html2 = html_list[page_index2]
                soup2 = soup_list[page_index2]
                rule1 = connect_rule_dict.get(page_index1)[1]
                rule2 = connect_rule_dict.get(page_index2)[0]
                # print('rule1', rule1)
                # if rule2[0]:
                table1 = BeautifulSoup(rule1[1], 'lxml').findAll('table')[0]
                table2 = BeautifulSoup(rule2[1], 'lxml').findAll('table')[0]
                add_td_value = []
                # Get the td values of the last row
                for tr in table1.findAll('tr')[::-1]:
                    temp_list = []
                    for td in tr.findAll('td'):
                        temp_list.append(td.get_text())
                    add_td_value = temp_list
                    break
                # print('add_td_value', add_td_value)
                tr_index = 0
                for tr in table2.findAll('tr'):
                    temp_list = []
                    for td in tr.findAll('td'):
                        if len(td.get_text()) < 1:
                            temp_list.append(0)
                        else:
                            temp_list.append(1)
                    # print('temp_list', temp_list)
                    if temp_list and add_td_value and len(temp_list) == len(add_td_value) \
                            and 1 in temp_list and temp_list[0] != 1 \
                            and 1 not in temp_list[:temp_list.index(1)]:
                        for j in range(len(temp_list)):
                            if temp_list[j] == 0:
                                tr.findAll('td')[j].string = add_td_value[j]
                    # else:
                    #     # Only for the first row, with at least 3 columns of which exactly one has a value: concatenate the two rows' text
                    #     if tr_index == 0 and len(temp_list) >= 3 and temp_list.count(1) == 1:
                    #         tr.findAll('td')[j].string += add_td_value[j]
                    # print('tr.findAll(td)[0]', tr.findAll('td')[0])
                    tr_index += 1
                soup2.findAll('table')[0].replace_with(table2)
                html_list[page_index2] = str(soup2)

        # Join the tables of pages that satisfy the connect conditions
        new_html_list = []
        for c_list in connect_pages_list2:
            if len(c_list) == 1:
                new_html_list.append(html_list[c_list[0][0]])
                continue
            new_html = ''
            for c in c_list:
                match = re.finditer('</table>', new_html)
                last_table_index = None
                for m in match:
                    last_table_index = m.span()[0]
                new_html += html_list[c[0]]
                # print('html_list[c[0]]', html_list[c[0]])
                if last_table_index is None:
                    continue
                match = re.finditer('<table border="1">', new_html[last_table_index:])
                first_table_index = None
                for m in match:
                    first_table_index = last_table_index + m.span()[1]
                    break
                if first_table_index is None:
                    continue
                # print('re', re.findall('</table>.*?<table border="1">', new_html[last_table_index:first_table_index]))
                # Non-greedy match: replace the seam between the two tables with a marker row
                new_html_sub = re.sub('</table>.*?<table border="1">',
                                      '<tr><td>#@#@#</td></tr>',
                                      new_html[last_table_index:first_table_index])
                new_html = new_html[:last_table_index] + new_html_sub + new_html[first_table_index:]
                # print('new_html', new_html)
            # new_html = new_html[:-5]
            # ([-/第页0-9]|<div>|</div>)*
            # Non-greedy match
            # match = re.finditer('</table>.*?<table border="1">', new_html)
            # for m in match:
            #     if '#@#@#' in m.group():
            #
            # new_html = re.sub('</table>.*#@#@#.*?<table border="1">',
            #                   '<tr><td>#@#@#</td></tr>',
            #                   new_html)
            # print('new_html', new_html)

            soup = BeautifulSoup(new_html, 'lxml')
            trs = soup.findAll('tr')
            decompose_trs = []
            for i in range(len(trs)):
                if trs[i].get_text() == '#@#@#':
                    td1 = trs[i - 1].findAll('td')
                    td2 = trs[i + 1].findAll('td')
                    if td2[0].get_text() == '':
                        # Handle one table row spanning several consecutive pages; without this the row would be dropped
                        find_father = False
                        for father, son in decompose_trs:
                            # print('son', son)
                            # print('td1', trs[i - 1])
                            if father != '' and son == trs[i - 1]:
                                td_father = father.findAll('td')
                                for j in range(len(td_father)):
                                    # print('td_father[j].string3', td_father[j].string)
                                    td_father[j].string = td_father[j].get_text() + td2[j].get_text()
                                    # print('td_father[j].string4', td_father[j].string)
                                find_father = True
                                decompose_trs.append([father, trs[i + 1]])
                                break
                        if not find_father:
                            for j in range(len(td1)):
                                # print('td1[j].string1', td1[j].string)
                                td1[j].string = td1[j].get_text() + td2[j].get_text()
                                # print('td1[j].string2', td1[j].string)
                            decompose_trs.append([trs[i - 1], trs[i + 1]])
                        # print('trs[i + 1]', trs[i + 1])
                        # trs[i + 1].decompose()
                    # print('trs[i-1]', trs[i-1])
                    # trs[i].decompose()
                    decompose_trs.append(['', trs[i]])
            # print('decompose_trs', decompose_trs)
            # for father, son in decompose_trs:
            #     print('father', father)
            #     print('son', son)
            # print('len(decompose_trs)', len(decompose_trs))
            for father, son in decompose_trs:
                for tr in trs:
                    if tr == son:
                        tr.decompose()
                        break
            new_html = str(soup)
            new_html_list.append(new_html)

        html_str = ''
        for h in new_html_list:
            html_str += h
        return [html_str]

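    # A minimal sketch of the splice above (hypothetical html): two pages
    #   '<table border="1"><tr><td>a</td></tr></table><div>第1页</div>'
    #   '<table border="1"><tr><td></td><td>b...</td></tr></table>'
    # are concatenated, the seam '</table>...<table border="1">' is replaced by
    # the marker row '<tr><td>#@#@#</td></tr>', and the marker row is later
    # decomposed, optionally merging the half-row after it into the row before it.
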
    def get_html(self):
        if self._doc.error_code is not None:
            return self._doc.error_code
        self.convert()
        if self._doc.error_code is not None:
            return self._doc.error_code
        html = self._doc.get_html(return_list=True)

        # Connect tables across pages
        try:
            html = self.connect_table(html)
        except:
            traceback.print_exc()
            return [-12]
        return html

    def delete_water_mark(self, lt_text_list, page_bbox, times=5):
        # Text repeated too many times and spread across the page is a watermark
        duplicate_dict = {}
        for _obj in lt_text_list:
            t = _obj.get_text()
            if t in duplicate_dict.keys():
                duplicate_dict[t][0] += 1
                duplicate_dict[t][1].append(_obj)
            else:
                duplicate_dict[t] = [1, [_obj]]

        delete_text = []
        for t in duplicate_dict.keys():
            if duplicate_dict[t][0] >= times:
                obj_list = duplicate_dict[t][1]
                obj_list.sort(key=lambda x: x.bbox[3])
                obj_distance_h = abs(obj_list[-1].bbox[3] - obj_list[0].bbox[1])
                obj_list.sort(key=lambda x: x.bbox[2])
                obj_distance_w = abs(obj_list[-1].bbox[2] - obj_list[0].bbox[0])
                if obj_distance_h >= abs(page_bbox[1] - page_bbox[3]) * 0.7 \
                        and obj_distance_w >= abs(page_bbox[0] - page_bbox[2]) * 0.7:
                    delete_text.append(t)

        temp_text_list = []
        for _obj in lt_text_list:
            t = _obj.get_text()
            if t not in delete_text:
                temp_text_list.append(_obj)
        return temp_text_list

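    # A minimal sketch of the watermark test above (hypothetical numbers): a
    # string appearing at least `times` times (15 when called from convert_page)
    # whose occurrences span at least 70% of the page both vertically and
    # horizontally is treated as a tiled watermark and removed; a word merely
    # repeated inside one paragraph does not cover 70% of the page and survives.
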
    def resize_image(self, img_path, max_size=2000):
        _img = cv2.imread(img_path)
        if _img.shape[0] <= max_size or _img.shape[1] <= max_size:
            return
        else:
            # Scale the longer axis down to max_size and keep the aspect ratio
            resize_axis = 0 if _img.shape[0] >= _img.shape[1] else 1
            ratio = max_size / _img.shape[resize_axis]
            new_shape = [0, 0]
            new_shape[resize_axis] = max_size
            new_shape[1 - resize_axis] = int(_img.shape[1 - resize_axis] * ratio)
            _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
            cv2.imwrite(img_path, _img)

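    # Worked example for the resize above: a render of shape (3000, 2400) has
    # both sides over 2000, so resize_axis = 0, ratio = 2000 / 3000, and the
    # image is written back at (2000, 1600); a (2500, 1200) render is left
    # untouched because one side already fits within max_size.
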
    def get_single_pdf(self, path, page_no):
        start_time = time.time()
        try:
            pdf_origin = copy.deepcopy(self.doc_pypdf2)
            pdf_new = copy.deepcopy(self.doc_pypdf2_new)
            pdf_new.addPage(pdf_origin.getPage(page_no))
            path_new = path.split(".")[0] + "_split.pdf"
            with open(path_new, "wb") as ff:
                pdf_new.write(ff)
            log("page_no: " + str(page_no) + " get_single_pdf cost: " + str(time.time() - start_time))
            return path_new
        except PyPDF2.utils.PdfReadError as e:
            return [-3]
        except Exception as e:
            log("page_no: " + str(page_no) + " get_single_pdf error!")
            return [-3]


def get_text_font():
    def flags_decomposer(flags):
        """Make font flags human readable."""
        l = []
        if flags & 2 ** 0:
            l.append("superscript")
        if flags & 2 ** 1:
            l.append("italic")
        if flags & 2 ** 2:
            l.append("serifed")
        else:
            l.append("sans")
        if flags & 2 ** 3:
            l.append("monospaced")
        else:
            l.append("proportional")
        if flags & 2 ** 4:
            l.append("bold")
        return ", ".join(l)

    def get_underlined_textLines(page):
        """
        Get all underlined text on one pdf page.
        :param page: a fitz page
        :return: list of tuples, one per contiguous underlined stretch:
                 [(underlined sentence, its blk_no, its line_no), ...]
        """
        paths = page.get_drawings()  # get drawings on the current page
        # Collect every bbox on the page with a very small height,
        # because underlines are mostly such rectangles
        # subselect things we may regard as lines
        lines = []
        for p in paths:
            for item in p["items"]:
                if item[0] == "l":  # an actual line
                    p1, p2 = item[1:]
                    if p1.y == p2.y:
                        lines.append((p1, p2))
                elif item[0] == "re":  # a rectangle: check if height is small
                    r = item[1]
                    if r.width > r.height and r.height <= 2:
                        lines.append((r.tl, r.tr))  # take top left / right points

        # Compute the page's max_lineheight, used as the distance threshold below
        blocks = page.get_text("dict", flags=11)["blocks"]
        max_lineheight = 0
        for b in blocks:
            for l in b["lines"]:
                bbox = fitz.Rect(l["bbox"])
                if bbox.height > max_lineheight:
                    max_lineheight = bbox.height

        underlined_res = []
        # Start querying the underlined content
        # make a list of words
        words = page.get_text("words")
        # if underlined, the bottom left / right of a word
        # should not be too far away from left / right end of some line:
        for wdx, w in enumerate(words):  # w[4] is the actual word string
            r = fitz.Rect(w[:4])  # first 4 items are the word bbox
            for p1, p2 in lines:  # check distances for start / end points
                if abs(r.bl - p1) <= max_lineheight:  # the word's bottom left matches the underline's left end
                    if abs(r.br - p2) <= max_lineheight:  # the word's bottom right matches the underline's right end (single word, no spaces)
                        print(f"Word '{w[4]}' is underlined! Its block-line number is {w[-3], w[-2]}")
                        underlined_res.append((w[4], w[-3], w[-2]))  # (underlined word, its blk_no, its line_no)
                        break  # don't check more lines
                    else:  # keep looking to the right on the same line: one underline may cover several space-separated words
                        curr_line_num = w[-2]  # line number
                        for right_wdx in range(wdx + 1, len(words), 1):
                            _next_w = words[right_wdx]
                            if _next_w[-2] != curr_line_num:  # the word to the right is no longer on the current line (crossing lines is not allowed)
                                break
                            _r_right = fitz.Rect(_next_w[:4])  # bbox of a word to the right on the same line
                            if abs(_r_right.br - p2) <= max_lineheight:  # its bottom right must be within max_lineheight of p2, the underline's right end
                                print(
                                    f"Word '{' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]])}' is underlined! " +
                                    f"Its block-line number is {w[-3], w[-2]}")
                                underlined_res.append(
                                    (' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]]),
                                     w[-3], w[-2])
                                )  # (underlined words, their blk_no, their line_no)
                                break  # don't check more lines
        return underlined_res

    _p = r'C:\Users\Administrator\Desktop\test_pdf\error2-2.pdf'
    doc_pymupdf = read_pymupdf(_p)
    page = doc_pymupdf[0]
    blocks = page.get_text("dict", flags=11)["blocks"]
    for b in blocks:  # iterate through the text blocks
        for l in b["lines"]:  # iterate through the text lines
            for s in l["spans"]:  # iterate through the text spans
                print("")
                font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                    s["font"],  # font name
                    flags_decomposer(s["flags"]),  # readable font flags
                    s["size"],  # font size
                    s["color"],  # font color
                )
                print(s)
                print("Text: '%s'" % s["text"])  # simple print of text
                print(font_properties)
    get_underlined_textLines(page)


# Below: a ready-made single-page pdf parsing interface
class ParseSentence:
    def __init__(self, bbox, fontname, fontsize, _text, _title, title_text, _pattern, title_degree, is_outline,
                 outline_location, page_no):
        (x0, y0, x1, y1) = bbox
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.bbox = bbox
        self.fontname = fontname
        self.fontsize = fontsize
        self.text = _text
        self.title = _title
        self.title_text = title_text
        self.groups = _pattern
        self.title_degree = title_degree
        self.is_outline = is_outline
        self.outline_location = outline_location
        self.page_no = page_no

    def __repr__(self):
        return "%s,%s,%s,%d,%s" % (self.text, self.title, self.is_outline, self.outline_location, str(self.bbox))


class ParseUtils:
    @staticmethod
    def getFontinfo(_page):
        for _obj in _page._objs:
            if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                for textline in _obj._objs:
                    done = False
                    for lchar in textline._objs:
                        if isinstance(lchar, (LTChar)):
                            _obj.fontname = lchar.fontname
                            _obj.fontsize = lchar.size
                            done = True
                            break
                    if done:
                        break

    @staticmethod
    def recognize_sentences(list_textbox, filter_objs, page_bbox, page_no,
                            remove_space=True, sourceP_LB=True):
        list_textbox.sort(key=lambda x: x.bbox[0])
        list_textbox.sort(key=lambda x: x.bbox[3], reverse=sourceP_LB)
        cluster_textbox = []
        for _textbox in list_textbox:
            if _textbox in filter_objs:
                continue
            _find = False
            for _ct in cluster_textbox:
                if abs(_ct["y"] - _textbox.bbox[1]) < 5:
                    _find = True
                    _ct["textbox"].append(_textbox)
            if not _find:
                cluster_textbox.append({"y": _textbox.bbox[1], "textbox": [_textbox]})
        cluster_textbox.sort(key=lambda x: x["y"], reverse=sourceP_LB)
        list_sentences = []
        for _line in cluster_textbox:
            _textboxs = _line["textbox"]
            _textboxs.sort(key=lambda x: x.bbox[0])
            _linetext = _textboxs[0].get_text()
            for _i in range(1, len(_textboxs)):
                if abs(_textboxs[_i].bbox[0] - _textboxs[_i - 1].bbox[2]) > 60:
                    if _linetext and _linetext[-1] not in (",", ",", "。", ".", "、", ";"):
                        _linetext += "=,="
                _linetext += _textboxs[_i].get_text()
            _linetext = re.sub("[\s\r\n]", "", _linetext)
            _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
                     _textboxs[-1].bbox[2], _textboxs[-1].bbox[3])
            _title = None
            _pattern_groups = None
            title_text = ""
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_linetext)
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _title = ParseUtils.rec_incenter(_bbox, page_bbox)
            title_degree = 2
            if not _title:
                _linetext = _linetext.replace("=,=", ",")
            else:
                _linetext = _linetext.replace("=,=", "")
                title_degree = int(_title.split("_")[1])

            # Page numbers
            if ParseUtils.rec_incenter(_bbox, page_bbox) and re.search("^\d+$", _linetext) is not None:
                continue
            if _linetext == "" or re.search("^,+$", _linetext) is not None:
                continue

            # Outline (table of contents) lines
            is_outline = False
            outline_location = -1
            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$", _linetext)
            if _search is not None:
                is_outline = True
                _linetext = _search.group("text")
                outline_location = int(_search.group("nums"))
            list_sentences.append(
                ParseSentence(_bbox, _textboxs[-1].__dict__.get("fontname"), _textboxs[-1].__dict__.get("fontsize"),
                              _linetext, _title, title_text, _pattern_groups, title_degree, is_outline,
                              outline_location, page_no))
        # for _sen in list_sentences:
        #     print(_sen.__dict__)
        return list_sentences

    @staticmethod
    def find_title_by_pattern(_text,
                              _pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
                                       "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
                                       "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
                                       "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]))|" \
                                       "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\..、\s\-]))|" \
                                       "(?P<title_15>^(?P<title_15_index_0_0>()(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
                                       "(?P<title_17>^(?P<title_17_index_0_0>()(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>)))|" \
                                       "(?P<title_19>^(?P<title_19_index_0_0>()(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))"
                              ):
        _se = re.search(_pattern, _text)
        groups = []
        if _se is not None:
            _gd = _se.groupdict()
            for k, v in _gd.items():
                if v is not None:
                    groups.append((k, v))
        if len(groups):
            groups.sort(key=lambda x: x[0])
            return groups
        return None

    @staticmethod
    def rec_incenter(o_bbox, p_bbox):
        p_width = p_bbox[2] - p_bbox[0]
        l_space = (o_bbox[0] - p_bbox[0]) / p_width
        r_space = (p_bbox[2] - o_bbox[2]) / p_width
        if abs((l_space - r_space)) < 0.1 and l_space > 0.2:
            return "title_2"

    @staticmethod
    def is_first_title(_title):
        if _title is None:
            return False
        if re.search("^\d+$", _title) is not None:
            if int(_title) == 1:
                return True
            return False
        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
            if _title == "一":
                return True
            return False
        if re.search("^[a-z]+$", _title) is not None:
            if _title == "a":
                return True
            return False
        if re.search("^[A-Z]+$", _title) is not None:
            if _title == "A":
                return True
            return False
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
            if _title == "Ⅰ":
                return True
            return False
        return False

    @staticmethod
    def get_next_title(_title):
        if re.search("^\d+$", _title) is not None:
            return str(int(_title) + 1)
        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
            _next_title = ParseUtils.make_increase(['一', '二', '三', '四', '五', '六', '七', '八', '九', '十'],
                                                   re.sub("[十百]", '', _title))
            _next_title = list(_next_title)
            _next_title.reverse()
            if _next_title[-1] != "十":
                if len(_next_title) >= 2:
                    _next_title.insert(-1, '十')
                if len(_next_title) >= 4:
                    _next_title.insert(-3, '百')
            if _title[0] == "十":
                if _next_title == "十":
                    _next_title = ["二", "十"]
                _next_title.insert(0, "十")
            _next_title = "".join(_next_title)
            return _next_title
        if re.search("^[a-z]+$", _title) is not None:
            _next_title = ParseUtils.make_increase([chr(i + ord('a')) for i in range(26)], _title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[A-Z]+$", _title) is not None:
            _next_title = ParseUtils.make_increase([chr(i + ord('A')) for i in range(26)], _title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
            _sort = ["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ"]
            _index = _sort.index(_title)
            if _index < len(_sort) - 1:
                return _sort[_index + 1]
        return None

    @staticmethod
    def make_increase(_sort, _title, _add=1):
        # Increment the last symbol of _title within the alphabet _sort and
        # recurse leftwards, propagating a carry only when the increment wrapped
        if len(_title) == 0 and _add == 0:
            return ""
        if len(_title) == 0 and _add == 1:
            return _sort[0]
        _index = _sort.index(_title[-1])
        carry = 1 if _index + _add >= len(_sort) else 0
        next_chr = _sort[(_index + _add) % len(_sort)]
        return next_chr + ParseUtils.make_increase(_sort, _title[:-1], carry)

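    # Worked examples for make_increase above: with the lowercase alphabet,
    # make_increase(list('abcdefghijklmnopqrstuvwxyz'), 'az') walks from the last
    # character: 'z' + 1 wraps to 'a' with a carry, then 'a' + carry becomes 'b',
    # so the reversed result read by get_next_title is 'ba'. With the Chinese
    # digits it turns '九' into '十', and get_next_title's post-processing turns
    # '十' into '十一'.
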
    @staticmethod
    def rec_serial(_text, o_bbox, p_bbox, fontname, _pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
                                                             "(?P<title_2>^\d+[\.、\s])|" \
                                                             "(?P<title_3>^\d+\.\d+[\.、\s])|" \
                                                             "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
                                                             "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
        # todo: recognize the serial of the sentence
        _se = re.search(_pattern, _text)
        if _se is not None:
            _gd = _se.groupdict()
            for k, v in _gd.items():
                if v is not None:
                    return k
        return None

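
# A minimal usage sketch of the title helpers above (expected results inferred
# from the patterns; not part of the original interface):
#
#   ParseUtils.find_title_by_pattern('一、项目概况')[0]  # -> ('title_1', '一、')
#   ParseUtils.is_first_title('一')                      # -> True
#   ParseUtils.get_next_title('2')                       # -> '3'
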

if __name__ == '__main__':
    # need_page_no=None falls back to the default first/last-10-pages limit
    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1", None).get_html()