# convert.py

# -*- coding: utf-8 -*-
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
import codecs
import gc
import hashlib
import io
import json
import multiprocessing
import subprocess
import PyPDF2
import lxml
import pdfminer
from PIL import Image
from format_convert import get_memory_info
from ocr import ocr_interface
from ocr.ocr_interface import ocr, OcrModels
from otr import otr_interface
from otr.otr_interface import otr, OtrModels
import re
import shutil
import signal
import base64
import time
import traceback
import uuid
from os.path import basename
import cv2
import fitz
import pandas
import docx
import zipfile
import mimetypes
import filetype
# import pdfplumber
import psutil
import requests
import rarfile
from PyPDF2 import PdfFileReader, PdfFileWriter
import xml.dom.minidom
import logging
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar
import chardet
from bs4 import BeautifulSoup
from format_convert.libreoffice_interface import office_convert
from format_convert.swf.export import SVGExporter
logging.getLogger("pdfminer").setLevel(logging.WARNING)
from format_convert.table_correct import *
from format_convert.swf.movie import SWF
# import timeout_decorator
from format_convert import timeout_decorator
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Supported formats: txt doc docx xls xlsx pdf zip rar swf jpg jpeg png
def judge_error_code(_list, code=[-1, -2, -3, -4, -5, -7]):
    for c in code:
        if _list == [c]:
            return True
    return False
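
# Hedged usage note (not part of the original file): the converters below
# signal failure by returning a one-element list such as [-1] (generic error),
# [-2] (connection error), [-3] (format error), [-4] (timeout), [-5] (OCR/OTR
# timeout) or [-7] (password-protected). A caller is assumed to check, e.g.:
#     result = txt2text("sample.txt")      # hypothetical path
#     if judge_error_code(result):
#         print("failed with code", result[0])
#     else:
#         print(result[0])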
def set_timeout(signum, frame):
    print("=======================set_timeout")
    raise TimeoutError
def log_traceback(func_name):
    logging.info(func_name)
    etype, value, tb = sys.exc_info()
    for line in traceback.TracebackException(
            type(value), value, tb, limit=None).format(chain=True):
        logging.info(line)
def judge_format(path):
    guess1 = mimetypes.guess_type(path)
    _type = None
    if guess1[0]:
        _type = guess1[0]
    else:
        guess2 = filetype.guess(path)
        if guess2:
            _type = guess2.mime
    if _type == "application/pdf":
        return "pdf"
    if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return "docx"
    if _type == "application/x-zip-compressed" or _type == "application/zip":
        return "zip"
    if _type == "application/x-rar-compressed" or _type == "application/rar":
        return "rar"
    if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        return "xlsx"
    if _type == "application/msword":
        return "doc"
    if _type == "image/png":
        return "png"
    if _type == "image/jpeg":
        return "jpg"
    # Cannot guess the type, return None
    return None
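
# Minimal sketch of how judge_format is used on extension-less files inside
# zip2text/rar2text below (the paths here are hypothetical):
#     _type = judge_format("/tmp/unzipped/attachment")
#     if _type is not None:
#         os.rename("/tmp/unzipped/attachment",
#                   "/tmp/unzipped/attachment." + _type)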
@get_memory_info.memory_decorator
def txt2text(path):
    logging.info("into txt2text")
    try:
        # Detect the character encoding
        with open(path, "rb") as ff:
            data = ff.read()
        encode = chardet.detect(data).get("encoding")
        print("txt2text judge code is", encode)
        try:
            if encode is None:
                logging.info("txt2text cannot judge file code!")
                return [-3]
            with open(path, "r", encoding=encode) as ff:
                txt_text = ff.read()
            return [txt_text]
        except:
            logging.info("txt2text cannot open file with code " + encode)
            return [-3]
    except Exception as e:
        print("txt2text", traceback.print_exc())
        logging.info("txt2text error!")
        return [-1]
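
# Hedged sketch: txt2text detects the codec over the whole byte content, e.g.
#     chardet.detect(b"hello")  # -> {'encoding': 'ascii', ...}
# and then re-reads the file as text with the detected encoding; a None
# detection is mapped to the format-error code [-3].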
@get_memory_info.memory_decorator
def doc2text(path, unique_type_dir):
    logging.info("into doc2text")
    try:
        # Convert doc to docx with the office conversion interface
        file_path = from_office_interface(path, unique_type_dir, 'docx')
        if judge_error_code(file_path):
            return file_path
        text = docx2text(file_path, unique_type_dir)
        return text
    except Exception as e:
        logging.info("doc2text error!")
        print("doc2text", traceback.print_exc())
        return [-1]
@get_memory_info.memory_decorator
def read_xml_order(path, save_path):
    logging.info("into read_xml_order")
    try:
        try:
            f = zipfile.ZipFile(path)
            for file in f.namelist():
                if "word/document.xml" == str(file):
                    f.extract(file, save_path)
            f.close()
        except Exception as e:
            logging.info("docx format error!")
            return [-3]
        try:
            collection = xml_analyze(save_path + "word/document.xml")
        except TimeoutError:
            logging.info("read_xml_order timeout")
            return [-4]
        body = collection.getElementsByTagName("w:body")[0]
        order_list = []
        for line in body.childNodes:
            if "w:p" in str(line):
                text = line.getElementsByTagName("w:t")
                picture = line.getElementsByTagName("wp:docPr")
                if text:
                    order_list.append("w:t")
                if picture:
                    order_list.append("wp:docPr")
                for line1 in line.childNodes:
                    if "w:r" in str(line1):
                        picture1 = line1.getElementsByTagName("w:pict")
                        if picture1:
                            order_list.append("wp:docPr")
            if "w:tbl" in str(line):
                order_list.append("w:tbl")
                read_xml_table(path, save_path)
        return order_list
    except Exception as e:
        logging.info("read_xml_order error!")
        print("read_xml_order", traceback.print_exc())
        return [-1]
@get_memory_info.memory_decorator
def read_xml_table(path, save_path):
    logging.info("into read_xml_table")
    try:
        try:
            f = zipfile.ZipFile(path)
            for file in f.namelist():
                if "word/document.xml" == str(file):
                    f.extract(file, save_path)
            f.close()
        except Exception as e:
            logging.info("docx format error!")
            return [-3]
        try:
            collection = xml_analyze(save_path + "word/document.xml")
        except TimeoutError:
            logging.info("read_xml_table timeout")
            return [-4]
        body = collection.getElementsByTagName("w:body")[0]
        table_text_list = []
        for line in body.childNodes:
            if "w:tbl" in str(line):
                table_text = '<table border="1">' + "\n"
                tr_list = line.getElementsByTagName("w:tr")
                tr_index = 0
                tr_text_list = []
                tr_text_list_colspan = []
                for tr in tr_list:
                    table_text = table_text + "<tr rowspan=1>" + "\n"
                    tc_list = tr.getElementsByTagName("w:tc")
                    tc_index = 0
                    tc_text_list = []
                    for tc in tc_list:
                        tc_text = ""
                        # How many columns this cell spans
                        col_span = tc.getElementsByTagName("w:gridSpan")
                        if col_span:
                            col_span = int(col_span[0].getAttribute("w:val"))
                        else:
                            col_span = 1
                        # Check whether this is an empty continuation cell of a vertically merged cell
                        is_merge = tc.getElementsByTagName("w:vMerge")
                        if is_merge:
                            is_merge = is_merge[0].getAttribute("w:val")
                            if is_merge == "continue":
                                col_span_index = 0
                                real_tc_index = 0
                                if 0 <= tr_index - 1 < len(tr_text_list):
                                    for tc_colspan in tr_text_list[tr_index - 1]:
                                        if col_span_index < tc_index:
                                            col_span_index += tc_colspan[1]
                                            real_tc_index += 1
                                    if real_tc_index < len(tr_text_list[tr_index - 1]):
                                        tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
                        table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
                        p_list = tc.getElementsByTagName("w:p")
                        for p in p_list:
                            t = p.getElementsByTagName("w:t")
                            if t:
                                for tt in t:
                                    if len(tt.childNodes) > 0:
                                        tc_text += tt.childNodes[0].nodeValue
                                tc_text += "\n"
                        table_text = table_text + tc_text + "</td>" + "\n"
                        tc_index += 1
                        tc_text_list.append([tc_text, col_span])
                    table_text += "</tr>" + "\n"
                    tr_index += 1
                    tr_text_list.append(tc_text_list)
                table_text += "</table>" + "\n"
                table_text_list.append(table_text)
        return table_text_list
    except Exception as e:
        logging.info("read_xml_table error")
        print("read_xml_table", traceback.print_exc())
        return [-1]
@get_memory_info.memory_decorator
@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
def xml_analyze(path):
    # Parse the xml
    DOMTree = xml.dom.minidom.parse(path)
    collection = DOMTree.documentElement
    return collection
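
# Note (descriptive, not in the original): xml_analyze is wrapped by
# timeout_decorator.timeout(300), so callers such as read_xml_order and
# read_xml_table catch TimeoutError and map it to the [-4] error code.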
def read_docx_table(document):
    table_text_list = []
    for table in document.tables:
        table_text = "<table>\n"
        print("==================")
        for row in table.rows:
            table_text += "<tr>\n"
            for cell in row.cells:
                table_text += "<td>" + cell.text + "</td>\n"
            table_text += "</tr>\n"
        table_text += "</table>\n"
        print(table_text)
        table_text_list.append(table_text)
    return table_text_list
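
# Note (descriptive): read_docx_table is the python-docx based variant;
# docx2text below actually calls read_xml_table, which parses the raw
# document.xml and so can handle w:gridSpan/w:vMerge merged cells.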
@get_memory_info.memory_decorator
def docx2text(path, unique_type_dir):
    logging.info("into docx2text")
    try:
        try:
            doc = docx.Document(path)
        except Exception as e:
            print("docx format error!", e)
            print(traceback.print_exc())
            logging.info("docx format error!")
            return [-3]
        # Iterate over paragraphs
        paragraph_text_list = []
        for paragraph in doc.paragraphs:
            if paragraph.text != "":
                paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
        # Iterate over tables
        try:
            table_text_list = read_xml_table(path, unique_type_dir)
        except TimeoutError:
            return [-4]
        if judge_error_code(table_text_list):
            return table_text_list
        # Iterate over images in document order
        image_text_list = []
        temp_image_path = unique_type_dir + "temp_image.png"
        pattern = re.compile(r'rId\d+')
        for graph in doc.paragraphs:
            for run in graph.runs:
                if run.text == '':
                    try:
                        if not pattern.search(run.element.xml):
                            continue
                        content_id = pattern.search(run.element.xml).group(0)
                        content_type = doc.part.related_parts[content_id].content_type
                    except Exception as e:
                        print("docx no image!", e)
                        continue
                    if not content_type.startswith('image'):
                        continue
                    # Write the image bytes to a temporary file
                    img_data = doc.part.related_parts[content_id].blob
                    with open(temp_image_path, 'wb') as f:
                        f.write(img_data)
                    if img_data is None:
                        continue
                    # OCR the image text
                    image_text = picture2text(temp_image_path)
                    if image_text == [-2]:
                        return [-2]
                    if image_text == [-1]:
                        return [-1]
                    if image_text == [-3]:
                        continue
                    image_text = image_text[0]
                    image_text_list.append(add_div(image_text))
        # Parse document.xml to get the order of text/image/table elements
        order_list = read_xml_order(path, unique_type_dir)
        if order_list == [-2]:
            return [-2]
        if order_list == [-1]:
            return [-1]
        text = ""
        print("len(order_list)", len(order_list))
        print("len(paragraph_text_list)", len(paragraph_text_list))
        print("len(image_text_list)", len(image_text_list))
        print("len(table_text_list)", len(table_text_list))
        # Emit the pieces in document order
        for tag in order_list:
            if tag == "w:t":
                if len(paragraph_text_list) > 0:
                    text += paragraph_text_list.pop(0)
            if tag == "wp:docPr":
                if len(image_text_list) > 0:
                    text += image_text_list.pop(0)
            if tag == "w:tbl":
                if len(table_text_list) > 0:
                    text += table_text_list.pop(0)
        return [text]
    except Exception as e:
        logging.info("docx2text error!")
        print("docx2text", traceback.print_exc())
        return [-1]
def add_div(text):
    if text == "" or text is None:
        return text
    if get_platform() == "Windows":
        print("add_div", text)
    if re.findall("<div>", text):
        return text
    text = "<div>" + text + "\n"
    text = re.sub("\n", "</div>\n<div>", text)
    if text[-5:] == "<div>":
        print("add_div has cut", text[-30:])
        text = text[:-5]
    return text
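
# Worked example (illustrative): add_div("a\nb") first becomes "<div>a\nb\n",
# then re.sub("\n", "</div>\n<div>", ...) yields
# "<div>a</div>\n<div>b</div>\n<div>", and the dangling trailing "<div>" is
# cut, leaving "<div>a</div>\n<div>b</div>\n".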
@get_memory_info.memory_decorator
def pdf2Image(path, save_dir):
    logging.info("into pdf2Image")
    try:
        try:
            doc = fitz.open(path)
        except Exception as e:
            logging.info("pdf format error!")
            return [-3]
        output_image_list = []
        for page_no in range(doc.page_count):
            # Limit the pdf page count: only take the first 70 pages
            if page_no >= 70:
                logging.info("pdf2Image: pdf pages count " + str(doc.page_count)
                             + ", only get 70 pages")
                break
            try:
                page = doc.loadPage(page_no)
                output = save_dir + "_page" + str(page_no) + ".png"
                rotate = int(0)
                # A zoom factor of 1.33333333 per dimension renders a higher-resolution image.
                # Without this setting the default image size is 792x612 at dpi=96
                # (1.33333333 --> 1056x816, 2 --> 1584x1224)
                zoom_x = 1.33333333
                zoom_y = 1.33333333
                mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
                pix = page.getPixmap(matrix=mat, alpha=False)
                pix.writePNG(output)
                output_image_list.append(output)
            except ValueError as e:
                traceback.print_exc()
                if str(e) == "page not in document":
                    logging.info("pdf2Image page not in document! continue..." + str(page_no))
                    continue
                elif "encrypted" in str(e):
                    logging.info("pdf2Image document need password " + str(page_no))
                    return [-7]
            except RuntimeError as e:
                if "cannot find page" in str(e):
                    logging.info("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
                    continue
                else:
                    traceback.print_exc()
                    return [-3]
        return output_image_list
    except Exception as e:
        logging.info("pdf2Image error!")
        print("pdf2Image", traceback.print_exc())
        return [-1]
def image_preprocess(image_np, image_path, use_ocr=True):
    logging.info("into image_preprocess")
    try:
        # (height, width)
        # resize_size = (1024, 768)
        # Limit the image size
        # resize_image(image_path, resize_size)
        # Deskew the image and write it back to the original path
        g_r_i = get_rotated_image(image_np, image_path)
        if g_r_i == [-1]:
            return [-1], [], [], 0
        # otr needs a resized image, written to a separate path
        image_np = cv2.imread(image_path)
        best_h, best_w = get_best_predict_size(image_np)
        image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
        image_resize_path = image_path[:-4] + "_resize" + image_path[-4:]
        cv2.imwrite(image_resize_path, image_resize)
        # Call the otr (table recognition) model interface
        with open(image_resize_path, "rb") as f:
            image_bytes = f.read()
        points, split_lines, bboxes, outline_points = from_otr_interface(image_bytes)
        if judge_error_code(points):
            return points, [], [], 0
        # Scale the bboxes from the resized image back to the original size
        ratio = (image_np.shape[0] / best_h, image_np.shape[1] / best_w)
        for i in range(len(bboxes)):
            bbox = bboxes[i]
            bboxes[i] = [(int(bbox[0][0] * ratio[1]), int(bbox[0][1] * ratio[0])),
                         (int(bbox[1][0] * ratio[1]), int(bbox[1][1] * ratio[0]))]
        # (debug: draw the bboxes with cv2.rectangle to verify them)
        # Call the ocr model interface
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        # With a table
        if len(bboxes) >= 2:
            text_list, bbox_list = from_ocr_interface(image_bytes, True)
            if judge_error_code(text_list):
                return text_list, [], [], 0
            text, column_list = get_formatted_table(text_list, bbox_list, bboxes, split_lines)
            if judge_error_code(text):
                return text, [], [], 0
            is_table = 1
            return text, column_list, outline_points, is_table
        # Without a table
        else:
            if use_ocr:
                text = from_ocr_interface(image_bytes)
                if judge_error_code(text):
                    return text, [], [], 0
                is_table = 0
                return text, [], [], is_table
            else:
                is_table = 0
                return None, [], [], is_table
    except Exception as e:
        logging.info("image_preprocess error")
        print("image_preprocess", traceback.print_exc())
        return [-1], [], [], 0
def get_best_predict_size(image_np):
    sizes = [1280, 1152, 1024, 896, 768, 640, 512, 384, 256, 128]
    min_len = 10000
    best_height = sizes[0]
    for height in sizes:
        if abs(image_np.shape[0] - height) < min_len:
            min_len = abs(image_np.shape[0] - height)
            best_height = height
    min_len = 10000
    best_width = sizes[0]
    for width in sizes:
        if abs(image_np.shape[1] - width) < min_len:
            min_len = abs(image_np.shape[1] - width)
            best_width = width
    return best_height, best_width
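
# Worked example (illustrative): for a 900x700 image the closest candidates
# in `sizes` are 896 for the height and 640 for the width, so the function
# returns (896, 640); image_preprocess resizes to that shape before calling
# the otr interface, then scales the returned bboxes back by the ratio.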
@get_memory_info.memory_decorator
def pdf2text(path, unique_type_dir):
    logging.info("into pdf2text")
    try:
        # pymupdf: pdf to image
        save_dir = path.split(".")[-2] + "_" + path.split(".")[-1]
        output_image_list = pdf2Image(path, save_dir)
        if judge_error_code(output_image_list):
            return output_image_list
        # For each pdf page collect: extracted text, table column counts,
        # outline points, has-table flag, page number, image size
        page_info_list = []
        page_no = 0
        for img_path in output_image_list:
            print("pdf page", page_no, "in total", len(output_image_list))
            # Skip pages whose image cannot be read
            try:
                img = cv2.imread(img_path)
                img_size = img.shape
            except:
                logging.info("pdf2text read image in page fail! continue...")
                continue
            text, column_list, outline_points, is_table = image_preprocess(img, img_path,
                                                                           use_ocr=False)
            if judge_error_code(text):
                return text
            page_info_list.append([text, column_list, outline_points, is_table,
                                   page_no, img_size])
            page_no += 1
        # Split pages into those with and without tables
        has_table_list = []
        has_table_page_no_list = []
        no_table_list = []
        no_table_page_no_list = []
        for page_info in page_info_list:
            if not page_info[3]:
                no_table_list.append(page_info)
                no_table_page_no_list.append(page_info[4])
            else:
                has_table_list.append(page_info)
                has_table_page_no_list.append(page_info[4])
        # Connect tables that span consecutive pages
        table_connect_list, connect_text_list = page_table_connect(has_table_list,
                                                                   page_info_list)
        if judge_error_code(table_connect_list):
            return table_connect_list
        # Page numbers of the connected tables
        table_connect_page_no_list = []
        for area in connect_text_list:
            table_connect_page_no_list.append(area[1])
        # pdfminer approach
        try:
            fp = open(path, 'rb')
            # Create a PDF parser from the file object
            parser = PDFParser(fp)
            # Create a PDF document
            doc = PDFDocument(parser)
            # Connect the resource manager, the device and the interpreter
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Check whether the pdf is readable at all
            for page in PDFPage.create_pages(doc):
                break
        except pdfminer.psparser.PSEOF as e:
            # pdfminer cannot read objects on blank pages; fall back to OCR on the pymupdf images
            logging.info("pdf2text " + str(e) + " use ocr read pdf!")
            text_list = []
            for page_info in page_info_list:
                page_no = page_info[4]
                # Table page
                if page_info[3]:
                    # Check whether the table spans pages
                    area_no = 0
                    jump_page = 0
                    for area in table_connect_list:
                        if page_no in area:
                            # Record the text only once, on the first page of the area
                            if page_no == area[0]:
                                image_text = connect_text_list[area_no][0]
                                text_list.append([image_text, page_no, 0])
                            jump_page = 1
                        area_no += 1
                    # Skip the remaining steps for connected pages
                    if jump_page:
                        continue
                    # Take the text directly
                    image_text = page_info_list[page_no][0]
                    text_list.append([image_text, page_no, 0])
                # Non-table page
                else:
                    with open(output_image_list[page_no], "rb") as ff:
                        image_stream = ff.read()
                    image_text = from_ocr_interface(image_stream)
                    text_list.append([image_text, page_no, 0])
            text_list.sort(key=lambda z: z[1])
            text = ""
            for t in text_list:
                text += t[0]
            return [text]
        except Exception as e:
            logging.info("pdf format error!")
            traceback.print_exc()
            return [-3]
        text_list = []
        page_no = 0
        pages = PDFPage.create_pages(doc)
        for page in pages:
            logging.info("pdf2text page_no " + str(page_no))
            # Limit the pdf page count: only take the first 70 pages
            if page_no >= 70:
                logging.info("pdf2text: pdf pages only get 70 pages")
                break
            # If the page number is among the table pages, reuse the generated text
            if page_no in has_table_page_no_list:
                # Check whether the table spans pages
                area_no = 0
                jump_page = 0
                for area in table_connect_list:
                    if page_no in area:
                        # Record the text only once, on the first page of the area
                        if page_no == area[0]:
                            image_text = connect_text_list[area_no][0]
                            text_list.append([image_text, page_no, 0])
                        jump_page = 1
                    area_no += 1
                # Skip the remaining steps for connected pages
                if jump_page:
                    page_no += 1
                    continue
                # Take the text directly
                image_text = page_info_list[page_no][0]
                text_list.append([image_text, page_no, 0])
                page_no += 1
                continue
            # Pages without tables are parsed with pdfminer
            else:
                if get_platform() == "Windows":
                    try:
                        interpreter.process_page(page)
                        layout = device.get_result()
                    except Exception:
                        logging.info("pdf2text pdfminer read pdf page error! continue...")
                        continue
                else:
                    # With a timeout
                    try:
                        # Parse the non-table pdf page
                        if get_platform() == "Windows":
                            origin_pdf_analyze = pdf_analyze.__wrapped__
                            layout = origin_pdf_analyze(interpreter, page, device)
                        else:
                            layout = pdf_analyze(interpreter, page, device)
                    except TimeoutError as e:
                        logging.info("pdf2text pdfminer read pdf page time out!")
                        return [-4]
                    except Exception:
                        logging.info("pdf2text pdfminer read pdf page error! continue...")
                        continue
                # Check whether the page has any text objects;
                # if not, it may only contain a watermark
                only_image = 1
                image_count = 0
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        only_image = 0
                    if isinstance(x, LTFigure):
                        image_count += 1
                # If the page has too many images, OCR the whole page directly
                logging.info("pdf2text image_count " + str(image_count))
                if image_count >= 3:
                    with open(output_image_list[page_no], "rb") as ff:
                        image_stream = ff.read()
                    image_text = from_ocr_interface(image_stream)
                    if judge_error_code(image_text):
                        return image_text
                    text_list.append([image_text, page_no, 0])
                    page_no += 1
                    continue
                order_list = []
                for x in layout:
                    if get_platform() == "Windows":
                        print()
                    if isinstance(x, LTTextBoxHorizontal):
                        image_text = x.get_text()
                        # Encoding cannot be recognized, use OCR
                        if re.search('[(]cid:[0-9]+[)]', image_text):
                            print(re.search('[(]cid:[0-9]+[)]', image_text))
                            with open(output_image_list[page_no], "rb") as ff:
                                image_stream = ff.read()
                            image_text = from_ocr_interface(image_stream)
                            if judge_error_code(image_text):
                                return image_text
                            image_text = add_div(image_text)
                            order_list.append([image_text, page_no, x.bbox[1]])
                            break
                        else:
                            image_text = add_div(image_text)
                            order_list.append([image_text, page_no, x.bbox[1]])
                            continue
                    if isinstance(x, LTFigure):
                        for image in x:
                            if isinstance(image, LTImage):
                                try:
                                    print(image.width, image.height)
                                    image_stream = image.stream.get_data()
                                    # Some watermarks make pdf splitting/reading fail.
                                    # Check the extracted image size; if it is too
                                    # large, raise and use the page image instead
                                    img_test = Image.open(io.BytesIO(image_stream))
                                    if img_test.size[1] > 2000 or img_test.size[0] > 1500:
                                        print("pdf2text LTImage size", img_test.size)
                                        raise Exception
                                    img_test.save('temp/LTImage.jpg')
                                except Exception:
                                    logging.info("pdf2text pdfminer read image in page fail! use pymupdf read image...")
                                    print(traceback.print_exc())
                                    with open(output_image_list[page_no], "rb") as ff:
                                        image_stream = ff.read()
                                image_text = from_ocr_interface(image_stream)
                                if judge_error_code(image_text):
                                    return image_text
                                # Probably only got a watermark image:
                                # no text output and only image objects
                                if image_text == "" and only_image:
                                    # Split out this pdf page
                                    try:
                                        logging.info("pdf2text guess pdf has watermark")
                                        split_path = get_single_pdf(path, page_no)
                                    except:
                                        # If splitting raises, it is probably
                                        # not a watermark; OCR the image
                                        logging.info("pdf2text guess pdf has no watermark")
                                        with open(output_image_list[page_no], "rb") as ff:
                                            image_stream = ff.read()
                                        image_text = from_ocr_interface(image_stream)
                                        order_list.append([image_text, page_no, x.bbox[1]])
                                        continue
                                    if judge_error_code(split_path):
                                        return split_path
                                    # Convert with the office conversion interface
                                    file_path = from_office_interface(split_path, unique_type_dir, 'html', 3)
                                    if judge_error_code(file_path):
                                        return file_path
                                    # Get the text of the html <p> tags
                                    image_text = get_html_p(file_path)
                                    if judge_error_code(image_text):
                                        return image_text
                                if get_platform() == "Windows":
                                    print("image_text", page_no, image_text)
                                    with open("temp" + str(x.bbox[0]) + ".jpg", "wb") as ff:
                                        ff.write(image_stream)
                                image_text = add_div(image_text)
                                order_list.append([image_text, page_no, x.bbox[1]])
                if get_platform() == "Windows":
                    print("order_list", page_no, order_list)
                order_list.sort(key=lambda z: z[2], reverse=True)
                text_list += order_list
                page_no += 1
        text = ""
        for t in text_list:
            text += t[0]
        return [text]
    except UnicodeDecodeError as e:
        logging.info("pdf2text pdfminer create pages failed! " + str(e))
        return [-3]
    except Exception as e:
        logging.info("pdf2text error!")
        print("pdf2text", traceback.print_exc())
        return [-1]
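
# Flow summary (descriptive, not in the original): pdf2text renders pages to
# images with pymupdf, runs table detection per page via image_preprocess,
# merges cross-page tables with page_table_connect, then walks the pdfminer
# layout objects page by page, falling back to OCR for image-heavy pages,
# cid-encoded text and suspected watermark-only pages.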
@get_memory_info.memory_decorator
@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
def pdf_analyze(interpreter, page, device):
    logging.info("into pdf_analyze")
    # Parse a pdf page that contains no tables
    pdf_time = time.time()
    print("pdf_analyze interpreter process...")
    interpreter.process_page(page)
    print("pdf_analyze device get_result...")
    layout = device.get_result()
    logging.info("pdf2text read time " + str(time.time() - pdf_time))
    return layout
def get_html_p(html_path):
    logging.info("into get_html_p")
    try:
        with open(html_path, "r") as ff:
            html_str = ff.read()
        soup = BeautifulSoup(html_str, 'lxml')
        text = ""
        for p in soup.find_all("p"):
            p_text = p.text
            p_text = p_text.strip()
            if p.string != "":
                text += p_text
                text += "\n"
        return text
    except Exception as e:
        logging.info("get_html_p error!")
        print("get_html_p", traceback.print_exc())
        return [-1]
def get_single_pdf(path, page_no):
    logging.info("into get_single_pdf")
    try:
        pdf_origin = PdfFileReader(path, strict=False)
        pdf_new = PdfFileWriter()
        pdf_new.addPage(pdf_origin.getPage(page_no))
        path_new = path.split(".")[0] + "_split.pdf"
        with open(path_new, "wb") as ff:
            pdf_new.write(ff)
        return path_new
    except PyPDF2.utils.PdfReadError as e:
        raise e
    except Exception as e:
        logging.info("get_single_pdf error! page " + str(page_no))
        print("get_single_pdf", traceback.print_exc())
        raise e
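
# Hedged sketch: get_single_pdf("/tmp/a.pdf", 3) (hypothetical path) writes
# page 3 alone to "/tmp/a_split.pdf" (derived via path.split(".")[0]) and
# returns that path; exceptions are re-raised so pdf2text can fall back to
# OCR when splitting fails.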
def page_table_connect(has_table_list, page_info_list):
    logging.info("into page_table_connect")
    try:
        # Check whether tables on consecutive pages are connected
        table_connect_list = []
        temp_list = []
        # Max distance from the top or bottom of the image
        threshold = 100
        for i in range(1, len(has_table_list)):
            page_info = has_table_list[i]
            last_page_info = has_table_list[i - 1]
            # Page numbers must be consecutive
            if page_info[4] - last_page_info[4] == 1:
                # The last column count of the previous page and the first of
                # the next page are both 0, and equal
                if not last_page_info[1][-1] and not page_info[1][0] and \
                        last_page_info[1][-1] == page_info[1][0]:
                    # The previous page's outline must be within the threshold of its
                    # bottom, and the next page's outline within the threshold of its top
                    if page_info[5][0] - last_page_info[2][-1][1][1] <= threshold and \
                            page_info[2][0][0][1] - 0 <= 100:
                        temp_list.append(last_page_info[4])
                        temp_list.append(page_info[4])
                        continue
            # Conditions not met: store the connected page numbers collected so far
            if len(temp_list) > 1:
                temp_list = list(set(temp_list))
                temp_list.sort(key=lambda x: x)
                table_connect_list.append(temp_list)
            temp_list = []
        if len(temp_list) > 1:
            temp_list = list(set(temp_list))
            temp_list.sort(key=lambda x: x)
            table_connect_list.append(temp_list)
            temp_list = []
        # Join the contents of the connected pages
        connect_text_list = []
        for area in table_connect_list:
            first_page_no = area[0]
            area_page_text = str(page_info_list[first_page_no][0])
            for i in range(1, len(area)):
                current_page_no = area[i]
                current_page_text = page_info_list[current_page_no][0]
                # Join the two tables: drop the opening tag of the current page...
                table_prefix = re.finditer('<table border="1">', current_page_text)
                index_list = []
                for t in table_prefix:
                    index_list.append(t.span())
                delete_index = index_list[0]
                current_page_text = current_page_text[:delete_index[0]] \
                    + current_page_text[delete_index[1]:]
                # ...and the closing tag of the accumulated text
                table_suffix = re.finditer('</table>', area_page_text)
                index_list = []
                for t in table_suffix:
                    index_list.append(t.span())
                delete_index = index_list[-1]
                area_page_text = area_page_text[:delete_index[0]] \
                    + area_page_text[delete_index[1]:]
                area_page_text = area_page_text + current_page_text
            connect_text_list.append([area_page_text, area])
        return table_connect_list, connect_text_list
    except Exception as e:
        logging.info("page_table_connect error!")
        print("page_table_connect", traceback.print_exc())
        return [-1], [-1]
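
# Illustrative example: if pages 2 and 3 each end/start with the same
# zero-column table, the last '</table>' of page 2's text and the first
# '<table border="1">' of page 3's text are deleted, so the two fragments
# concatenate into a single <table> spanning both pages.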
@get_memory_info.memory_decorator
def zip2text(path, unique_type_dir):
    logging.info("into zip2text")
    try:
        zip_path = unique_type_dir
        try:
            zip_file = zipfile.ZipFile(path)
            zip_list = zip_file.namelist()
            if get_platform() == "Windows":
                if os.path.exists(zip_list[0]):
                    print("zip2text exists")
            # Extract the files one by one to the target directory
            file_list = []
            for f in zip_list:
                file_list.append(zip_file.extract(f, path=zip_path))
            zip_file.close()
        except Exception as e:
            logging.info("zip format error!")
            print("zip format error!", traceback.print_exc())
            return [-3]
        # Rename the inner files
        file_list = rename_inner_files(zip_path)
        if judge_error_code(file_list):
            return file_list
        if get_platform() == "Windows":
            print("============= zip file list")
        text = []
        for file in file_list:
            if os.path.isdir(file):
                continue
            # No file extension: guess the format
            if len(file.split(".")) <= 1:
                logging.info(str(file) + " has no type! Guess type...")
                _type = judge_format(file)
                if _type is None:
                    logging.info(str(file) + " cannot guess type!")
                    sub_text = [""]
                else:
                    logging.info(str(file) + " guess type: " + _type)
                    new_file = str(file) + "." + _type
                    os.rename(file, new_file)
                    file = new_file
                    sub_text = getText(_type, file)
            # Has a file extension: take it from the name
            else:
                _type = file.split(".")[-1]
                sub_text = getText(_type, file)
            if judge_error_code(sub_text, code=[-3]):
                continue
            if judge_error_code(sub_text):
                return sub_text
            text = text + sub_text
        return text
    except Exception as e:
        logging.info("zip2text error!")
        print("zip2text", traceback.print_exc())
        return [-1]
@get_memory_info.memory_decorator
def rar2text(path, unique_type_dir):
    logging.info("into rar2text")
    try:
        rar_path = unique_type_dir
        try:
            # Extract with unrar via the shell
            _signal = os.system("unrar x " + path + " " + rar_path)
            print("rar2text _signal", _signal)
            # 0 means extraction succeeded
            if _signal != 0:
                raise Exception
        except Exception as e:
            logging.info("rar format error!")
            print("rar format error!", e)
            return [-3]
        if get_platform() == "Windows":
            print("============= rar file list")
        # Rename the inner files
        file_list = rename_inner_files(rar_path)
        if judge_error_code(file_list):
            return file_list
        text = []
        for file in file_list:
            if os.path.isdir(file):
                continue
            # No file extension: guess the format
            if len(file.split(".")) <= 1:
                logging.info(str(file) + " has no type! Guess type...")
                _type = judge_format(file)
                if _type is None:
                    logging.info(str(file) + " cannot guess type!")
                    sub_text = [""]
                else:
                    logging.info(str(file) + " guess type: " + _type)
                    new_file = str(file) + "." + _type
                    os.rename(file, new_file)
                    file = new_file
                    sub_text = getText(_type, file)
            # Has a file extension: take it from the name
            else:
                _type = file.split(".")[-1]
                sub_text = getText(_type, file)
            if judge_error_code(sub_text, code=[-3]):
                continue
            if judge_error_code(sub_text):
                return sub_text
            text = text + sub_text
        return text
    except Exception as e:
        logging.info("rar2text error!")
        print("rar2text", traceback.print_exc())
        return [-1]
def inner_file_rename(path_list):
    logging.info("into inner_file_rename")
    try:
        # First filter dots '.' out of the file names
        path_list.sort(key=lambda x: len(x), reverse=True)
        for i in range(len(path_list)):
            old_path = path_list[i]
            # For directories, check whether the last level needs filtering and renaming
            if os.path.isdir(old_path):
                ps = old_path.split(os.sep)
                old_p = ps[-2]
                if '.' in old_p:
                    new_p = re.sub("\\.", "", old_p)
                    new_path = ""
                    for p in ps[:-2]:
                        new_path += p + os.sep
                    new_path += new_p + os.sep
                    # Rename and update the list
                    os.rename(old_path, new_path)
                    for j in range(len(path_list)):
                        if old_path in path_list[j]:
                            path_list[j] = re.sub(old_p, new_p, path_list[j]) + os.sep
        # Split each path and rank by the number of components
        path_len_list = []
        for p in path_list:
            p_ss = p.split(os.sep)
            temp_p_ss = []
            for pp in p_ss:
                if pp == "":
                    continue
                temp_p_ss.append(pp)
            p_ss = temp_p_ss
            path_len_list.append([p, p_ss, len(p_ss)])
        # Rename starting from the paths with the fewest components, i.e. from the root
        path_len_list.sort(key=lambda x: x[2])
        # Find the level below which directories keep their names
        no_change_level = 0
        loop = 0
        for p_s in path_len_list[0][1]:
            if p_s[-4:] == "_rar" or p_s[-4:] == "_zip":
                no_change_level += loop
                loop = 0
            loop += 1
        no_change_level += 1
        # For each path
        new_path_list = []
        for path_len in path_len_list:
            # The first n components form the fixed path
            new_path = ""
            for i in range(no_change_level):
                new_path += path_len[1][i] + os.sep
            old_path = new_path
            if not get_platform() == "Windows":
                old_path = os.sep + old_path
                new_path = os.sep + new_path
            count = 0
            for p in path_len[1][no_change_level:]:
                # The new path is converted to hashes throughout
                new_path += str(hash(p))
                # No os.sep after the last component; the last component
                # of the old path keeps its original name
                if count < len(path_len[1][no_change_level:]) - 1:
                    old_path += str(hash(p)) + os.sep
                    new_path += os.sep
                else:
                    old_path += p
                count += 1
            # For directories append os.sep
            if os.path.isdir(path_len[0]):
                new_path += os.sep
                old_path += os.sep
            # For files append the file extension
            else:
                p_ss = path_len[1][-1].split(".")
                if len(p_ss) > 1:
                    path_suffix = "." + p_ss[-1]
                    new_path += path_suffix
            print("inner_file_rename", old_path, "to", new_path)
            os.rename(old_path, new_path)
            new_path_list.append(new_path)
        return new_path_list
    except Exception as e:
        logging.info("inner_file_rename error!")
        print("inner_file_rename", traceback.print_exc())
        return [-1]
def rename_inner_files(root_path):
    try:
        logging.info("into rename_inner_files")
        # Collect all files and directories under the extraction folder, without the root path
        path_list = []
        for root, dirs, files in os.walk(root_path, topdown=False):
            for name in dirs:
                p = os.path.join(root, name) + os.sep
                p = re.sub(root_path, "", p)
                path_list.append(p)
            for name in files:
                p = os.path.join(root, name)
                p = re.sub(root_path, "", p)
                path_list.append(p)
        # Sort by path length
        path_list.sort(key=lambda x: len(x), reverse=True)
        # Rename in a loop
        for old_path in path_list:
            # Split on the path separator
            ss = old_path.split(os.sep)
            # Check whether it is a directory
            is_dir = 0
            file_type = ""
            if os.path.isdir(root_path + old_path):
                ss = ss[:-1]
                is_dir = 1
            else:
                if "." in old_path:
                    file_type = "." + old_path.split(".")[-1]
                else:
                    file_type = ""
            # The last level is renamed with a hash
            new_path = ""
            current_level = 0
            for s in ss:
                # Rebuild the path
                if current_level < len(ss) - 1:
                    new_path += s + os.sep
                else:
                    new_path += str(hash(s)) + file_type
                current_level += 1
            new_ab_path = root_path + new_path
            old_ab_path = root_path + old_path
            os.rename(old_ab_path, new_ab_path)
        # Collect all files and directories under the extraction folder again
        new_path_list = []
        for root, dirs, files in os.walk(root_path, topdown=False):
            for name in dirs:
                new_path_list.append(os.path.join(root, name) + os.sep)
            for name in files:
                new_path_list.append(os.path.join(root, name))
        return new_path_list
    except:
        traceback.print_exc()
        return [-1]
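
# Hedged example: with root_path "/tmp/u_zip/" (hypothetical), an entry
# "dir/报价 单.xlsx" is renamed in place to "dir/" + str(hash("报价 单.xlsx"))
# + ".xlsx": only the last path level is hashed, the original extension is
# re-appended via file_type, so later shell calls such as the unrar command
# in rar2text never see spaces or non-ASCII names.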
@get_memory_info.memory_decorator
def xls2text(path, unique_type_dir):
    logging.info("into xls2text")
    try:
        # Convert xls to xlsx with the libreoffice conversion interface
        file_path = from_office_interface(path, unique_type_dir, 'xlsx')
        if judge_error_code(file_path):
            return file_path
        text = xlsx2text(file_path, unique_type_dir)
        if judge_error_code(text):
            return text
        return text
    except Exception as e:
        logging.info("xls2text error!")
        print("xls2text", traceback.print_exc())
        return [-1]
@get_memory_info.memory_decorator
def xlsx2text(path, unique_type_dir):
    logging.info("into xlsx2text")
    try:
        try:
            # sheet_name=None reads all sheets into a dict
            df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
        except Exception as e:
            logging.info("xlsx format error!")
            return [-3]
        df_list = [sheet for sheet in df_dict.values()]
        sheet_text = ""
        for df in df_list:
            text = '<table border="1">' + "\n"
            for index, row in df.iterrows():
                text = text + "<tr>"
                for r in row:
                    text = text + "<td>" + str(r) + "</td>" + "\n"
                text = text + "</tr>" + "\n"
            text = text + "</table>" + "\n"
            sheet_text += text
        return [sheet_text]
    except Exception as e:
        logging.info("xlsx2text error!")
        print("xlsx2text", traceback.print_exc())
        return [-1]
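
# Illustrative output shape: each sheet becomes one '<table border="1">'
# block with one <tr> per row and one <td> per cell; the blocks for all
# sheets are concatenated and returned as a one-element list [sheet_text].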


@get_memory_info.memory_decorator
def swf2text(path, unique_type_dir):
    logging.info("into swf2text")
    try:
        try:
            with open(path, 'rb') as f:
                swf_file = SWF(f)
                svg_exporter = SVGExporter()
                svg = swf_file.export(svg_exporter)
                # with open('swf_export.jpg', 'wb') as f:
                #     f.write(svg.read())
                swf_str = str(svg.getvalue(), encoding='utf-8')
        except Exception as e:
            logging.info("swf format error!")
            traceback.print_exc()
            return [-3]
        # Locate each embedded image element with a regex
        result0 = re.finditer('<image id=(.[^>]*)', swf_str)
        image_bytes_list = []
        i = 0
        image_path_prefix = path.split(".")[-2] + "_" + path.split(".")[-1]
        image_path_list = []
        for r in result0:
            # Slice out this image's attribute span
            swf_str0 = swf_str[r.span()[0]:r.span()[1] + 1]
            # Regex out the image's base64 payload
            result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
            swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
            reg1_prefix = 'b\''
            result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
            swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
            # base64_str -> base64_bytes -> unescaped base64_bytes -> bytes -> image
            base64_bytes_with_double = bytes(swf_str1, "utf-8")
            base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
            image_bytes = base64.b64decode(base64_bytes)
            image_bytes_list.append(image_bytes)
            image_path = image_path_prefix + "_page_" + str(i) + ".png"
            with open(image_path, 'wb') as f:
                f.write(image_bytes)
            image_path_list.append(image_path)
            # Regex out the image's width and height
            # reg2_prefix = 'width="'
            # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
            # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
            # width = swf_str2
            # reg2_prefix = 'height="'
            # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
            # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
            # height = swf_str2
            i += 1
        text_list = []
        # print("image_path_list", image_path_list)
        for image_path in image_path_list:
            text = picture2text(image_path)
            # print("text", text)
            if judge_error_code(text, code=[-3]):
                continue
            if judge_error_code(text):
                return text
            text = text[0]
            text_list.append(text)
        text = ""
        for t in text_list:
            text += t
        return [text]
    except Exception as e:
        logging.info("swf2text error!")
        print("swf2text", traceback.print_exc())
        return [-1]
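

# Illustrative sketch (standalone): pulling base64-encoded images out of SVG
# markup via data: URIs, the idea swf2text applies to the exported SVG. The
# regex and the expected markup shape here are assumptions for demonstration.
def _demo_extract_svg_images(svg_markup):
    payloads = []
    for m in re.finditer(r'xlink:href="data:image/[^;]+;base64,([^"]+)"', svg_markup):
        payloads.append(base64.b64decode(m.group(1)))
    return payloads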


@get_memory_info.memory_decorator
def picture2text(path, html=False):
    logging.info("into picture2text")
    try:
        # Detect any tables in the image
        img = cv2.imread(path)
        if img is None:
            return [-3]
        # if get_platform() == "Windows":
        #     print("picture2text img", img)
        text, column_list, outline_points, is_table = image_preprocess(img, path)
        if judge_error_code(text):
            return text
        # if text == [-5]:
        #     return [-5]
        # if text == [-2]:
        #     return [-2]
        # if text == [-1]:
        #     return [-1]
        if html:
            text = add_div(text)
        return [text]
    except Exception as e:
        logging.info("picture2text error!")
        print("picture2text", traceback.print_exc())
        return [-1]


port_num = [0]


def choose_port():
    # Round-robin across the four local service ports
    process_num = 4
    if port_num[0] % process_num == 0:
        _url = local_url + ":15011"
    elif port_num[0] % process_num == 1:
        _url = local_url + ":15012"
    elif port_num[0] % process_num == 2:
        _url = local_url + ":15013"
    else:
        _url = local_url + ":15014"
    port_num[0] = port_num[0] + 1
    return _url
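

# Illustrative sketch: the same round-robin dispatch written as a closure.
# choose_port above is what the module actually uses; the port list is an
# assumption for demonstration.
# Usage: next_url = _make_round_robin(local_url, [15011, 15012, 15013, 15014])
def _make_round_robin(base_url, ports):
    counter = [0]

    def _next():
        url = base_url + ":" + str(ports[counter[0] % len(ports)])
        counter[0] += 1
        return url

    return _next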


@get_memory_info.memory_decorator
def from_ocr_interface(image_stream, is_table=False):
    logging.info("into from_ocr_interface")
    try:
        base64_stream = base64.b64encode(image_stream)
        # Call the OCR interface
        try:
            r = ocr(data=base64_stream, ocr_model=globals().get("global_ocr_model"))
        except TimeoutError:
            if is_table:
                return [-5], [-5]
            else:
                return [-5]
        except requests.exceptions.ConnectionError as e:
            if is_table:
                return [-2], [-2]
            else:
                return [-2]
        _dict = r
        text_list = eval(_dict.get("text"))
        bbox_list = eval(_dict.get("bbox"))
        if text_list is None:
            text_list = []
        if bbox_list is None:
            bbox_list = []
        if is_table:
            return text_list, bbox_list
        else:
            if text_list and bbox_list:
                text = get_sequential_data(text_list, bbox_list, html=True)
                if judge_error_code(text):
                    return text
                # if text == [-1]:
                #     return [-1]
            else:
                text = ""
            return text
    except Exception as e:
        logging.info("from_ocr_interface error!")
        # print("from_ocr_interface", e, global_type)
        if is_table:
            return [-1], [-1]
        else:
            return [-1]
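

# Illustrative sketch: how a caller consumes from_ocr_interface under the
# module's negative-error-code convention. The image path is hypothetical.
def _demo_ocr_call(image_path):
    with open(image_path, "rb") as f:
        image_stream = f.read()
    text = from_ocr_interface(image_stream)
    if judge_error_code(text):
        # [-5] timeout, [-2] connection error, [-1] logic error
        return text
    return [text]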


@get_memory_info.memory_decorator
def from_otr_interface(image_stream):
    logging.info("into from_otr_interface")
    try:
        base64_stream = base64.b64encode(image_stream)
        # Call the OTR interface
        try:
            r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
        except TimeoutError:
            return [-5], [-5], [-5], [-5]
        except requests.exceptions.ConnectionError as e:
            logging.info("from_otr_interface")
            print("from_otr_interface", traceback.print_exc())
            return [-2], [-2], [-2], [-2]
        # Unpack the response
        _dict = r
        points = eval(_dict.get("points"))
        split_lines = eval(_dict.get("split_lines"))
        bboxes = eval(_dict.get("bboxes"))
        outline_points = eval(_dict.get("outline_points"))
        # print("from_otr_interface len(bboxes)", len(bboxes))
        if points is None:
            points = []
        if split_lines is None:
            split_lines = []
        if bboxes is None:
            bboxes = []
        if outline_points is None:
            outline_points = []
        return points, split_lines, bboxes, outline_points
    except Exception as e:
        logging.info("from_otr_interface error!")
        print("from_otr_interface", traceback.print_exc())
        return [-1], [-1], [-1], [-1]


def from_office_interface(src_path, dest_path, target_format, retry_times=1):
    try:
        # On Windows, skip the timeout decorator
        if get_platform() == "Windows":
            # origin_office_convert = office_convert.__wrapped__
            # file_path = origin_office_convert(src_path, dest_path, target_format, retry_times)
            file_path = office_convert(src_path, dest_path, target_format, retry_times)
        else:
            # Wrap the decorator in a class, otherwise multiprocessing pickling
            # fails with "it's not the same object as xxx"
            # timeout_decorator_obj = my_timeout_decorator.TimeoutClass(office_convert, 180, TimeoutError)
            # file_path = timeout_decorator_obj.run(src_path, dest_path, target_format, retry_times)
            file_path = office_convert(src_path, dest_path, target_format, retry_times)
        # Error codes pass straight through to the caller
        return file_path
    except TimeoutError:
        logging.info("from_office_interface timeout error!")
        return [-5]
    except:
        logging.info("from_office_interface error!")
        print("from_office_interface", traceback.print_exc())
        return [-1]


def get_sequential_data(text_list, bbox_list, html=False):
    logging.info("into get_sequential_data")
    try:
        text = ""
        order_list = []
        for i in range(len(text_list)):
            length_start = bbox_list[i][0][0]
            length_end = bbox_list[i][1][0]
            height_start = bbox_list[i][0][1]
            height_end = bbox_list[i][-1][1]
            # print([length_start, length_end, height_start, height_end])
            order_list.append([text_list[i], length_start, length_end, height_start, height_end])
        if get_platform() == "Windows":
            print("get_sequential_data", order_list)
        if not order_list:
            if get_platform() == "Windows":
                print("get_sequential_data", "no order list")
            return ""
        # Sort the output by bbox coordinates
        order_list.sort(key=lambda x: (x[3], x[1]))
        # Group into rows and columns by bbox
        # col_list = []
        # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
        # for i in range(len(order_list)):
        #     if height_end - threshold <= order_list[i][3] <= height_end + threshold:
        #         col_list.append(order_list[i])
        #     else:
        #         row_list.append(col_list)
        #         col_list = []
        #         height_end = int((order_list[i][4] + order_list[i][3]) / 2)
        #     col_list.append(order_list[i])
        #     if i == len(order_list) - 1:
        #         row_list.append(col_list)
        row_list = []
        used_box = []
        threshold = 5
        for box in order_list:
            if box in used_box:
                continue
            height_center = (box[4] + box[3]) / 2
            row = []
            for box2 in order_list:
                if box2 in used_box:
                    continue
                height_center2 = (box2[4] + box2[3]) / 2
                if height_center - threshold <= height_center2 <= height_center + threshold:
                    if box2 not in row:
                        row.append(box2)
                        used_box.append(box2)
            # Sort boxes within the row left to right by x start
            row.sort(key=lambda x: x[1])
            row_list.append(row)
        for row in row_list:
            if not row:
                continue
            if len(row) <= 1:
                text = text + row[0][0] + "\n"
            else:
                sub_text = ""
                row.sort(key=lambda x: x[1])
                for col in row:
                    sub_text = sub_text + col[0] + " "
                sub_text = sub_text + "\n"
                text += sub_text
        if html:
            text = "<div>" + text
            text = re.sub("\n", "</div>\n<div>", text)
            text += "</div>"
            # if text[-5:] == "<div>":
            #     text = text[:-5]
        return text
    except Exception as e:
        logging.info("get_sequential_data error!")
        print("get_sequential_data", traceback.print_exc())
        return [-1]
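

# Illustrative sketch: grouping OCR boxes into text lines by the vertical
# centre of each bbox, the core idea behind get_sequential_data. Boxes are
# hypothetical [text, x_start, x_end, y_start, y_end] records.
def _demo_group_rows(boxes, threshold=5):
    rows = []
    used = set()
    for i, box in enumerate(boxes):
        if i in used:
            continue
        center = (box[3] + box[4]) / 2
        row = []
        for j, other in enumerate(boxes):
            if j in used:
                continue
            other_center = (other[3] + other[4]) / 2
            if abs(other_center - center) <= threshold:
                row.append(other)
                used.add(j)
        row.sort(key=lambda b: b[1])  # left to right within the line
        rows.append(row)
    return rows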


def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
    logging.info("into get_formatted_table")
    try:
        # Redefine text_bbox_list as [point, point, text]
        text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
                          range(len(text_bbox_list))]
        # Sort by y coordinate
        text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
        table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
        # print("text_bbox_list", text_bbox_list)
        # print("table_bbox_list", table_bbox_list)
        # bbox position threshold
        threshold = 5
        # Partition by split_line; one area may contain several tables [(), ()]
        area_text_bbox_list = []
        area_table_bbox_list = []
        # print("get_formatted_table, split_line", split_line)
        for j in range(1, len(split_line)):
            last_y = split_line[j - 1][0][1]
            current_y = split_line[j][0][1]
            temp_text_bbox_list = []
            temp_table_bbox_list = []
            # Collect the text bboxes in this area
            for text_bbox in text_bbox_list:
                # Compute the text bbox center
                text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
                                    (text_bbox[1][1] + text_bbox[0][1]) / 2)
                if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
                    temp_text_bbox_list.append(text_bbox)
            area_text_bbox_list.append(temp_text_bbox_list)
            # Collect the table bboxes in this area
            for table_bbox in table_bbox_list:
                # Compute the table bbox center
                table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
                                     (table_bbox[1][1] + table_bbox[0][1]) / 2)
                if last_y < table_bbox_center[1] < current_y:
                    temp_table_bbox_list.append(table_bbox)
            area_table_bbox_list.append(temp_table_bbox_list)
        # Match the two bbox sets in each area and build its table
        area_text_list = []
        area_column_list = []
        for j in range(len(area_text_bbox_list)):
            # The table bboxes and text bboxes of this area
            temp_table_bbox_list = area_table_bbox_list[j]
            temp_text_bbox_list = area_text_bbox_list[j]
            # If the area holds no table bboxes, just join its text
            if not temp_table_bbox_list:
                # Collect all text bboxes of the area
                only_text_list = []
                only_bbox_list = []
                for text_bbox in temp_text_bbox_list:
                    only_text_list.append(text_bbox[2])
                    only_bbox_list.append([text_bbox[0], text_bbox[1]])
                only_text = get_sequential_data(only_text_list, only_bbox_list, True)
                if only_text == [-1]:
                    return [-1], [-1]
                area_text_list.append(only_text)
                area_column_list.append(0)
                continue
            # The area holds tables
            # Map each table cell to its text
            text_in_table = {}
            for i in range(len(temp_text_bbox_list)):
                text_bbox = temp_text_bbox_list[i]
                # Compute the text bbox center
                text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
                                    (text_bbox[1][1] + text_bbox[0][1]) / 2)
                # Find which table bbox contains the center point
                for table_bbox in temp_table_bbox_list:
                    # Center lies inside this table bbox: record the text
                    if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
                            table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
                        if str(table_bbox) in text_in_table.keys():
                            text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
                        else:
                            text_in_table[str(table_bbox)] = text_bbox[2]
                        break
                    # If no table bbox matched this text bbox, retry with a larger threshold
                    # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
                    #       table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
                    #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
                    #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
                    #         (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
                    #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
                    #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
                    #          table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
                    #     if str(table_bbox) in text_in_table.keys():
                    #         text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
                    #     else:
                    #         text_in_table[str(table_bbox)] = text_bbox[2]
                    #     break
            # Split the cells into rows/columns and count the total sub-columns
            # Collect the coordinates
            all_col_list = []
            all_row_list = []
            for i in range(len(temp_table_bbox_list)):
                table_bbox = temp_table_bbox_list[i]
                # Collect every x coordinate
                if table_bbox[0][0] not in all_col_list:
                    all_col_list.append(table_bbox[0][0])
                if table_bbox[1][0] not in all_col_list:
                    all_col_list.append(table_bbox[1][0])
                # Collect every y coordinate
                if table_bbox[0][1] not in all_row_list:
                    all_row_list.append(table_bbox[0][1])
                if table_bbox[1][1] not in all_row_list:
                    all_row_list.append(table_bbox[1][1])
            all_col_list.sort()
            all_row_list.sort()
            # Split into rows
            row_list = []
            rows = []
            temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
            y_row = temp_table_bbox_list[0][0][1]
            for i in range(len(temp_table_bbox_list)):
                table_bbox = temp_table_bbox_list[i]
                if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
                    rows.append(table_bbox)
                else:
                    y_row = table_bbox[0][1]
                    if rows:
                        rows.sort(key=lambda x: x[0][0])
                        row_list.append(rows)
                        rows = []
                    rows.append(table_bbox)
                # print("*" * 30)
                # print(row_list)
                if i == len(temp_table_bbox_list) - 1:
                    if rows:
                        rows.sort(key=lambda x: x[0][0])
                        row_list.append(rows)
            # Build the table: cell text plus cell spans
            text = '<table border="1">' + "\n"
            for row in row_list:
                text += "<tr>" + "\n"
                for col in row:
                    # Count grid y coordinates strictly inside the cell; +1 gives the row span
                    row_span = 1
                    for y in all_row_list:
                        if col[0][1] < y < col[1][1]:
                            if y - col[0][1] >= 2 and col[1][1] - y >= 2:
                                row_span += 1
                    # Count grid x coordinates strictly inside the cell; +1 gives the column span
                    col_span = 1
                    for x in all_col_list:
                        if col[0][0] < x < col[1][0]:
                            if x - col[0][0] >= 2 and col[1][0] - x >= 2:
                                col_span += 1
                    text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
                    if str(col) in text_in_table.keys():
                        text += text_in_table.get(str(col))
                    text += "</td>" + "\n"
                text += "</tr>" + "\n"
            text += "</table>" + "\n"
            # Track the widest row
            max_col_num = 0
            for row in row_list:
                col_num = len(row)
                if max_col_num < col_num:
                    max_col_num = col_num
            area_text_list.append(text)
            area_column_list.append(max_col_num)
        text = ""
        if get_platform() == "Windows":
            print("get_formatted_table area_text_list", area_text_list)
        for area_text in area_text_list:
            text += area_text
        return text, area_column_list
    except Exception as e:
        logging.info("get_formatted_table error!")
        print("get_formatted_table", traceback.print_exc())
        return [-1], [-1]
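

# Illustrative sketch: deriving rowspan/colspan for a merged cell from the
# sorted grid-line coordinates, as get_formatted_table does above. The cell
# and grid lists are hypothetical pixel values.
def _demo_cell_spans(cell, all_col_list, all_row_list):
    (x0, y0), (x1, y1) = cell
    # Every grid line strictly inside the cell adds one more spanned track
    col_span = 1 + sum(1 for x in all_col_list if x0 + 2 <= x <= x1 - 2)
    row_span = 1 + sum(1 for y in all_row_list if y0 + 2 <= y <= y1 - 2)
    return col_span, row_span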


def getText(_type, path_or_stream):
    print("file type - " + _type)
    logging.info("file type - " + _type)
    try:
        ss = path_or_stream.split(".")
        unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
    except:
        unique_type_dir = path_or_stream + "_" + _type + os.sep
    if _type == "pdf":
        return pdf2text(path_or_stream, unique_type_dir)
    if _type == "docx":
        return docx2text(path_or_stream, unique_type_dir)
    if _type == "zip":
        return zip2text(path_or_stream, unique_type_dir)
    if _type == "rar":
        return rar2text(path_or_stream, unique_type_dir)
    if _type == "xlsx":
        return xlsx2text(path_or_stream, unique_type_dir)
    if _type == "xls":
        return xls2text(path_or_stream, unique_type_dir)
    if _type == "doc":
        return doc2text(path_or_stream, unique_type_dir)
    if _type == "jpg" or _type == "png" or _type == "jpeg":
        return picture2text(path_or_stream)
    if _type == "swf":
        return swf2text(path_or_stream, unique_type_dir)
    if _type == "txt":
        return txt2text(path_or_stream)
    return [""]


def to_html(path, text):
    with open(path, 'w') as f:
        f.write("<!DOCTYPE HTML>")
        f.write('<head><meta charset="UTF-8"></head>')
        f.write("<body>")
        f.write(text)
        f.write("</body>")


def resize_image(image_path, size):
    try:
        image_np = cv2.imread(image_path)
        # print(image_np.shape)
        width = image_np.shape[1]
        height = image_np.shape[0]
        h_w_rate = height / width
        # width_standard = 900
        # height_standard = 1400
        width_standard = size[1]
        height_standard = size[0]
        width_new = int(height_standard / h_w_rate)
        height_new = int(width_standard * h_w_rate)
        if width > width_standard:
            image_np = cv2.resize(image_np, (width_standard, height_new))
        elif height > height_standard:
            image_np = cv2.resize(image_np, (width_new, height_standard))
        cv2.imwrite(image_path, image_np)
        # print("resize_image", image_np.shape)
        return
    except Exception as e:
        logging.info("resize_image")
        print("resize_image", e, global_type)
        return


def remove_red_seal(image_np):
    """
    Remove red seal stamps from an image.
    """
    # Take the red channel
    blue_c, green_c, red_c = cv2.split(image_np)
    # With cv2.THRESH_OTSU and thresh=0, the algorithm picks the optimal threshold
    thresh, ret = cv2.threshold(red_c, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # print("remove_red_seal thresh", thresh)
    # Empirically, scaling the Otsu threshold down slightly works better
    filter_condition = int(thresh * 0.98)
    thresh1, red_thresh = cv2.threshold(red_c, filter_condition, 255, cv2.THRESH_BINARY)
    # Convert back to a 3-channel image
    image_and = np.expand_dims(red_thresh, axis=2)
    image_and = np.concatenate((image_and, image_and, image_and), axis=-1)
    # print(image_and.shape)
    # Erode (thickens the dark strokes)
    gray = cv2.cvtColor(image_and, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    erode = cv2.erode(gray, kernel)
    cv2.imshow("erode", erode)
    cv2.waitKey(0)
    image_and = np.bitwise_and(cv2.bitwise_not(blue_c), cv2.bitwise_not(erode))
    result_img = cv2.bitwise_not(image_and)
    cv2.imshow("remove_red_seal", result_img)
    cv2.waitKey(0)
    return result_img


def remove_underline(image_np):
    """
    Remove underlines beneath text.
    """
    # Grayscale
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Binarize
    binary = cv2.adaptiveThreshold(~gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                   15, 10)
    # Sobel kernels
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)
    rows, cols = binary.shape
    # Detect horizontal lines
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return


def getMDFFromFile(path):
    """
    Return the file's MD5 hexdigest and its byte length, reading in 4 KB chunks.
    """
    _length = 0
    try:
        _md5 = hashlib.md5()
        with open(path, "rb") as ff:
            while True:
                data = ff.read(4096)
                if not data:
                    break
                _length += len(data)
                _md5.update(data)
        return _md5.hexdigest(), _length
    except Exception as e:
        traceback.print_exc()
        return None, _length
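

# Illustrative sketch: the same chunked-hash pattern as getMDFFromFile,
# written standalone with iter()'s sentinel form. The path argument is
# hypothetical.
def _demo_file_md5(path):
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5.update(chunk)
    return md5.hexdigest()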


def add_html_format(text_list):
    new_text_list = []
    for t in text_list:
        html_t = "<!DOCTYPE HTML>\n"
        html_t += '<head><meta charset="UTF-8"></head>\n'
        html_t += "<body>\n"
        html_t += t
        html_t += "\n</body>\n"
        new_text_list.append(html_t)
    return new_text_list


@timeout_decorator.timeout(1200, timeout_exception=TimeoutError)
def unique_temp_file_process(stream, _type):
    logging.info("into unique_temp_file_process")
    # Initialize so the finally block never sees an unbound name
    file_path = ""
    try:
        # Create a unique workspace under temp for every call
        uid1 = uuid.uuid1().hex
        unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
        # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
        # Handle name collisions
        if not os.path.exists(unique_space_path):
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(unique_space_path)
        else:
            uid2 = uuid.uuid1().hex
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
            # os.mkdir("/mnt/" + "temp/" + uid2 + "/")
        # Inside the workspace, save the incoming file under a unique name too
        uid3 = uuid.uuid1().hex
        file_path = unique_space_path + uid3 + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)
        # Skip some known-bad files by MD5
        md5_and_length = getMDFFromFile(file_path)
        print("getMDFFromFile", md5_and_length)
        if md5_and_length[0] in ('84dba5a65339f338d3ebdf9f33fae13e',
                                 '3d9f9f4354582d85b21b060ebd5786db',
                                 'b52da40f24c6b29dfc2ebeaefe4e41f1',
                                 'eefb925b7ccec1467be20b462fde2a09'):
            raise Exception
        text = getText(_type, file_path)
        return text
    except Exception as e:
        # print("Convert error! Delete temp file. ", e, global_type)
        logging.info("unique_temp_file_process")
        print("unique_temp_file_process:", traceback.print_exc())
        return [-1]
    finally:
        print("======================================")
        print("File md5:", getMDFFromFile(file_path))
        try:
            if get_platform() == "Linux":
                # Delete everything under the unique workspace
                if os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
            print()
        except Exception as e:
            logging.info("Delete Files Failed!")
            # print("Delete Files Failed!")
            return [-1]
        print("Finally")
        # to_html(_path + "6.html", text[0])
        # to_html(unique_space_path + "result.html", text[0])
        # return text
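

# Illustrative sketch: the same per-call isolation written with the standard
# library's tempfile module. unique_temp_file_process above is what the
# pipeline actually uses; this just makes the pattern explicit.
def _demo_isolated_process(stream, _type):
    import tempfile
    workspace = tempfile.mkdtemp(prefix="convert_")
    try:
        file_path = os.path.join(workspace, "input." + _type)
        with open(file_path, "wb") as f:
            f.write(stream)
        return getText(_type, file_path)
    finally:
        # Always remove the per-call workspace, as the Linux branch above does
        shutil.rmtree(workspace, ignore_errors=True)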


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def log(msg):
    """
    @summary: log a message
    """
    logger.info(msg)


def cut_str(text_list, only_text_list, max_bytes_length=2000000):
    logging.info("into cut_str")
    try:
        # Total bytes of the formatted text
        bytes_length = 0
        for text in text_list:
            bytes_length += len(bytes(text, encoding='utf-8'))
        print("text_list", bytes_length)
        # Under the limit: return unchanged
        if bytes_length < max_bytes_length:
            print("return text_list no cut")
            return text_list
        # Join all the text and re-count the unformatted bytes
        all_text = ""
        bytes_length = 0
        for text in only_text_list:
            bytes_length += len(bytes(text, encoding='utf-8'))
            all_text += text
        print("only_text_list", bytes_length)
        # Under the limit: return unchanged
        if bytes_length < max_bytes_length:
            print("return only_text_list no cut")
            return only_text_list
        # Truncate by characters; /3 because a UTF-8 character is at most 3 bytes here
        all_text = all_text[:int(max_bytes_length/3)]
        print("text bytes ", len(bytes(all_text, encoding='utf-8')))
        print("return only_text_list has cut")
        return [all_text]
    except Exception as e:
        logging.info("cut_str " + str(e))
        return ["-1"]
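

# Illustrative sketch: why cut_str slices to max_bytes_length / 3. A CJK
# character encodes to 3 bytes in UTF-8, so an n-character slice is at most
# 3 * n bytes. The input string is hypothetical.
def _demo_byte_budget(text, max_bytes_length=2000000):
    if len(bytes(text, encoding="utf-8")) < max_bytes_length:
        return text
    return text[:int(max_bytes_length / 3)]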


@get_memory_info.memory_decorator
def convert(data, ocr_model, otr_model):
    """
    Interface return codes:
    {[str], 1}: success
    {[-1], 0}: logic error
    {[-2], 0}: interface call error
    {[-3], 1}: file format error, cannot open
    {[-4], 0}: third-party reader timed out for this file type
    {[-5], 0}: the whole conversion timed out
    {[-6], 0}: Aliyun UDF queue timed out
    {[-7], 1}: file needs a password, cannot open
    :return: {"result_html": [str], "result_text": [str], "is_success": int}
    """
    # Cap memory usage
    # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
    logging.info("into convert")
    start_time = time.time()
    try:
        # Expose the models as globals
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        if get_platform() == "Windows":
            # Bypass the timeout decorator and call the wrapped function directly
            origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            text = origin_unique_temp_file_process(stream, _type)
        else:
            # On Linux the decorator bounds the whole conversion time
            try:
                text = unique_temp_file_process(stream, _type)
            except TimeoutError:
                logging.info("convert time out! 1200 sec")
                text = [-5]
        if text == [-1]:
            print({"failed result": [-1], "is_success": 0}, time.time() - start_time)
            return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
        if text == [-2]:
            print({"failed result": [-2], "is_success": 0}, time.time() - start_time)
            return {"result_html": ["-2"], "result_text": ["-2"], "is_success": 0}
        if text == [-3]:
            print({"failed result": [-3], "is_success": 1}, time.time() - start_time)
            return {"result_html": ["-3"], "result_text": ["-3"], "is_success": 1}
        if text == [-4]:
            print({"failed result": [-4], "is_success": 0}, time.time() - start_time)
            return {"result_html": ["-4"], "result_text": ["-4"], "is_success": 0}
        if text == [-5]:
            print({"failed result": [-5], "is_success": 0}, time.time() - start_time)
            return {"result_html": ["-5"], "result_text": ["-5"], "is_success": 0}
        if text == [-7]:
            print({"failed result": [-7], "is_success": 1}, time.time() - start_time)
            return {"result_html": ["-7"], "result_text": ["-7"], "is_success": 1}
        # text = add_html_format(text)
        # Save the result to result.html
        if get_platform() == "Windows":
            text_str = ""
            for t in text:
                text_str += t
            to_html("../result.html", text_str)
        # Extract the plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate over-long output
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
        else:
            print({"finished result": [str(only_text)[:20], len(str(text))],
                   "is_success": 1}, time.time() - start_time)
        return {"result_html": text, "result_text": only_text, "is_success": 1}
    except Exception as e:
        print({"failed result": [-1], "is_success": 0}, time.time() - start_time)
        print("convert", traceback.print_exc())
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
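

# Illustrative sketch: how a caller can unpack convert()'s result dict,
# following the return-code table in its docstring.
def _demo_handle_result(result):
    if result.get("is_success") == 1:
        return "".join(result.get("result_html", []))
    # On failure result_text carries the stringified error code, e.g. "-5"
    raise RuntimeError("convert failed with code " + result.get("result_text", ["-1"])[0])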


global_type = ""
local_url = "http://127.0.0.1"
if get_platform() == "Windows":
    _path = os.path.abspath(os.path.dirname(__file__))
else:
    _path = "/home/admin"
if not os.path.exists(_path):
    _path = os.path.dirname(os.path.abspath(__file__))


if __name__ == '__main__':
    print(os.path.abspath(__file__) + "/../../")
    # if len(sys.argv) == 2:
    #     port = int(sys.argv[1])
    # else:
    #     port = 15015
    # app.run(host='0.0.0.0', port=port, threaded=True, debug=False)
    # log("format_conversion running")
    # convert("", "ocr_model", "otr_model")
    # _str = "啊"
    # str1 = ""
    # str2 = ""
    # for i in range(900000):
    #     str1 += _str
    # list1 = [str1]
    # for i in range(700000):
    #     str2 += _str
    # list2 = [str2]
    # cut_str(list1, list2)
    # file_path = "C:/Users/Administrator/Desktop/error1.png"
    # file_path = "D:/Project/table-detect-master/train_data/label_1.jpg"
    # file_path = "D:/Project/table-detect-master/test_files/1.png"
    # file_path = "D:/Project/table-detect-master/test_files/table2.jpg"
    file_path = "C:/Users/Administrator/Desktop/error9.pdf"
    # file_path = "C:/Users/Administrator/Desktop/Test_Interface/test1.pdf"
    # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
    # file_path = "table2.jpg"
    with open(file_path, "rb") as f:
        file_bytes = f.read()
    file_base64 = base64.b64encode(file_bytes)
    data = {"file": file_base64, "type": file_path.split(".")[-1], "filemd5": 100}
    ocr_model = ocr_interface.OcrModels().get_model()
    otr_model = otr_interface.OtrModels().get_model()
    result = convert(data, ocr_model, otr_model)
    print("*" * 40)
    result = convert(data, ocr_model, otr_model)
    # print(result)