# convert.py
  1. #-*- coding: utf-8 -*-
  2. import copy
  3. import difflib
  4. import sys
  5. import os
  6. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  7. from format_convert.convert_doc import doc2text
  8. from format_convert.convert_docx import docx2text
  9. from format_convert.convert_image import picture2text
  10. from format_convert.convert_pdf import pdf2text, PDFConvert
  11. from format_convert.convert_rar import rar2text
  12. from format_convert.convert_swf import swf2text
  13. from format_convert.convert_txt import txt2text
  14. from format_convert.convert_xls import xls2text
  15. from format_convert.convert_xlsx import xlsx2text
  16. from format_convert.convert_zip import zip2text
  17. import codecs
  18. import gc
  19. import hashlib
  20. import io
  21. import json
  22. import multiprocessing
  23. import sys
  24. import subprocess
  25. import PyPDF2
  26. import lxml
  27. import pdfminer
  28. from PIL import Image
  29. from format_convert import get_memory_info
  30. from ocr import ocr_interface
  31. from ocr.ocr_interface import ocr, OcrModels
  32. from otr import otr_interface
  33. from otr.otr_interface import otr, OtrModels
  34. import re
  35. import shutil
  36. import signal
  37. import sys
  38. import base64
  39. import time
  40. import traceback
  41. import uuid
  42. from os.path import basename
  43. import cv2
  44. import fitz
  45. import pandas
  46. import docx
  47. import zipfile
  48. import mimetypes
  49. import filetype
  50. # import pdfplumber
  51. import psutil
  52. import requests
  53. import rarfile
  54. from PyPDF2 import PdfFileReader, PdfFileWriter
  55. import xml.dom.minidom
  56. import subprocess
  57. import logging
  58. from pdfminer.pdfparser import PDFParser
  59. from pdfminer.pdfdocument import PDFDocument
  60. from pdfminer.pdfpage import PDFPage
  61. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  62. from pdfminer.converter import PDFPageAggregator
  63. from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar
  64. import logging
  65. import chardet
  66. from bs4 import BeautifulSoup
  67. from format_convert.libreoffice_interface import office_convert
  68. from format_convert.swf.export import SVGExporter
  69. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  70. from format_convert.table_correct import *
  71. from format_convert.swf.movie import SWF
  72. import logging
  73. # import timeout_decorator
  74. from format_convert import timeout_decorator
  75. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  76. # txt doc docx xls xlsx pdf zip rar swf jpg jpeg png
  77. # def judge_error_code(_list, code=[-1, -2, -3, -4, -5, -7]):
  78. # for c in code:
  79. # if _list == [c]:
  80. # return True
  81. # return False
  82. #
  83. #
  84. # def set_timeout(signum, frame):
  85. # print("=======================set_timeout")
  86. # print("=======================set_timeout")
  87. # print("=======================set_timeout")
  88. # print("=======================set_timeout")
  89. # print("=======================set_timeout")
  90. # print("=======================set_timeout")
  91. # print("=======================set_timeout")
  92. # print("=======================set_timeout")
  93. # print("=======================set_timeout")
  94. # print("=======================set_timeout")
  95. # print("=======================set_timeout")
  96. # print("=======================set_timeout")
  97. # print("=======================set_timeout")
  98. # print("=======================set_timeout")
  99. # print("=======================set_timeout")
  100. # print("=======================set_timeout")
  101. #
  102. # raise TimeoutError
  103. #
  104. #
  105. # def log_traceback(func_name):
  106. # logging.info(func_name)
  107. # etype, value, tb = sys.exc_info()
  108. # for line in traceback.TracebackException(
  109. # type(value), value, tb, limit=None).format(chain=True):
  110. # logging.info(line)
  111. #
  112. #
  113. # def judge_format(path):
  114. # guess1 = mimetypes.guess_type(path)
  115. # _type = None
  116. # if guess1[0]:
  117. # _type = guess1[0]
  118. # else:
  119. # guess2 = filetype.guess(path)
  120. # if guess2:
  121. # _type = guess2.mime
  122. #
  123. # if _type == "application/pdf":
  124. # return "pdf"
  125. # if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  126. # return "docx"
  127. # if _type == "application/x-zip-compressed" or _type == "application/zip":
  128. # return "zip"
  129. # if _type == "application/x-rar-compressed" or _type == "application/rar":
  130. # return "rar"
  131. # if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  132. # return "xlsx"
  133. # if _type == "application/msword":
  134. # return "doc"
  135. # if _type == "image/png":
  136. # return "png"
  137. # if _type == "image/jpeg":
  138. # return "jpg"
  139. #
  140. # # 猜不到,返回None
  141. # return None
  142. #
  143. #
  144. # @get_memory_info.memory_decorator
  145. # def txt2text(path):
  146. # logging.info("into txt2text")
  147. # try:
  148. # # 判断字符编码
  149. # with open(path, "rb") as ff:
  150. # data = ff.read()
  151. # encode = chardet.detect(data).get("encoding")
  152. # print("txt2text judge code is", encode)
  153. #
  154. # try:
  155. # if encode is None:
  156. # logging.info("txt2text cannot judge file code!")
  157. # return [-3]
  158. # with open(path, "r", encoding=encode) as ff:
  159. # txt_text = ff.read()
  160. # return [txt_text]
  161. # except:
  162. # logging.info("txt2text cannot open file with code " + encode)
  163. # return [-3]
  164. # except Exception as e:
  165. # print("txt2text", traceback.print_exc())
  166. # logging.info("txt2text error!")
  167. # return [-1]
  168. #
  169. #
  170. # @get_memory_info.memory_decorator
  171. # def doc2text(path, unique_type_dir):
  172. # logging.info("into doc2text")
  173. # try:
  174. # # 调用office格式转换
  175. # file_path = from_office_interface(path, unique_type_dir, 'docx')
  176. # # if file_path == [-3]:
  177. # # return [-3]
  178. # if judge_error_code(file_path):
  179. # return file_path
  180. #
  181. # text = docx2text(file_path, unique_type_dir)
  182. # return text
  183. # except Exception as e:
  184. # logging.info("doc2text error!")
  185. # print("doc2text", traceback.print_exc())
  186. # # log_traceback("doc2text")
  187. # return [-1]
  188. #
  189. #
  190. # @get_memory_info.memory_decorator
  191. # def read_xml_order(path, save_path):
  192. # logging.info("into read_xml_order")
  193. # try:
  194. # try:
  195. # f = zipfile.ZipFile(path)
  196. # for file in f.namelist():
  197. # if "word/document.xml" == str(file):
  198. # f.extract(file, save_path)
  199. # f.close()
  200. # except Exception as e:
  201. # # print("docx format error!", e)
  202. # logging.info("docx format error!")
  203. # return [-3]
  204. #
  205. # # DOMTree = xml.dom.minidom.parse(save_path + "word/document.xml")
  206. # # collection = DOMTree.documentElement
  207. #
  208. # try:
  209. # collection = xml_analyze(save_path + "word/document.xml")
  210. # except TimeoutError:
  211. # logging.info("read_xml_order timeout")
  212. # return [-4]
  213. #
  214. # body = collection.getElementsByTagName("w:body")[0]
  215. # order_list = []
  216. # for line in body.childNodes:
  217. # # print(str(line))
  218. # if "w:p" in str(line):
  219. # text = line.getElementsByTagName("w:t")
  220. # picture = line.getElementsByTagName("wp:docPr")
  221. # if text:
  222. # order_list.append("w:t")
  223. # if picture:
  224. # order_list.append("wp:docPr")
  225. #
  226. # for line1 in line.childNodes:
  227. # if "w:r" in str(line1):
  228. # # print("read_xml_order", "w:r")
  229. # picture1 = line1.getElementsByTagName("w:pict")
  230. # if picture1:
  231. # order_list.append("wp:docPr")
  232. #
  233. # if "w:tbl" in str(line):
  234. # order_list.append("w:tbl")
  235. # read_xml_table(path, save_path)
  236. # return order_list
  237. # except Exception as e:
  238. # logging.info("read_xml_order error!")
  239. # print("read_xml_order", traceback.print_exc())
  240. # # log_traceback("read_xml_order")
  241. # return [-1]
  242. #
  243. #
  244. # @get_memory_info.memory_decorator
  245. # def read_xml_table(path, save_path):
  246. # logging.info("into read_xml_table")
  247. # try:
  248. # # print("into read_xml_table")
  249. # try:
  250. # f = zipfile.ZipFile(path)
  251. # for file in f.namelist():
  252. # if "word/document.xml" == str(file):
  253. # f.extract(file, save_path)
  254. # f.close()
  255. # except Exception as e:
  256. # # print("docx format error!", e)
  257. # logging.info("docx format error!")
  258. # return [-3]
  259. #
  260. # # DOMTree = xml.dom.minidom.parse(save_path + "word/document.xml")
  261. # # collection = DOMTree.documentElement
  262. #
  263. # try:
  264. # collection = xml_analyze(save_path + "word/document.xml")
  265. # except TimeoutError:
  266. # logging.info("read_xml_table timeout")
  267. # return [-4]
  268. #
  269. # body = collection.getElementsByTagName("w:body")[0]
  270. # table_text_list = []
  271. # # print("body.childNodes", body.childNodes)
  272. # for line in body.childNodes:
  273. # if "w:tbl" in str(line):
  274. # # print("str(line)", str(line))
  275. # table_text = '<table border="1">' + "\n"
  276. # tr_list = line.getElementsByTagName("w:tr")
  277. # # print("line.childNodes", line.childNodes)
  278. # tr_index = 0
  279. # tr_text_list = []
  280. # tr_text_list_colspan = []
  281. # for tr in tr_list:
  282. # table_text = table_text + "<tr rowspan=1>" + "\n"
  283. # tc_list = tr.getElementsByTagName("w:tc")
  284. # tc_index = 0
  285. # tc_text_list = []
  286. # for tc in tc_list:
  287. # tc_text = ""
  288. #
  289. # # 获取一格占多少列
  290. # col_span = tc.getElementsByTagName("w:gridSpan")
  291. # if col_span:
  292. # col_span = int(col_span[0].getAttribute("w:val"))
  293. # else:
  294. # col_span = 1
  295. #
  296. # # 获取是否是合并单元格的下一个空单元格
  297. # is_merge = tc.getElementsByTagName("w:vMerge")
  298. # if is_merge:
  299. # is_merge = is_merge[0].getAttribute("w:val")
  300. # if is_merge == "continue":
  301. # col_span_index = 0
  302. # real_tc_index = 0
  303. #
  304. # # if get_platform() == "Windows":
  305. # # print("read_xml_table tr_text_list", tr_text_list)
  306. # # print("read_xml_table tr_index", tr_index)
  307. #
  308. # if 0 <= tr_index - 1 < len(tr_text_list):
  309. # for tc_colspan in tr_text_list[tr_index - 1]:
  310. # if col_span_index < tc_index:
  311. # col_span_index += tc_colspan[1]
  312. # real_tc_index += 1
  313. #
  314. # # print("tr_index-1, real_tc_index", tr_index-1, real_tc_index)
  315. # # print(tr_text_list[tr_index-1])
  316. # if real_tc_index < len(tr_text_list[tr_index - 1]):
  317. # tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
  318. #
  319. # table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
  320. # p_list = tc.getElementsByTagName("w:p")
  321. #
  322. # for p in p_list:
  323. # t = p.getElementsByTagName("w:t")
  324. # if t:
  325. # for tt in t:
  326. # # print("tt", tt.childNodes)
  327. # if len(tt.childNodes) > 0:
  328. # tc_text += tt.childNodes[0].nodeValue
  329. # tc_text += "\n"
  330. #
  331. # table_text = table_text + tc_text + "</td>" + "\n"
  332. # tc_index += 1
  333. # tc_text_list.append([tc_text, col_span])
  334. # table_text += "</tr>" + "\n"
  335. # tr_index += 1
  336. # tr_text_list.append(tc_text_list)
  337. # table_text += "</table>" + "\n"
  338. # table_text_list.append(table_text)
  339. # return table_text_list
  340. #
  341. # except Exception as e:
  342. # logging.info("read_xml_table error")
  343. # print("read_xml_table", traceback.print_exc())
  344. # # log_traceback("read_xml_table")
  345. # return [-1]
  346. #
  347. #
  348. # @get_memory_info.memory_decorator
  349. # @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
  350. # def xml_analyze(path):
  351. # # 解析xml
  352. # DOMTree = xml.dom.minidom.parse(path)
  353. # collection = DOMTree.documentElement
  354. # return collection
  355. #
  356. #
  357. # def read_docx_table(document):
  358. # table_text_list = []
  359. # for table in document.tables:
  360. # table_text = "<table>\n"
  361. # print("==================")
  362. # for row in table.rows:
  363. # table_text += "<tr>\n"
  364. # for cell in row.cells:
  365. # table_text += "<td>" + cell.text + "</td>\n"
  366. # table_text += "</tr>\n"
  367. # table_text += "</table>\n"
  368. # print(table_text)
  369. # table_text_list.append(table_text)
  370. # return table_text_list
  371. #
  372. #
  373. # @get_memory_info.memory_decorator
  374. # def docx2text(path, unique_type_dir):
  375. # logging.info("into docx2text")
  376. # try:
  377. # try:
  378. # doc = docx.Document(path)
  379. # except Exception as e:
  380. # print("docx format error!", e)
  381. # print(traceback.print_exc())
  382. # logging.info("docx format error!")
  383. # return [-3]
  384. #
  385. # # 遍历段落
  386. # # print("docx2text extract paragraph")
  387. # paragraph_text_list = []
  388. # for paragraph in doc.paragraphs:
  389. # if paragraph.text != "":
  390. # paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
  391. # # print("paragraph_text", paragraph.text)
  392. #
  393. # # 遍历表
  394. # try:
  395. # table_text_list = read_xml_table(path, unique_type_dir)
  396. # except TimeoutError:
  397. # return [-4]
  398. #
  399. # if judge_error_code(table_text_list):
  400. # return table_text_list
  401. #
  402. # # 顺序遍历图片
  403. # # print("docx2text extract image")
  404. # image_text_list = []
  405. # temp_image_path = unique_type_dir + "temp_image.png"
  406. # pattern = re.compile('rId\d+')
  407. # for graph in doc.paragraphs:
  408. # for run in graph.runs:
  409. # if run.text == '':
  410. # try:
  411. # if not pattern.search(run.element.xml):
  412. # continue
  413. # content_id = pattern.search(run.element.xml).group(0)
  414. # content_type = doc.part.related_parts[content_id].content_type
  415. # except Exception as e:
  416. # print("docx no image!", e)
  417. # continue
  418. # if not content_type.startswith('image'):
  419. # continue
  420. #
  421. # # 写入临时文件
  422. # img_data = doc.part.related_parts[content_id].blob
  423. # with open(temp_image_path, 'wb') as f:
  424. # f.write(img_data)
  425. #
  426. # # if get_platform() == "Windows":
  427. # # print("img_data", img_data)
  428. #
  429. # if img_data is None:
  430. # continue
  431. #
  432. # # 识别图片文字
  433. # image_text = picture2text(temp_image_path)
  434. # if image_text == [-2]:
  435. # return [-2]
  436. # if image_text == [-1]:
  437. # return [-1]
  438. # if image_text == [-3]:
  439. # continue
  440. #
  441. # image_text = image_text[0]
  442. # image_text_list.append(add_div(image_text))
  443. #
  444. # # 解析document.xml,获取文字顺序
  445. # # print("docx2text extract order")
  446. # order_list = read_xml_order(path, unique_type_dir)
  447. # if order_list == [-2]:
  448. # return [-2]
  449. # if order_list == [-1]:
  450. # return [-1]
  451. #
  452. # text = ""
  453. # print("len(order_list)", len(order_list))
  454. # print("len(paragraph_text_list)", len(paragraph_text_list))
  455. # print("len(image_text_list)", len(image_text_list))
  456. # print("len(table_text_list)", len(table_text_list))
  457. #
  458. # # log("docx2text output in order")
  459. # for tag in order_list:
  460. # if tag == "w:t":
  461. # if len(paragraph_text_list) > 0:
  462. # text += paragraph_text_list.pop(0)
  463. # if tag == "wp:docPr":
  464. # if len(image_text_list) > 0:
  465. # text += image_text_list.pop(0)
  466. # if tag == "w:tbl":
  467. # if len(table_text_list) > 0:
  468. # text += table_text_list.pop(0)
  469. # return [text]
  470. # except Exception as e:
  471. # # print("docx2text", e, global_type)
  472. # logging.info("docx2text error!")
  473. # print("docx2text", traceback.print_exc())
  474. # # log_traceback("docx2text")
  475. # return [-1]
  476. #
  477. #
  478. # def add_div(text):
  479. # if text == "" or text is None:
  480. # return text
  481. #
  482. # if get_platform() == "Windows":
  483. # print("add_div", text)
  484. # if re.findall("<div>", text):
  485. # return text
  486. #
  487. # text = "<div>" + text + "\n"
  488. # text = re.sub("\n", "</div>\n<div>", text)
  489. # # text += "</div>"
  490. # if text[-5:] == "<div>":
  491. # print("add_div has cut", text[-30:])
  492. # text = text[:-5]
  493. # return text
  494. #
  495. #
  496. # @get_memory_info.memory_decorator
  497. # def pdf2Image(path, save_dir):
  498. # logging.info("into pdf2Image")
  499. # try:
  500. # try:
  501. # doc = fitz.open(path)
  502. # except Exception as e:
  503. # logging.info("pdf format error!")
  504. # # print("pdf format error!", e)
  505. # return [-3]
  506. #
  507. # # output_image_list = []
  508. # output_image_dict = {}
  509. # page_count = doc.page_count
  510. # for page_no in range(page_count):
  511. # # 限制pdf页数,只取前10页后10页
  512. # if page_count > 20:
  513. # if 10 <= page_no < page_count-10:
  514. # # logging.info("pdf2Image: pdf pages count " + str(doc.page_count)
  515. # # + ", only get 70 pages")
  516. # continue
  517. #
  518. # try:
  519. # page = doc.loadPage(page_no)
  520. # output = save_dir + "_page" + str(page_no) + ".png"
  521. # rotate = int(0)
  522. # # 每个尺寸的缩放系数为1.3,这将为我们生成分辨率提高2.6的图像。
  523. # # 此处若是不做设置,默认图片大小为:792X612, dpi=96
  524. # # (1.33333333 --> 1056x816) (2 --> 1584x1224)
  525. # # (1.183, 2.28 --> 1920x1080)
  526. # zoom_x = 3.
  527. # zoom_y = 3.
  528. # # mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  529. # mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
  530. # pix = page.getPixmap(matrix=mat, alpha=False)
  531. # pix.writePNG(output)
  532. # pdf_image = cv2.imread(output)
  533. # print("pdf_image", page_no, pdf_image.shape)
  534. # # output_image_list.append([page_no, output])
  535. # output_image_dict[int(page_no)] = output
  536. # except ValueError as e:
  537. # traceback.print_exc()
  538. # if str(e) == "page not in document":
  539. # logging.info("pdf2Image page not in document! continue..." + str(page_no))
  540. # continue
  541. # elif "encrypted" in str(e):
  542. # logging.info("pdf2Image document need password " + str(page_no))
  543. # return [-7]
  544. # except RuntimeError as e:
  545. # if "cannot find page" in str(e):
  546. # logging.info("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
  547. # continue
  548. # else:
  549. # traceback.print_exc()
  550. # return [-3]
  551. # return [output_image_dict]
  552. #
  553. # except Exception as e:
  554. # logging.info("pdf2Image error!")
  555. # print("pdf2Image", traceback.print_exc())
  556. # return [-1]
  557. #
  558. #
  559. # ocr_result_flag = 0
  560. # def image_preprocess(image_np, image_path, use_ocr=True):
  561. # logging.info("into image_preprocess")
  562. # try:
  563. # # 长 宽
  564. # # resize_size = (1024, 768)
  565. # # 限制图片大小
  566. # # resize_image(image_path, resize_size)
  567. #
  568. # # 图片倾斜校正,写入原来的图片路径
  569. # g_r_i = get_rotated_image(image_np, image_path)
  570. # if g_r_i == [-1]:
  571. # return [-1], [], [], 0
  572. #
  573. # # otr需要图片resize, 写入另一个路径
  574. # image_np = cv2.imread(image_path)
  575. # best_h, best_w = get_best_predict_size(image_np)
  576. # image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
  577. # image_resize_path = image_path[:-4] + "_resize" + image_path[-4:]
  578. # cv2.imwrite(image_resize_path, image_resize)
  579. #
  580. # # 调用otr模型接口
  581. # with open(image_resize_path, "rb") as f:
  582. # image_bytes = f.read()
  583. # points, split_lines, bboxes, outline_points = from_otr_interface(image_bytes)
  584. # if judge_error_code(points):
  585. # return points, [], [], 0
  586. #
  587. # # 将resize后得到的bbox根据比例还原
  588. # ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
  589. # for i in range(len(bboxes)):
  590. # bbox = bboxes[i]
  591. # bboxes[i] = [(int(bbox[0][0]*ratio[1]), int(bbox[0][1]*ratio[0])),
  592. # (int(bbox[1][0]*ratio[1]), int(bbox[1][1]*ratio[0]))]
  593. # for i in range(len(split_lines)):
  594. # line = split_lines[i]
  595. # split_lines[i] = [(int(line[0][0]*ratio[1]), int(line[0][1]*ratio[0])),
  596. # (int(line[1][0]*ratio[1]), int(line[1][1]*ratio[0]))]
  597. # for i in range(len(points)):
  598. # point = points[i]
  599. # points[i] = (int(point[0]*ratio[1]), int(point[1]*ratio[0]))
  600. #
  601. # for i in range(len(outline_points)):
  602. # point = outline_points[i]
  603. # outline_points[i] = [(int(point[0][0]*ratio[1]), int(point[0][1]*ratio[0])),
  604. # (int(point[1][0]*ratio[1]), int(point[1][1]*ratio[0]))]
  605. #
  606. # # 查看是否能输出正确框
  607. # for box in bboxes:
  608. # cv2.rectangle(image_np, box[0], box[1], (0, 255, 0), 2)
  609. # # cv2.namedWindow('bbox', 0)
  610. # # cv2.imshow("bbox", image_np)
  611. # # cv2.waitKey(0)
  612. #
  613. # # 调用ocr模型接口
  614. # with open(image_path, "rb") as f:
  615. # image_bytes = f.read()
  616. # # 有表格
  617. # if len(bboxes) >= 2:
  618. # text_list, bbox_list = from_ocr_interface(image_bytes, True)
  619. # if judge_error_code(text_list):
  620. # return text_list, [], [], 0
  621. #
  622. # # for i in range(len(text_list)):
  623. # # print(text_list[i], bbox_list[i])
  624. # # 查看是否能输出正确框
  625. #
  626. # # for box in bbox_list:
  627. # # cv2.rectangle(image_np, (int(box[0][0]), int(box[0][1])),
  628. # # (int(box[2][0]), int(box[2][1])), (255, 0, 0), 1)
  629. # # cv2.namedWindow('bbox', 0)
  630. # # cv2.imshow("bbox", image_np)
  631. # # cv2.waitKey(0)
  632. #
  633. # text, column_list = get_formatted_table(text_list, bbox_list, bboxes, split_lines)
  634. # if judge_error_code(text):
  635. # return text, [], [], 0
  636. # is_table = 1
  637. # return text, column_list, outline_points, is_table
  638. #
  639. # # 无表格
  640. # else:
  641. # if use_ocr:
  642. # text = from_ocr_interface(image_bytes)
  643. # if judge_error_code(text):
  644. # return text, [], [], 0
  645. #
  646. # is_table = 0
  647. # return text, [], [], is_table
  648. # else:
  649. # is_table = 0
  650. # return None, [], [], is_table
  651. #
  652. # except Exception as e:
  653. # logging.info("image_preprocess error")
  654. # print("image_preprocess", traceback.print_exc())
  655. # return [-1], [], [], 0
  656. #
  657. #
  658. # def get_best_predict_size2(image_np):
  659. # sizes = [1280, 1152, 1024, 896, 768, 640, 512, 384, 256, 128]
  660. #
  661. # min_len = 10000
  662. # best_height = sizes[0]
  663. # for height in sizes:
  664. # if abs(image_np.shape[0] - height) < min_len:
  665. # min_len = abs(image_np.shape[0] - height)
  666. # best_height = height
  667. #
  668. # min_len = 10000
  669. # best_width = sizes[0]
  670. # for width in sizes:
  671. # if abs(image_np.shape[1] - width) < min_len:
  672. # min_len = abs(image_np.shape[1] - width)
  673. # best_width = width
  674. #
  675. # return best_height, best_width
  676. #
  677. #
  678. # def get_best_predict_size(image_np, times=64):
  679. # sizes = []
  680. # for i in range(1, 100):
  681. # if i*times <= 3000:
  682. # sizes.append(i*times)
  683. # sizes.sort(key=lambda x: x, reverse=True)
  684. #
  685. # min_len = 10000
  686. # best_height = sizes[0]
  687. # for height in sizes:
  688. # if abs(image_np.shape[0] - height) < min_len:
  689. # min_len = abs(image_np.shape[0] - height)
  690. # best_height = height
  691. #
  692. # min_len = 10000
  693. # best_width = sizes[0]
  694. # for width in sizes:
  695. # if abs(image_np.shape[1] - width) < min_len:
  696. # min_len = abs(image_np.shape[1] - width)
  697. # best_width = width
  698. #
  699. # return best_height, best_width
  700. #
  701. #
  702. # @get_memory_info.memory_decorator
  703. # def pdf2text(path, unique_type_dir):
  704. # logging.info("into pdf2text")
  705. # try:
  706. # # pymupdf pdf to image
  707. # save_dir = path.split(".")[-2] + "_" + path.split(".")[-1]
  708. # output_image_dict = pdf2Image(path, save_dir)
  709. # if judge_error_code(output_image_dict):
  710. # return output_image_dict
  711. # output_image_dict = output_image_dict[0]
  712. # output_image_no_list = list(output_image_dict.keys())
  713. # output_image_no_list.sort(key=lambda x: x)
  714. #
  715. # # 获取每页pdf提取的文字、表格的列数、轮廓点、是否含表格、页码
  716. # # page_info_list = []
  717. # page_info_dict = {}
  718. # has_table_dict = {}
  719. # no_table_dict = {}
  720. # for page_no in output_image_no_list:
  721. # img_path = output_image_dict.get(page_no)
  722. # print("pdf page", page_no, "in total", output_image_no_list[-1])
  723. # # 读不出来的跳过
  724. # try:
  725. # img = cv2.imread(img_path)
  726. # img_size = img.shape
  727. # except:
  728. # logging.info("pdf2text read image in page fail! continue...")
  729. # continue
  730. #
  731. # # 每张图片处理
  732. # text, column_list, outline_points, is_table = image_preprocess(img, img_path,
  733. # use_ocr=False)
  734. # if judge_error_code(text):
  735. # return text
  736. #
  737. # # page_info_list.append([text, column_list, outline_points, is_table,
  738. # # page_no, img_size])
  739. # page_info = [text, column_list, outline_points, is_table, img_size]
  740. # page_info_dict[int(page_no)] = page_info
  741. # # 包含table的和不包含table的
  742. # if is_table:
  743. # has_table_dict[int(page_no)] = page_info
  744. # else:
  745. # no_table_dict[int(page_no)] = page_info
  746. #
  747. # has_table_no_list = list(has_table_dict.keys())
  748. # has_table_no_list.sort(key=lambda x: x)
  749. # page_no_list = list(page_info_dict.keys())
  750. # page_no_list.sort(key=lambda x: x)
  751. #
  752. # # 页码表格连接
  753. # table_connect_list, connect_text_list = page_table_connect(has_table_dict)
  754. # if judge_error_code(table_connect_list):
  755. # return table_connect_list
  756. #
  757. # # 连接的页码
  758. # table_connect_page_no_list = []
  759. # for area in connect_text_list:
  760. # table_connect_page_no_list.append(area[1])
  761. # print("pdf2text table_connect_list", table_connect_list)
  762. # print("connect_text_list", connect_text_list)
  763. #
  764. # # pdfminer 方式
  765. # try:
  766. # fp = open(path, 'rb')
  767. # # 用文件对象创建一个PDF文档分析器
  768. # parser = PDFParser(fp)
  769. # # 创建一个PDF文档
  770. # doc = PDFDocument(parser)
  771. # # 连接分析器,与文档对象
  772. # rsrcmgr = PDFResourceManager()
  773. # device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
  774. # interpreter = PDFPageInterpreter(rsrcmgr, device)
  775. #
  776. # # 判断是否能读pdf
  777. # for page in PDFPage.create_pages(doc):
  778. # break
  779. # except pdfminer.psparser.PSEOF as e:
  780. # # pdfminer 读不了空白页的对象,直接使用pymupdf转换出的图片进行ocr识别
  781. # logging.info("pdf2text " + str(e) + " use ocr read pdf!")
  782. # text_list = []
  783. # for page_no in page_no_list:
  784. # logging.info("pdf2text ocr page_no " + str(page_no))
  785. # page_info = page_info_dict.get(page_no)
  786. # # 表格
  787. # if page_info[3]:
  788. # # 判断表格是否跨页连接
  789. # area_no = 0
  790. # jump_page = 0
  791. # for area in table_connect_list:
  792. # if page_no in area:
  793. # # 只记录一次text
  794. # if page_no == area[0]:
  795. # image_text = connect_text_list[area_no][0]
  796. # text_list.append([image_text, page_no, 0])
  797. # jump_page = 1
  798. # area_no += 1
  799. #
  800. # # 是连接页的跳过后面步骤
  801. # if jump_page:
  802. # continue
  803. #
  804. # # 直接取text
  805. # image_text = page_info_dict.get(page_no)[0]
  806. # text_list.append([image_text, page_no, 0])
  807. # # 非表格
  808. # else:
  809. # with open(output_image_dict.get(page_no), "rb") as ff:
  810. # image_stream = ff.read()
  811. # image_text = from_ocr_interface(image_stream)
  812. # text_list.append([image_text, page_no, 0])
  813. #
  814. # text_list.sort(key=lambda z: z[1])
  815. # text = ""
  816. # for t in text_list:
  817. # text += t[0]
  818. # return [text]
  819. # except Exception as e:
  820. # logging.info("pdf format error!")
  821. # traceback.print_exc()
  822. # return [-3]
  823. #
  824. # text_list = []
  825. # page_no = 0
  826. # pages = PDFPage.create_pages(doc)
  827. # pages = list(pages)
  828. # page_count = len(pages)
  829. # for page in pages:
  830. # logging.info("pdf2text pymupdf page_no " + str(page_no))
  831. # # 限制pdf页数,只取前100页
  832. # # if page_no >= 70:
  833. # # logging.info("pdf2text: pdf pages only get 70 pages")
  834. # # break
  835. # if page_count > 20:
  836. # if 10 <= page_no < page_count-10:
  837. # page_no += 1
  838. # continue
  839. #
  840. # # 判断页码在含表格页码中,直接拿已生成的text
  841. # if page_no in has_table_no_list:
  842. # # 判断表格是否跨页连接
  843. # area_no = 0
  844. # jump_page = 0
  845. # for area in table_connect_list:
  846. # if page_no in area:
  847. # # 只记录一次text
  848. # if page_no == area[0]:
  849. # image_text = connect_text_list[area_no][0]
  850. # text_list.append([image_text, page_no, 0])
  851. # jump_page = 1
  852. # area_no += 1
  853. #
  854. # # 是连接页的跳过后面步骤
  855. # if jump_page:
  856. # page_no += 1
  857. # continue
  858. #
  859. # # 直接取text
  860. # image_text = has_table_dict.get(page_no)[0]
  861. # text_list.append([image_text, page_no, 0])
  862. # page_no += 1
  863. # continue
  864. #
  865. # # 不含表格的解析pdf
  866. # else:
  867. # if get_platform() == "Windows":
  868. # try:
  869. # interpreter.process_page(page)
  870. # layout = device.get_result()
  871. # except Exception:
  872. # logging.info("pdf2text pdfminer read pdf page error! continue...")
  873. # continue
  874. #
  875. # else:
  876. # # 设置超时时间
  877. # try:
  878. # # 解析pdf中的不含表格的页
  879. # if get_platform() == "Windows":
  880. # origin_pdf_analyze = pdf_analyze.__wrapped__
  881. # layout = origin_pdf_analyze(interpreter, page, device)
  882. # else:
  883. # layout = pdf_analyze(interpreter, page, device)
  884. # except TimeoutError as e:
  885. # logging.info("pdf2text pdfminer read pdf page time out!")
  886. # return [-4]
  887. # except Exception:
  888. # logging.info("pdf2text pdfminer read pdf page error! continue...")
  889. # continue
  890. #
  891. # # 判断该页有没有文字对象,没有则有可能是有水印
  892. # only_image = 1
  893. # image_count = 0
  894. # for x in layout:
  895. # if isinstance(x, LTTextBoxHorizontal):
  896. # only_image = 0
  897. # if isinstance(x, LTFigure):
  898. # image_count += 1
  899. #
  900. # # 如果该页图片数量过多,直接ocr整页识别
  901. # logging.info("pdf2text image_count " + str(image_count))
  902. # if image_count >= 3:
  903. # image_text = page_info_dict.get(page_no)[0]
  904. # if image_text is None:
  905. # with open(output_image_dict.get(page_no), "rb") as ff:
  906. # image_stream = ff.read()
  907. # image_text = from_ocr_interface(image_stream)
  908. # if judge_error_code(image_text):
  909. # return image_text
  910. # page_info_dict[page_no][0] = image_text
  911. #
  912. # text_list.append([image_text, page_no, 0])
  913. # page_no += 1
  914. # continue
  915. #
  916. # order_list = []
  917. # for x in layout:
  918. # # 该对象是否是ocr识别
  919. # ocr_flag = 0
  920. #
  921. # if get_platform() == "Windows":
  922. # # print("x", page_no, x)
  923. # print()
  924. #
  925. # if isinstance(x, LTTextBoxHorizontal):
  926. # image_text = x.get_text()
  927. #
  928. # # 无法识别编码,用ocr
  929. # if re.search('[(]cid:[0-9]+[)]', image_text):
  930. # print(re.search('[(]cid:[0-9]+[)]', image_text))
  931. # image_text = page_info_dict.get(page_no)[0]
  932. # if image_text is None:
  933. # with open(output_image_dict.get(page_no), "rb") as ff:
  934. # image_stream = ff.read()
  935. # image_text = from_ocr_interface(image_stream)
  936. # if judge_error_code(image_text):
  937. # return image_text
  938. # page_info_dict[page_no][0] = image_text
  939. # image_text = add_div(image_text)
  940. # # order_list.append([image_text, page_no, x.bbox[1]])
  941. # order_list = [[image_text, page_no, x.bbox[1]]]
  942. # break
  943. # else:
  944. # image_text = add_div(image_text)
  945. # order_list.append([image_text, page_no, x.bbox[1]])
  946. # continue
  947. #
  948. # if isinstance(x, LTFigure):
  949. # for image in x:
  950. # if isinstance(image, LTImage):
  951. # try:
  952. # print("pdf2text LTImage size", page_no, image.width, image.height)
  953. # image_stream = image.stream.get_data()
  954. #
  955. # # 小的图忽略
  956. # if image.width <= 300 and image.height <= 300:
  957. # continue
  958. #
  959. # # 有些水印导致pdf分割、读取报错
  960. # # if image.width <= 200 and image.height<=200:
  961. # # continue
  962. #
  963. # # img_test = Image.open(io.BytesIO(image_stream))
  964. # # img_test.save('temp/LTImage.jpg')
  965. #
  966. # # 查看提取的图片高宽,太大则抛错用pdf输出图进行ocr识别
  967. # img_test = Image.open(io.BytesIO(image_stream))
  968. # if img_test.size[1] > 2000 or img_test.size[0] > 1500:
  969. # print("pdf2text LTImage stream output size", img_test.size)
  970. # raise Exception
  971. # # 比较小的图则直接保存用ocr识别
  972. # else:
  973. # img_test.save('temp/LTImage.jpg')
  974. # with open('temp/LTImage.jpg', "rb") as ff:
  975. # image_stream = ff.read()
  976. # image_text = from_ocr_interface(image_stream)
  977. # if judge_error_code(image_text):
  978. # return image_text
  979. # # except pdfminer.pdftypes.PDFNotImplementedError:
  980. # # with open(output_image_list[page_no], "rb") as ff:
  981. # # image_stream = ff.read()
  982. # except Exception:
  983. # logging.info("pdf2text pdfminer read image in page " + str(page_no) +
  984. # " fail! use pymupdf read image...")
  985. # print(traceback.print_exc())
  986. # image_text = page_info_dict.get(page_no)[0]
  987. # if image_text is None:
  988. # with open(output_image_dict.get(page_no), "rb") as ff:
  989. # image_stream = ff.read()
  990. # image_text = from_ocr_interface(image_stream)
  991. # if judge_error_code(image_text):
  992. # return image_text
  993. # page_info_dict[page_no][0] = image_text
  994. # ocr_flag = 1
  995. #
  996. # # 判断只拿到了水印图: 无文字输出且只有图片对象
  997. # if image_text == "" and only_image:
  998. # # 拆出该页pdf
  999. # try:
  1000. # logging.info("pdf2text guess pdf has watermark")
  1001. # split_path = get_single_pdf(path, page_no)
  1002. # except:
  1003. # # 如果拆分抛异常,则大概率不是水印图,用ocr识别图片
  1004. # logging.info("pdf2text guess pdf has no watermark")
  1005. # image_text = page_info_dict.get(page_no)[0]
  1006. # if image_text is None:
  1007. # with open(output_image_dict.get(page_no), "rb") as ff:
  1008. # image_stream = ff.read()
  1009. # image_text = from_ocr_interface(image_stream)
  1010. # order_list.append([image_text, page_no, -1])
  1011. # page_info_dict[page_no][0] = image_text
  1012. # ocr_flag = 1
  1013. # continue
  1014. # if judge_error_code(split_path):
  1015. # return split_path
  1016. #
  1017. # # 调用office格式转换
  1018. # file_path = from_office_interface(split_path, unique_type_dir, 'html', 3)
  1019. # # if file_path == [-3]:
  1020. # # return [-3]
  1021. # if judge_error_code(file_path):
  1022. # return file_path
  1023. #
  1024. # # 获取html文本
  1025. # image_text = get_html_p(file_path)
  1026. # if judge_error_code(image_text):
  1027. # return image_text
  1028. #
  1029. # if get_platform() == "Windows":
  1030. # print("image_text", page_no, x.bbox[1], image_text)
  1031. # with open("temp" + str(x.bbox[0]) + ".jpg", "wb") as ff:
  1032. # ff.write(image_stream)
  1033. # image_text = add_div(image_text)
  1034. # if ocr_flag:
  1035. # order_list.append([image_text, page_no, -1])
  1036. # else:
  1037. # order_list.append([image_text, page_no, x.bbox[1]])
  1038. #
  1039. # order_list.sort(key=lambda z: z[2], reverse=True)
  1040. #
  1041. # # 有ocr参与识别
  1042. # if order_list[-1][2] == -1:
  1043. # ocr_order_list = [order_list[-1]]
  1044. # not_ocr_order_list = []
  1045. # not_ocr_text = ""
  1046. # # 去重,因读取失败而重复获取
  1047. # for order in order_list:
  1048. # if order[2] != -1:
  1049. # not_ocr_order_list.append(order)
  1050. # not_ocr_text += order[0]
  1051. # if string_similarity(ocr_order_list[0][0], not_ocr_text) >= 0.85:
  1052. # order_list = not_ocr_order_list
  1053. # else:
  1054. # order_list = ocr_order_list
  1055. #
  1056. # for order in order_list:
  1057. # text_list.append(order)
  1058. # page_no += 1
  1059. #
  1060. # text = ""
  1061. # for t in text_list:
  1062. # # text += add_div(t[0])
  1063. # if t[0] is not None:
  1064. # text += t[0]
  1065. # return [text]
  1066. # except UnicodeDecodeError as e:
  1067. # logging.info("pdf2text pdfminer create pages failed! " + str(e))
  1068. # return [-3]
  1069. # except Exception as e:
  1070. # logging.info("pdf2text error!")
  1071. # print("pdf2text", traceback.print_exc())
  1072. # return [-1]
  1073. #
  1074. #
  1075. # def string_similarity(str1, str2):
  1076. # # 去掉<div>和回车
  1077. # str1 = re.sub("<div>", "", str1)
  1078. # str1 = re.sub("</div>", "", str1)
  1079. # str1 = re.sub("\n", "", str1)
  1080. # str2 = re.sub("<div>", "", str2)
  1081. # str2 = re.sub("</div>", "", str2)
  1082. # str2 = re.sub("\n", "", str2)
  1083. # # print("********************************")
  1084. # # print("str1", str1)
  1085. # # print("********************************")
  1086. # # print("str2", str2)
  1087. # # print("********************************")
  1088. # score = difflib.SequenceMatcher(None, str1, str2).ratio()
  1089. # print("string_similarity", score)
  1090. # return score
  1091. #
  1092. #
  1093. # @get_memory_info.memory_decorator
  1094. # @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
  1095. # def pdf_analyze(interpreter, page, device):
  1096. # logging.info("into pdf_analyze")
  1097. # # 解析pdf中的不含表格的页
  1098. # pdf_time = time.time()
  1099. # print("pdf_analyze interpreter process...")
  1100. # interpreter.process_page(page)
  1101. # print("pdf_analyze device get_result...")
  1102. # layout = device.get_result()
  1103. # logging.info("pdf2text read time " + str(time.time()-pdf_time))
  1104. # return layout
  1105. #
  1106. #
  1107. # def get_html_p(html_path):
  1108. # logging.info("into get_html_p")
  1109. # try:
  1110. # with open(html_path, "r") as ff:
  1111. # html_str = ff.read()
  1112. #
  1113. # soup = BeautifulSoup(html_str, 'lxml')
  1114. # text = ""
  1115. # for p in soup.find_all("p"):
  1116. # p_text = p.text
  1117. # p_text = p_text.strip()
  1118. # if p.string != "":
  1119. # text += p_text
  1120. # text += "\n"
  1121. # return text
  1122. # except Exception as e:
  1123. # logging.info("get_html_p error!")
  1124. # print("get_html_p", traceback.print_exc())
  1125. # return [-1]
  1126. #
  1127. #
  1128. # def get_single_pdf(path, page_no):
  1129. # logging.info("into get_single_pdf")
  1130. # try:
  1131. # # print("path, ", path)
  1132. # pdf_origin = PdfFileReader(path, strict=False)
  1133. #
  1134. # pdf_new = PdfFileWriter()
  1135. # pdf_new.addPage(pdf_origin.getPage(page_no))
  1136. #
  1137. # path_new = path.split(".")[0] + "_split.pdf"
  1138. # with open(path_new, "wb") as ff:
  1139. # pdf_new.write(ff)
  1140. # return path_new
  1141. # except PyPDF2.utils.PdfReadError as e:
  1142. # raise e
  1143. # except Exception as e:
  1144. # logging.info("get_single_pdf error! page " + str(page_no))
  1145. # print("get_single_pdf", traceback.print_exc())
  1146. # raise e
  1147. #
  1148. #
  1149. # def page_table_connect2(has_table_list, page_info_list):
  1150. # logging.info("into page_table_connect")
  1151. # try:
  1152. # # 判断是否有页码的表格相连
  1153. # table_connect_list = []
  1154. # temp_list = []
  1155. # # 离图片顶部或底部距离,页面高度的1/7
  1156. # threshold = 7
  1157. #
  1158. # for i in range(1, len(has_table_list)):
  1159. # page_info = has_table_list[i]
  1160. # last_page_info = has_table_list[i - 1]
  1161. #
  1162. # # 页码需相连
  1163. # if page_info[4] - last_page_info[4] == 1:
  1164. #
  1165. # # 上一页最后一个区域的列数和下一页第一个区域列数都为0,且相等
  1166. # if not last_page_info[1][-1] and not page_info[1][0] and \
  1167. # last_page_info[1][-1] == page_info[1][0]:
  1168. #
  1169. # # 上一页的轮廓点要离底部一定距离内,下一页的轮廓点要离顶部一定距离内
  1170. # if last_page_info[5][0] - last_page_info[2][-1][1][1] \
  1171. # <= int(last_page_info[5][0]/threshold) \
  1172. # and page_info[2][0][0][1] - 0 \
  1173. # <= int(page_info[5][0]/threshold):
  1174. # temp_list.append(last_page_info[4])
  1175. # temp_list.append(page_info[4])
  1176. # continue
  1177. #
  1178. # # 条件不符合的,存储之前保存的连接页码
  1179. # if len(temp_list) > 1:
  1180. # temp_list = list(set(temp_list))
  1181. # temp_list.sort(key=lambda x: x)
  1182. # table_connect_list.append(temp_list)
  1183. # temp_list = []
  1184. # if len(temp_list) > 1:
  1185. # temp_list = list(set(temp_list))
  1186. # temp_list.sort(key=lambda x: x)
  1187. # table_connect_list.append(temp_list)
  1188. # temp_list = []
  1189. #
  1190. # # 连接两页内容
  1191. # connect_text_list = []
  1192. # for area in table_connect_list:
  1193. # first_page_no = area[0]
  1194. # for page in page_info_list:
  1195. # if page[4] == first_page_no:
  1196. # area_page_text = str(page[0])
  1197. # break
  1198. # for i in range(1, len(area)):
  1199. # current_page_no = area[i]
  1200. # for page in page_info_list:
  1201. # if page[4] == current_page_no:
  1202. # current_page_text = str(page[0])
  1203. # break
  1204. #
  1205. # # 连接两个table
  1206. # table_prefix = re.finditer('<table border="1">', current_page_text)
  1207. # index_list = []
  1208. # for t in table_prefix:
  1209. # index_list.append(t.span())
  1210. #
  1211. # delete_index = index_list[0]
  1212. # current_page_text = current_page_text[:delete_index[0]] \
  1213. # + current_page_text[delete_index[1]:]
  1214. #
  1215. # table_suffix = re.finditer('</table>', area_page_text)
  1216. # index_list = []
  1217. # for t in table_suffix:
  1218. # index_list.append(t.span())
  1219. #
  1220. # delete_index = index_list[-1]
  1221. # area_page_text = area_page_text[:delete_index[0]] \
  1222. # + area_page_text[delete_index[1]:]
  1223. # area_page_text = area_page_text + current_page_text
  1224. # connect_text_list.append([area_page_text, area])
  1225. #
  1226. # return table_connect_list, connect_text_list
  1227. # except Exception as e:
  1228. # # print("page_table_connect", e)
  1229. # logging.info("page_table_connect error!")
  1230. # print("page_table_connect", traceback.print_exc())
  1231. # return [-1], [-1]
  1232. #
  1233. #
  1234. # def page_table_connect(has_table_dict):
  1235. # logging.info("into page_table_connect")
  1236. # if not has_table_dict:
  1237. # return [], []
  1238. #
  1239. # try:
  1240. # # 判断是否有页码的表格相连
  1241. # table_connect_list = []
  1242. # temp_list = []
  1243. # # 离图片顶部或底部距离,页面高度的1/7
  1244. # threshold = 7
  1245. # page_no_list = list(has_table_dict.keys())
  1246. # page_no_list.sort(key=lambda x: x)
  1247. # for i in range(1, len(page_no_list)):
  1248. # page_info = has_table_dict.get(page_no_list[i])
  1249. # last_page_info = has_table_dict.get(page_no_list[i-1])
  1250. # # 页码需相连
  1251. # if page_no_list[i] - page_no_list[i-1] == 1:
  1252. # # 上一页最后一个区域的列数和下一页第一个区域列数都为0,且相等
  1253. # if not last_page_info[1][-1] and not page_info[1][0] and \
  1254. # last_page_info[1][-1] == page_info[1][0]:
  1255. #
  1256. # # 上一页的轮廓点要离底部一定距离内,下一页的轮廓点要离顶部一定距离内
  1257. # if last_page_info[4][0] - last_page_info[2][-1][1][1] \
  1258. # <= int(last_page_info[4][0]/threshold) \
  1259. # and page_info[2][0][0][1] - 0 \
  1260. # <= int(page_info[4][0]/threshold):
  1261. # temp_list.append(page_no_list[i-1])
  1262. # temp_list.append(page_no_list[i])
  1263. # continue
  1264. #
  1265. # # 条件不符合的,存储之前保存的连接页码
  1266. # if len(temp_list) > 1:
  1267. # temp_list = list(set(temp_list))
  1268. # temp_list.sort(key=lambda x: x)
  1269. # table_connect_list.append(temp_list)
  1270. # temp_list = []
  1271. # if len(temp_list) > 1:
  1272. # temp_list = list(set(temp_list))
  1273. # temp_list.sort(key=lambda x: x)
  1274. # table_connect_list.append(temp_list)
  1275. # temp_list = []
  1276. #
  1277. # # 连接两页内容
  1278. # connect_text_list = []
  1279. # for area in table_connect_list:
  1280. # first_page_no = area[0]
  1281. # area_page_text = str(has_table_dict.get(first_page_no)[0])
  1282. # for i in range(1, len(area)):
  1283. # current_page_no = area[i]
  1284. # current_page_text = str(has_table_dict.get(current_page_no)[0])
  1285. #
  1286. # # 连接两个table
  1287. # table_prefix = re.finditer('<table border="1">', current_page_text)
  1288. # index_list = []
  1289. # for t in table_prefix:
  1290. # index_list.append(t.span())
  1291. #
  1292. # delete_index = index_list[0]
  1293. # current_page_text = current_page_text[:delete_index[0]] \
  1294. # + current_page_text[delete_index[1]:]
  1295. #
  1296. # table_suffix = re.finditer('</table>', area_page_text)
  1297. # index_list = []
  1298. # for t in table_suffix:
  1299. # index_list.append(t.span())
  1300. #
  1301. # delete_index = index_list[-1]
  1302. # area_page_text = area_page_text[:delete_index[0]] \
  1303. # + area_page_text[delete_index[1]:]
  1304. # area_page_text = area_page_text + current_page_text
  1305. # connect_text_list.append([area_page_text, area])
  1306. #
  1307. # return table_connect_list, connect_text_list
  1308. # except Exception as e:
  1309. # # print("page_table_connect", e)
  1310. # logging.info("page_table_connect error!")
  1311. # print("page_table_connect", traceback.print_exc())
  1312. # return [-1], [-1]
  1313. #
  1314. #
  1315. # @get_memory_info.memory_decorator
  1316. # def zip2text(path, unique_type_dir):
  1317. # logging.info("into zip2text")
  1318. # try:
  1319. # zip_path = unique_type_dir
  1320. #
  1321. # try:
  1322. # zip_file = zipfile.ZipFile(path)
  1323. # zip_list = zip_file.namelist()
  1324. # # print("zip list namelist", zip_list)
  1325. #
  1326. # if get_platform() == "Windows":
  1327. # if os.path.exists(zip_list[0]):
  1328. # print("zip2text exists")
  1329. #
  1330. # # 循环解压文件到指定目录
  1331. # file_list = []
  1332. # for f in zip_list:
  1333. # file_list.append(zip_file.extract(f, path=zip_path))
  1334. # # zip_file.extractall(path=zip_path)
  1335. # zip_file.close()
  1336. #
  1337. # # 获取文件名
  1338. # # file_list = []
  1339. # # for root, dirs, files in os.walk(zip_path, topdown=False):
  1340. # # for name in dirs:
  1341. # # file_list.append(os.path.join(root, name) + os.sep)
  1342. # # for name in files:
  1343. # # file_list.append(os.path.join(root, name))
  1344. # #
  1345. # # # if get_platform() == "Windows":
  1346. # # # print("file_list", file_list)
  1347. # #
  1348. # # # 过滤掉doc缓存文件
  1349. # # temp_list = []
  1350. # # for f in file_list:
  1351. # # if re.search("~\$", f):
  1352. # # continue
  1353. # # else:
  1354. # # temp_list.append(f)
  1355. # # file_list = temp_list
  1356. #
  1357. # except Exception as e:
  1358. # logging.info("zip format error!")
  1359. # print("zip format error!", traceback.print_exc())
  1360. # return [-3]
  1361. #
  1362. # # 内部文件重命名
  1363. # # file_list = inner_file_rename(file_list)
  1364. # file_list = rename_inner_files(zip_path)
  1365. # if judge_error_code(file_list):
  1366. # return file_list
  1367. #
  1368. # if get_platform() == "Windows":
  1369. # print("============= zip file list")
  1370. # # print(file_list)
  1371. #
  1372. # text = []
  1373. # for file in file_list:
  1374. # if os.path.isdir(file):
  1375. # continue
  1376. #
  1377. # # 无文件后缀,猜格式
  1378. # if len(file.split(".")) <= 1:
  1379. # logging.info(str(file) + " has no type! Guess type...")
  1380. # _type = judge_format(file)
  1381. # if _type is None:
  1382. # logging.info(str(file) + "cannot guess type!")
  1383. # sub_text = [""]
  1384. # else:
  1385. # logging.info(str(file) + " guess type: " + _type)
  1386. # new_file = str(file) + "." + _type
  1387. # os.rename(file, new_file)
  1388. # file = new_file
  1389. # sub_text = getText(_type, file)
  1390. # # 有文件后缀,截取
  1391. # else:
  1392. # _type = file.split(".")[-1]
  1393. # sub_text = getText(_type, file)
  1394. #
  1395. # if judge_error_code(sub_text, code=[-3]):
  1396. # continue
  1397. # if judge_error_code(sub_text):
  1398. # return sub_text
  1399. #
  1400. # text = text + sub_text
  1401. # return text
  1402. # except Exception as e:
  1403. # logging.info("zip2text error!")
  1404. # print("zip2text", traceback.print_exc())
  1405. # return [-1]
  1406. #
  1407. #
  1408. # @get_memory_info.memory_decorator
  1409. # def rar2text(path, unique_type_dir):
  1410. # logging.info("into rar2text")
  1411. # try:
  1412. # rar_path = unique_type_dir
  1413. #
  1414. # try:
  1415. # # shell调用unrar解压
  1416. # _signal = os.system("unrar x " + path + " " + rar_path)
  1417. # print("rar2text _signal", _signal)
  1418. # # =0, 解压成功
  1419. # if _signal != 0:
  1420. # raise Exception
  1421. # except Exception as e:
  1422. # logging.info("rar format error!")
  1423. # print("rar format error!", e)
  1424. # return [-3]
  1425. #
  1426. # # 获取文件名
  1427. # # file_list = []
  1428. # # for root, dirs, files in os.walk(rar_path, topdown=False):
  1429. # # for name in dirs:
  1430. # # file_list.append(os.path.join(root, name) + os.sep)
  1431. # # for name in files:
  1432. # # file_list.append(os.path.join(root, name))
  1433. #
  1434. # if get_platform() == "Windows":
  1435. # print("============= rar file list")
  1436. #
  1437. # # 内部文件重命名
  1438. # # file_list = inner_file_rename(file_list)
  1439. # file_list = rename_inner_files(rar_path)
  1440. # if judge_error_code(file_list):
  1441. # return file_list
  1442. #
  1443. # text = []
  1444. # for file in file_list:
  1445. # if os.path.isdir(file):
  1446. # continue
  1447. #
  1448. # # 无文件后缀,猜格式
  1449. # if len(file.split(".")) <= 1:
  1450. # logging.info(str(file) + " has no type! Guess type...")
  1451. # _type = judge_format(file)
  1452. # if _type is None:
  1453. # logging.info(str(file) + "cannot guess type!")
  1454. # sub_text = [""]
  1455. # else:
  1456. # logging.info(str(file) + " guess type: " + _type)
  1457. # new_file = str(file) + "." + _type
  1458. # os.rename(file, new_file)
  1459. # file = new_file
  1460. # sub_text = getText(_type, file)
  1461. # # 有文件后缀,截取
  1462. # else:
  1463. # _type = file.split(".")[-1]
  1464. # sub_text = getText(_type, file)
  1465. #
  1466. # if judge_error_code(sub_text, code=[-3]):
  1467. # continue
  1468. # if judge_error_code(sub_text):
  1469. # return sub_text
  1470. #
  1471. # # print("sub text", sub_text, file, _type)
  1472. # text = text + sub_text
  1473. # return text
  1474. # except Exception as e:
  1475. # logging.info("rar2text error!")
  1476. # print("rar2text", traceback.print_exc())
  1477. # return [-1]
  1478. #
  1479. #
  1480. # def inner_file_rename(path_list):
  1481. # logging.info("into inner_file_rename")
  1482. # try:
  1483. # # 先过滤文件名中的点 '.'
  1484. # path_list.sort(key=lambda x: len(x), reverse=True)
  1485. # for i in range(len(path_list)):
  1486. # old_path = path_list[i]
  1487. # # 对于目录,判断最后一级是否需过滤,重命名
  1488. # if os.path.isdir(old_path):
  1489. # ps = old_path.split(os.sep)
  1490. # old_p = ps[-2]
  1491. # if '.' in old_p:
  1492. # new_p = re.sub("\\.", "", old_p)
  1493. # new_path = ""
  1494. # for p in ps[:-2]:
  1495. # new_path += p + os.sep
  1496. # new_path += new_p + os.sep
  1497. #
  1498. # # 重命名,更新
  1499. # # print("has .", path_list[i], new_path)
  1500. # os.rename(old_path, new_path)
  1501. # for j in range(len(path_list)):
  1502. # if old_path in path_list[j]:
  1503. # path_list[j] = re.sub(old_p, new_p, path_list[j]) + os.sep
  1504. #
  1505. # # 将path分割,按分割个数排名
  1506. # path_len_list = []
  1507. # for p in path_list:
  1508. # p_ss = p.split(os.sep)
  1509. # temp_p_ss = []
  1510. # for pp in p_ss:
  1511. # if pp == "":
  1512. # continue
  1513. # temp_p_ss.append(pp)
  1514. # p_ss = temp_p_ss
  1515. # path_len_list.append([p, p_ss, len(p_ss)])
  1516. #
  1517. # # 从路径分割少的开始改名,即从根目录开始改
  1518. # path_len_list.sort(key=lambda x: x[2])
  1519. #
  1520. # # for p in path_len_list:
  1521. # # print("---", p[1])
  1522. #
  1523. # # 判断不用变的目录在第几级
  1524. # no_change_level = 0
  1525. # loop = 0
  1526. # for p_s in path_len_list[0][1]:
  1527. # if p_s[-4:] == "_rar" or p_s[-4:] == "_zip":
  1528. # no_change_level += loop
  1529. # loop = 0
  1530. # loop += 1
  1531. # no_change_level += 1
  1532. #
  1533. # # 每个
  1534. # new_path_list = []
  1535. # for path_len in path_len_list:
  1536. # # 前n个是固定路径
  1537. # new_path = ""
  1538. # for i in range(no_change_level):
  1539. # new_path += path_len[1][i] + os.sep
  1540. # old_path = new_path
  1541. #
  1542. # if not get_platform() == "Windows":
  1543. # old_path = os.sep + old_path
  1544. # new_path = os.sep + new_path
  1545. # # print("path_len[1][3:]", path_len[1][3:])
  1546. #
  1547. # count = 0
  1548. # for p in path_len[1][no_change_level:]:
  1549. # # 新路径全部转换hash
  1550. # new_path += str(hash(p))
  1551. #
  1552. # # 最后一个不加os.sep,并且旧路径最后一个不转换hash
  1553. # if count < len(path_len[1][no_change_level:]) - 1:
  1554. # old_path += str(hash(p)) + os.sep
  1555. # new_path += os.sep
  1556. # else:
  1557. # old_path += p
  1558. # count += 1
  1559. #
  1560. # # path是文件夹再加os.sep
  1561. # if os.path.isdir(path_len[0]):
  1562. # new_path += os.sep
  1563. # old_path += os.sep
  1564. # # path是文件再加文件名后缀
  1565. # else:
  1566. # p_ss = path_len[1][-1].split(".")
  1567. # if len(p_ss) > 1:
  1568. # path_suffix = "." + p_ss[-1]
  1569. # new_path += path_suffix
  1570. #
  1571. # print("inner_file_rename", old_path, "to", new_path)
  1572. # os.rename(old_path, new_path)
  1573. # new_path_list.append(new_path)
  1574. #
  1575. # return new_path_list
  1576. # except Exception as e:
  1577. # logging.info("inner_file_rename error!")
  1578. # print("inner_file_rename", traceback.print_exc())
  1579. # return [-1]
  1580. #
  1581. #
  1582. # def rename_inner_files(root_path):
  1583. # try:
  1584. # logging.info("into rename_inner_files")
  1585. # # 获取解压文件夹下所有文件+文件夹,不带根路径
  1586. # path_list = []
  1587. # for root, dirs, files in os.walk(root_path, topdown=False):
  1588. # for name in dirs:
  1589. # p = os.path.join(root, name) + os.sep
  1590. # p = re.sub(root_path, "", p)
  1591. # path_list.append(p)
  1592. # for name in files:
  1593. # p = os.path.join(root, name)
  1594. # p = re.sub(root_path, "", p)
  1595. # path_list.append(p)
  1596. #
  1597. # # 按路径长度排序
  1598. # path_list.sort(key=lambda x: len(x), reverse=True)
  1599. #
  1600. # # 循环改名
  1601. # for old_path in path_list:
  1602. # # 按路径分隔符分割
  1603. # ss = old_path.split(os.sep)
  1604. # # 判断是否文件夹
  1605. # is_dir = 0
  1606. # file_type = ""
  1607. # if os.path.isdir(root_path + old_path):
  1608. # ss = ss[:-1]
  1609. # is_dir = 1
  1610. # else:
  1611. # if "." in old_path:
  1612. # file_type = "." + old_path.split(".")[-1]
  1613. # else:
  1614. # file_type = ""
  1615. #
  1616. # # 最后一级需要用hash改名
  1617. # new_path = ""
  1618. # # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  1619. # current_level = 0
  1620. # for s in ss:
  1621. # # 路径拼接
  1622. # if current_level < len(ss) - 1:
  1623. # new_path += s + os.sep
  1624. # else:
  1625. # new_path += str(hash(s)) + file_type
  1626. # current_level += 1
  1627. #
  1628. # new_ab_path = root_path + new_path
  1629. # old_ab_path = root_path + old_path
  1630. # os.rename(old_ab_path, new_ab_path)
  1631. #
  1632. # # 重新获取解压文件夹下所有文件+文件夹
  1633. # new_path_list = []
  1634. # for root, dirs, files in os.walk(root_path, topdown=False):
  1635. # for name in dirs:
  1636. # new_path_list.append(os.path.join(root, name) + os.sep)
  1637. # for name in files:
  1638. # new_path_list.append(os.path.join(root, name))
  1639. # # print("new_path_list", new_path_list)
  1640. # return new_path_list
  1641. # except:
  1642. # traceback.print_exc()
  1643. # return [-1]
  1644. #
  1645. #
  1646. # @get_memory_info.memory_decorator
  1647. # def xls2text(path, unique_type_dir):
  1648. # logging.info("into xls2text")
  1649. # try:
  1650. # # 调用libreoffice格式转换
  1651. # file_path = from_office_interface(path, unique_type_dir, 'xlsx')
  1652. # # if file_path == [-3]:
  1653. # # return [-3]
  1654. # if judge_error_code(file_path):
  1655. # return file_path
  1656. #
  1657. # text = xlsx2text(file_path, unique_type_dir)
  1658. # # if text == [-1]:
  1659. # # return [-1]
  1660. # # if text == [-3]:
  1661. # # return [-3]
  1662. # if judge_error_code(text):
  1663. # return text
  1664. #
  1665. # return text
  1666. # except Exception as e:
  1667. # logging.info("xls2text error!")
  1668. # print("xls2text", traceback.print_exc())
  1669. # return [-1]
  1670. #
  1671. #
  1672. # @get_memory_info.memory_decorator
  1673. # def xlsx2text(path, unique_type_dir):
  1674. # logging.info("into xlsx2text")
  1675. # try:
  1676. # try:
  1677. # # sheet_name=None, 即拿取所有sheet,存为dict
  1678. # df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
  1679. # except Exception as e:
  1680. # logging.info("xlsx format error!")
  1681. # # print("xlsx format error!", e)
  1682. # return [-3]
  1683. #
  1684. # df_list = [sheet for sheet in df_dict.values()]
  1685. # sheet_text = ""
  1686. # for df in df_list:
  1687. # text = '<table border="1">' + "\n"
  1688. # for index, row in df.iterrows():
  1689. # text = text + "<tr>"
  1690. # for r in row:
  1691. # text = text + "<td>" + str(r) + "</td>" + "\n"
  1692. # # print(text)
  1693. # text = text + "</tr>" + "\n"
  1694. # text = text + "</table>" + "\n"
  1695. # sheet_text += text
  1696. #
  1697. # return [sheet_text]
  1698. # except Exception as e:
  1699. # logging.info("xlsx2text error!")
  1700. # print("xlsx2text", traceback.print_exc())
  1701. # return [-1]
  1702. #
  1703. #
  1704. # @get_memory_info.memory_decorator
  1705. # def swf2text(path, unique_type_dir):
  1706. # logging.info("into swf2text")
  1707. # try:
  1708. # try:
  1709. # with open(path, 'rb') as f:
  1710. # swf_file = SWF(f)
  1711. # svg_exporter = SVGExporter()
  1712. # svg = swf_file.export(svg_exporter)
  1713. # # with open('swf_export.jpg', 'wb') as f:
  1714. # # f.write(svg.read())
  1715. # swf_str = str(svg.getvalue(), encoding='utf-8')
  1716. # except Exception as e:
  1717. # logging.info("swf format error!")
  1718. # traceback.print_exc()
  1719. # return [-3]
  1720. #
  1721. # # 正则匹配图片的信息位置
  1722. # result0 = re.finditer('<image id=(.[^>]*)', swf_str)
  1723. # image_bytes_list = []
  1724. # i = 0
  1725. # image_path_prefix = path.split(".")[-2] + "_" + path.split(".")[-1]
  1726. # image_path_list = []
  1727. # for r in result0:
  1728. # # 截取图片信息所在位置
  1729. # swf_str0 = swf_str[r.span()[0]:r.span()[1] + 1]
  1730. #
  1731. # # 正则匹配得到图片的base64编码
  1732. # result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
  1733. # swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
  1734. # reg1_prefix = 'b\''
  1735. # result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
  1736. # swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
  1737. #
  1738. # # base64_str -> base64_bytes -> no "\\" base64_bytes -> bytes -> image
  1739. # base64_bytes_with_double = bytes(swf_str1, "utf-8")
  1740. # base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
  1741. # image_bytes = base64.b64decode(base64_bytes)
  1742. # image_bytes_list.append(image_bytes)
  1743. # image_path = image_path_prefix + "_page_" + str(i) + ".png"
  1744. # with open(image_path, 'wb') as f:
  1745. # f.write(image_bytes)
  1746. #
  1747. # image_path_list.append(image_path)
  1748. # # 正则匹配得到图片的宽高
  1749. # # reg2_prefix = 'width="'
  1750. # # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
  1751. # # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
  1752. # # width = swf_str2
  1753. # # reg2_prefix = 'height="'
  1754. # # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
  1755. # # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
  1756. # # height = swf_str2
  1757. # i += 1
  1758. #
  1759. # text_list = []
  1760. # # print("image_path_list", image_path_list)
  1761. # for image_path in image_path_list:
  1762. # text = picture2text(image_path)
  1763. # # print("text", text)
  1764. #
  1765. # if judge_error_code(text, code=[-3]):
  1766. # continue
  1767. # if judge_error_code(text):
  1768. # return text
  1769. #
  1770. # text = text[0]
  1771. # text_list.append(text)
  1772. #
  1773. # text = ""
  1774. # for t in text_list:
  1775. # text += t
  1776. #
  1777. # return [text]
  1778. # except Exception as e:
  1779. # logging.info("swf2text error!")
  1780. # print("swf2text", traceback.print_exc())
  1781. # return [-1]
  1782. #
  1783. #
  1784. # @get_memory_info.memory_decorator
  1785. # def picture2text(path, html=False):
  1786. # logging.info("into picture2text")
  1787. # try:
  1788. # # 判断图片中表格
  1789. # img = cv2.imread(path)
  1790. # if img is None:
  1791. # return [-3]
  1792. #
  1793. # # if get_platform() == "Windows":
  1794. # # print("picture2text img", img)
  1795. #
  1796. # text, column_list, outline_points, is_table = image_preprocess(img, path)
  1797. # if judge_error_code(text):
  1798. # return text
  1799. # # if text == [-5]:
  1800. # # return [-5]
  1801. # # if text == [-2]:
  1802. # # return [-2]
  1803. # # if text == [-1]:
  1804. # # return [-1]
  1805. #
  1806. # if html:
  1807. # text = add_div(text)
  1808. # return [text]
  1809. # except Exception as e:
  1810. # logging.info("picture2text error!")
  1811. # print("picture2text", traceback.print_exc())
  1812. # return [-1]
  1813. #
  1814. #
  1815. # @get_memory_info.memory_decorator
  1816. # def from_ocr_interface(image_stream, is_table=False):
  1817. # logging.info("into from_ocr_interface")
  1818. # try:
  1819. # base64_stream = base64.b64encode(image_stream)
  1820. #
  1821. # # 调用接口
  1822. # try:
  1823. # r = ocr(data=base64_stream, ocr_model=globals().get("global_ocr_model"))
  1824. # except TimeoutError:
  1825. # if is_table:
  1826. # return [-5], [-5]
  1827. # else:
  1828. # return [-5]
  1829. # except requests.exceptions.ConnectionError as e:
  1830. # if is_table:
  1831. # return [-2], [-2]
  1832. # else:
  1833. # return [-2]
  1834. #
  1835. # _dict = r
  1836. # text_list = eval(_dict.get("text"))
  1837. # bbox_list = eval(_dict.get("bbox"))
  1838. # if text_list is None:
  1839. # text_list = []
  1840. # if bbox_list is None:
  1841. # bbox_list = []
  1842. #
  1843. # if is_table:
  1844. # return text_list, bbox_list
  1845. # else:
  1846. # if text_list and bbox_list:
  1847. # text = get_sequential_data(text_list, bbox_list, html=True)
  1848. # if judge_error_code(text):
  1849. # return text
  1850. # # if text == [-1]:
  1851. # # return [-1]
  1852. # else:
  1853. # text = ""
  1854. # return text
  1855. # except Exception as e:
  1856. # logging.info("from_ocr_interface error!")
  1857. # # print("from_ocr_interface", e, global_type)
  1858. # if is_table:
  1859. # return [-1], [-1]
  1860. # else:
  1861. # return [-1]
  1862. #
  1863. #
  1864. # @get_memory_info.memory_decorator
  1865. # def from_otr_interface(image_stream):
  1866. # logging.info("into from_otr_interface")
  1867. # try:
  1868. # base64_stream = base64.b64encode(image_stream)
  1869. #
  1870. # # 调用接口
  1871. # try:
  1872. # r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
  1873. # except TimeoutError:
  1874. # return [-5], [-5], [-5], [-5]
  1875. # except requests.exceptions.ConnectionError as e:
  1876. # logging.info("from_otr_interface")
  1877. # print("from_otr_interface", traceback.print_exc())
  1878. # return [-2], [-2], [-2], [-2]
  1879. #
  1880. # # 处理结果
  1881. # _dict = r
  1882. # points = eval(_dict.get("points"))
  1883. # split_lines = eval(_dict.get("split_lines"))
  1884. # bboxes = eval(_dict.get("bboxes"))
  1885. # outline_points = eval(_dict.get("outline_points"))
  1886. # # print("from_otr_interface len(bboxes)", len(bboxes))
  1887. # if points is None:
  1888. # points = []
  1889. # if split_lines is None:
  1890. # split_lines = []
  1891. # if bboxes is None:
  1892. # bboxes = []
  1893. # if outline_points is None:
  1894. # outline_points = []
  1895. # return points, split_lines, bboxes, outline_points
  1896. # except Exception as e:
  1897. # logging.info("from_otr_interface error!")
  1898. # print("from_otr_interface", traceback.print_exc())
  1899. # return [-1], [-1], [-1], [-1]
  1900. #
  1901. #
  1902. # def from_office_interface(src_path, dest_path, target_format, retry_times=1):
  1903. # try:
  1904. # # Win10跳出超时装饰器
  1905. # if get_platform() == "Windows":
  1906. # # origin_office_convert = office_convert.__wrapped__
  1907. # # file_path = origin_office_convert(src_path, dest_path, target_format, retry_times)
  1908. # file_path = office_convert(src_path, dest_path, target_format, retry_times)
  1909. # else:
  1910. # # 将装饰器包装为一个类,否则多进程Pickle会报错 it's not the same object as xxx 问题,
  1911. # # timeout_decorator_obj = my_timeout_decorator.TimeoutClass(office_convert, 180, TimeoutError)
  1912. # # file_path = timeout_decorator_obj.run(src_path, dest_path, target_format, retry_times)
  1913. #
  1914. # file_path = office_convert(src_path, dest_path, target_format, retry_times)
  1915. #
  1916. # if judge_error_code(file_path):
  1917. # return file_path
  1918. # return file_path
  1919. # except TimeoutError:
  1920. # logging.info("from_office_interface timeout error!")
  1921. # return [-5]
  1922. # except:
  1923. # logging.info("from_office_interface error!")
  1924. # print("from_office_interface", traceback.print_exc())
  1925. # return [-1]
  1926. #
  1927. #
  1928. # def get_sequential_data(text_list, bbox_list, html=False):
  1929. # logging.info("into get_sequential_data")
  1930. # try:
  1931. # text = ""
  1932. # order_list = []
  1933. # for i in range(len(text_list)):
  1934. # length_start = bbox_list[i][0][0]
  1935. # length_end = bbox_list[i][1][0]
  1936. # height_start = bbox_list[i][0][1]
  1937. # height_end = bbox_list[i][-1][1]
  1938. # # print([length_start, length_end, height_start, height_end])
  1939. # order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  1940. # # text = text + infomation['text'] + "\n"
  1941. #
  1942. # if get_platform() == "Windows":
  1943. # print("get_sequential_data", order_list)
  1944. # if not order_list:
  1945. # if get_platform() == "Windows":
  1946. # print("get_sequential_data", "no order list")
  1947. # return ""
  1948. #
  1949. # # 根据bbox的坐标对输出排序
  1950. # order_list.sort(key=lambda x: (x[3], x[1]))
  1951. #
  1952. # # 根据bbox分行分列
  1953. # # col_list = []
  1954. # # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  1955. # # for i in range(len(order_list)):
  1956. # # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  1957. # # col_list.append(order_list[i])
  1958. # # else:
  1959. # # row_list.append(col_list)
  1960. # # col_list = []
  1961. # # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  1962. # # col_list.append(order_list[i])
  1963. # # if i == len(order_list) - 1:
  1964. # # row_list.append(col_list)
  1965. #
  1966. # row_list = []
  1967. # used_box = []
  1968. # threshold = 5
  1969. # for box in order_list:
  1970. # if box in used_box:
  1971. # continue
  1972. #
  1973. # height_center = (box[4] + box[3]) / 2
  1974. # row = []
  1975. # for box2 in order_list:
  1976. # if box2 in used_box:
  1977. # continue
  1978. # height_center2 = (box2[4] + box2[3]) / 2
  1979. # if height_center - threshold <= height_center2 <= height_center + threshold:
  1980. # if box2 not in row:
  1981. # row.append(box2)
  1982. # used_box.append(box2)
  1983. # row.sort(key=lambda x: x[0])
  1984. # row_list.append(row)
  1985. #
  1986. # for row in row_list:
  1987. # if not row:
  1988. # continue
  1989. # if len(row) <= 1:
  1990. # text = text + row[0][0] + "\n"
  1991. # else:
  1992. # sub_text = ""
  1993. # row.sort(key=lambda x: x[1])
  1994. # for col in row:
  1995. # sub_text = sub_text + col[0] + " "
  1996. # sub_text = sub_text + "\n"
  1997. # text += sub_text
  1998. #
  1999. # if html:
  2000. # text = "<div>" + text
  2001. # text = re.sub("\n", "</div>\n<div>", text)
  2002. # text += "</div>"
  2003. # # if text[-5:] == "<div>":
  2004. # # text = text[:-5]
  2005. # return text
  2006. #
  2007. # except Exception as e:
  2008. # logging.info("get_sequential_data error!")
  2009. # print("get_sequential_data", traceback.print_exc())
  2010. # return [-1]
  2011. #
  2012. #
  2013. # def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
  2014. # logging.info("into get_formatted_table")
  2015. # try:
  2016. # # 重新定义text_bbox_list,[point, point, text]
  2017. # text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
  2018. # range(len(text_bbox_list))]
  2019. # # 按纵坐标排序
  2020. # text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  2021. # table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  2022. #
  2023. # # print("text_bbox_list", text_bbox_list)
  2024. # # print("table_bbox_list", table_bbox_list)
  2025. #
  2026. # # bbox位置 threshold
  2027. # threshold = 5
  2028. #
  2029. # # 根据split_line分区,可能有个区多个表格 [(), ()]
  2030. # area_text_bbox_list = []
  2031. # area_table_bbox_list = []
  2032. # # print("get_formatted_table, split_line", split_line)
  2033. # for j in range(1, len(split_line)):
  2034. # last_y = split_line[j - 1][0][1]
  2035. # current_y = split_line[j][0][1]
  2036. # temp_text_bbox_list = []
  2037. # temp_table_bbox_list = []
  2038. #
  2039. # # 找出该区域下text bbox
  2040. # for text_bbox in text_bbox_list:
  2041. # # 计算 text bbox 中心点
  2042. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  2043. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  2044. # if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
  2045. # temp_text_bbox_list.append(text_bbox)
  2046. # area_text_bbox_list.append(temp_text_bbox_list)
  2047. #
  2048. # # 找出该区域下table bbox
  2049. # for table_bbox in table_bbox_list:
  2050. # # 计算 table bbox 中心点
  2051. # table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
  2052. # (table_bbox[1][1] + table_bbox[0][1]) / 2)
  2053. # if last_y < table_bbox_center[1] < current_y:
  2054. # temp_table_bbox_list.append(table_bbox)
  2055. # area_table_bbox_list.append(temp_table_bbox_list)
  2056. #
  2057. # # for j in range(len(area_text_bbox_list)):
  2058. # # print("area_text_bbox_list", j, area_text_bbox_list[j])
  2059. #
  2060. # # 对每个区域分别进行两个bbox匹配,生成表格
  2061. # area_text_list = []
  2062. # area_column_list = []
  2063. # for j in range(len(area_text_bbox_list)):
  2064. # # 每个区域的table bbox 和text bbox
  2065. # temp_table_bbox_list = area_table_bbox_list[j]
  2066. # temp_text_bbox_list = area_text_bbox_list[j]
  2067. #
  2068. # # 判断该区域有无表格bbox
  2069. # # 若无表格,将该区域文字连接
  2070. # if not temp_table_bbox_list:
  2071. # # 找出该区域的所有text bbox
  2072. # only_text_list = []
  2073. # only_bbox_list = []
  2074. # for text_bbox in temp_text_bbox_list:
  2075. # only_text_list.append(text_bbox[2])
  2076. # only_bbox_list.append([text_bbox[0], text_bbox[1]])
  2077. # only_text = get_sequential_data(only_text_list, only_bbox_list, True)
  2078. # if only_text == [-1]:
  2079. # return [-1], [-1]
  2080. # area_text_list.append(only_text)
  2081. # area_column_list.append(0)
  2082. # continue
  2083. #
  2084. # # 有表格
  2085. # # 文本对应的表格格子
  2086. # text_in_table = {}
  2087. # for i in range(len(temp_text_bbox_list)):
  2088. # text_bbox = temp_text_bbox_list[i]
  2089. #
  2090. # # 计算 text bbox 中心点
  2091. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  2092. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  2093. #
  2094. # # 判断中心点在哪个table bbox中
  2095. # for table_bbox in temp_table_bbox_list:
  2096. # # 中心点在table bbox中,将text写入字典
  2097. # if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
  2098. # table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
  2099. # if str(table_bbox) in text_in_table.keys():
  2100. # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  2101. # else:
  2102. # text_in_table[str(table_bbox)] = text_bbox[2]
  2103. # break
  2104. #
  2105. # # 如果未找到text bbox匹配的table bbox,加大threshold匹配
  2106. # # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  2107. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
  2108. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  2109. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  2110. # # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  2111. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  2112. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  2113. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
  2114. # # if str(table_bbox) in text_in_table.keys():
  2115. # # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  2116. # # else:
  2117. # # text_in_table[str(table_bbox)] = text_bbox[2]
  2118. # # break
  2119. #
  2120. # # 对表格格子进行分行分列,并计算总计多少小列
  2121. # # 放入坐标
  2122. # all_col_list = []
  2123. # all_row_list = []
  2124. # for i in range(len(temp_table_bbox_list)):
  2125. # table_bbox = temp_table_bbox_list[i]
  2126. #
  2127. # # 放入所有坐标x
  2128. # if table_bbox[0][0] not in all_col_list:
  2129. # all_col_list.append(table_bbox[0][0])
  2130. # if table_bbox[1][0] not in all_col_list:
  2131. # all_col_list.append(table_bbox[1][0])
  2132. #
  2133. # # 放入所有坐标y
  2134. # if table_bbox[0][1] not in all_row_list:
  2135. # all_row_list.append(table_bbox[0][1])
  2136. # if table_bbox[1][1] not in all_row_list:
  2137. # all_row_list.append(table_bbox[1][1])
  2138. # all_col_list.sort(key=lambda x: x)
  2139. # all_row_list.sort(key=lambda x: x)
  2140. #
  2141. # # 分行
  2142. # row_list = []
  2143. # rows = []
  2144. # temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
  2145. # y_row = temp_table_bbox_list[0][0][1]
  2146. # for i in range(len(temp_table_bbox_list)):
  2147. # table_bbox = temp_table_bbox_list[i]
  2148. #
  2149. # if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
  2150. # rows.append(table_bbox)
  2151. # else:
  2152. # y_row = table_bbox[0][1]
  2153. # if rows:
  2154. # rows.sort(key=lambda x: x[0][0])
  2155. # row_list.append(rows)
  2156. # rows = []
  2157. # rows.append(table_bbox)
  2158. # # print("*" * 30)
  2159. # # print(row_list)
  2160. #
  2161. # if i == len(temp_table_bbox_list) - 1:
  2162. # if rows:
  2163. # rows.sort(key=lambda x: x[0][0])
  2164. # row_list.append(rows)
  2165. #
  2166. # # 生成表格,包括文字和格子宽度
  2167. # area_column = []
  2168. # text = '<table border="1">' + "\n"
  2169. # for row in row_list:
  2170. # text += "<tr>" + "\n"
  2171. # for col in row:
  2172. # # 计算bbox y坐标之间有多少其他点,+1即为所占行数
  2173. # row_span = 1
  2174. # for y in all_row_list:
  2175. # if col[0][1] < y < col[1][1]:
  2176. # if y - col[0][1] >= 2 and col[1][1] - y >= 2:
  2177. # row_span += 1
  2178. #
  2179. # # 计算bbox x坐标之间有多少其他点,+1即为所占列数
  2180. # col_span = 1
  2181. # for x in all_col_list:
  2182. # if col[0][0] < x < col[1][0]:
  2183. # if x - col[0][0] >= 2 and col[1][0] - x >= 2:
  2184. # col_span += 1
  2185. #
  2186. # text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  2187. #
  2188. # if str(col) in text_in_table.keys():
  2189. # text += text_in_table.get(str(col))
  2190. # else:
  2191. # text += ""
  2192. # text += "</td>" + "\n"
  2193. # text += "</tr>" + "\n"
  2194. # text += "</table>" + "\n"
  2195. #
  2196. # # 计算最大column
  2197. # max_col_num = 0
  2198. # for row in row_list:
  2199. # col_num = 0
  2200. # for col in row:
  2201. # col_num += 1
  2202. # if max_col_num < col_num:
  2203. # max_col_num = col_num
  2204. #
  2205. # area_text_list.append(text)
  2206. # area_column_list.append(max_col_num)
  2207. #
  2208. # text = ""
  2209. # if get_platform() == "Windows":
  2210. # print("get_formatted_table area_text_list", area_text_list)
  2211. # for area_text in area_text_list:
  2212. # text += area_text
  2213. # return text, area_column_list
  2214. # except Exception as e:
  2215. # logging.info("get_formatted_table error!")
  2216. # print("get_formatted_table", traceback.print_exc())
  2217. # return [-1], [-1]
  2218. port_num = [0]
  2219. def choose_port():
  2220. process_num = 4
  2221. if port_num[0] % process_num == 0:
  2222. _url = local_url + ":15011"
  2223. elif port_num[0] % process_num == 1:
  2224. _url = local_url + ":15012"
  2225. elif port_num[0] % process_num == 2:
  2226. _url = local_url + ":15013"
  2227. elif port_num[0] % process_num == 3:
  2228. _url = local_url + ":15014"
  2229. port_num[0] = port_num[0] + 1
  2230. return _url
  2231. def getText(_type, path_or_stream):
  2232. print("file type - " + _type)
  2233. logging.info("file type - " + _type)
  2234. try:
  2235. ss = path_or_stream.split(".")
  2236. unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
  2237. except:
  2238. unique_type_dir = path_or_stream + "_" + _type + os.sep
  2239. if _type == "pdf":
  2240. # return pdf2text(path_or_stream, unique_type_dir)
  2241. return PDFConvert(path_or_stream).get_html()
  2242. if _type == "docx":
  2243. return docx2text(path_or_stream, unique_type_dir)
  2244. if _type == "zip":
  2245. return zip2text(path_or_stream, unique_type_dir)
  2246. if _type == "rar":
  2247. return rar2text(path_or_stream, unique_type_dir)
  2248. if _type == "xlsx":
  2249. return xlsx2text(path_or_stream, unique_type_dir)
  2250. if _type == "xls":
  2251. return xls2text(path_or_stream, unique_type_dir)
  2252. if _type == "doc":
  2253. return doc2text(path_or_stream, unique_type_dir)
  2254. if _type == "jpg" or _type == "png" or _type == "jpeg":
  2255. return picture2text(path_or_stream)
  2256. if _type == "swf":
  2257. return swf2text(path_or_stream, unique_type_dir)
  2258. if _type == "txt":
  2259. return txt2text(path_or_stream)
  2260. return [""]
  2261. def to_html(path, text):
  2262. with open(path, 'w',encoding="utf8") as f:
  2263. f.write("<!DOCTYPE HTML>")
  2264. f.write('<head><meta charset="UTF-8"></head>')
  2265. f.write("<body>")
  2266. f.write(text)
  2267. f.write("</body>")
  2268. def resize_image(image_path, size):
  2269. try:
  2270. image_np = cv2.imread(image_path)
  2271. # print(image_np.shape)
  2272. width = image_np.shape[1]
  2273. height = image_np.shape[0]
  2274. h_w_rate = height / width
  2275. # width_standard = 900
  2276. # height_standard = 1400
  2277. width_standard = size[1]
  2278. height_standard = size[0]
  2279. width_new = int(height_standard / h_w_rate)
  2280. height_new = int(width_standard * h_w_rate)
  2281. if width > width_standard:
  2282. image_np = cv2.resize(image_np, (width_standard, height_new))
  2283. elif height > height_standard:
  2284. image_np = cv2.resize(image_np, (width_new, height_standard))
  2285. cv2.imwrite(image_path, image_np)
  2286. # print("resize_image", image_np.shape)
  2287. return
  2288. except Exception as e:
  2289. logging.info("resize_image")
  2290. print("resize_image", e, global_type)
  2291. return
def remove_red_seal(image_np):
    """
    Remove red seal/stamp marks from an image.

    :param image_np: image array as returned by cv2.imread (BGR channel order)
    :return: binarized single-image result with the red seal suppressed
    """
    # Split out the red channel (cv2.split yields B, G, R for BGR input)
    blue_c, green_c, red_c = cv2.split(image_np)
    # With cv2.THRESH_OTSU and thresh=0, OpenCV picks the optimal threshold itself
    thresh, ret = cv2.threshold(red_c, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # print("remove_red_seal thresh", thresh)
    # Empirically a slightly lower threshold works better (code uses 98%)
    filter_condition = int(thresh * 0.98)
    thresh1, red_thresh = cv2.threshold(red_c, filter_condition, 255, cv2.THRESH_BINARY)
    # Expand the single-channel mask back to 3 channels
    image_and = np.expand_dims(red_thresh, axis=2)
    image_and = np.concatenate((image_and, image_and, image_and), axis=-1)
    # print(image_and.shape)
    # Morphological step (original comment said "dilate"; cv2.erode is applied)
    gray = cv2.cvtColor(image_and, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    erode = cv2.erode(gray, kernel)
    # NOTE(review): imshow/waitKey block until a key press — debug leftovers
    # that will hang in a headless/server environment; confirm before shipping.
    cv2.imshow("erode", erode)
    cv2.waitKey(0)
    # Combine inverted blue channel with the inverted eroded mask, then invert back
    image_and = np.bitwise_and(cv2.bitwise_not(blue_c), cv2.bitwise_not(erode))
    result_img = cv2.bitwise_not(image_and)
    cv2.imshow("remove_red_seal", result_img)
    cv2.waitKey(0)
    return result_img
def remove_underline(image_np):
    """
    Detect (and intend to remove) underlines beneath text in an image.

    :param image_np: BGR image array (as from cv2.imread)
    :return: None — currently only shows intermediate debug windows; the
             detected-line mask is computed but not applied back to the image
    """
    # Grayscale
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Adaptive binarization of the inverted grayscale image
    binary = cv2.adaptiveThreshold(~gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                   15, 10)
    # Sobel-style kernels: horizontal-edge (row) and vertical-edge (col) responses
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    # NOTE(review): imshow/waitKey block until a key press — debug only,
    # will hang in headless environments.
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)
    rows, cols = binary.shape
    # Detect horizontal lines: erode then dilate with a wide, 1-pixel-high kernel
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return
  2348. def getMDFFromFile(path):
  2349. _length = 0
  2350. try:
  2351. _md5 = hashlib.md5()
  2352. with open(path, "rb") as ff:
  2353. while True:
  2354. data = ff.read(4096)
  2355. if not data:
  2356. break
  2357. _length += len(data)
  2358. _md5.update(data)
  2359. return _md5.hexdigest(), _length
  2360. except Exception as e:
  2361. traceback.print_exc()
  2362. return None, _length
  2363. def add_html_format(text_list):
  2364. new_text_list = []
  2365. for t in text_list:
  2366. html_t = "<!DOCTYPE HTML>\n"
  2367. html_t += '<head><meta charset="UTF-8"></head>\n'
  2368. html_t += "<body>\n"
  2369. html_t += t
  2370. html_t += "\n</body>\n"
  2371. new_text_list.append(html_t)
  2372. return new_text_list
  2373. @timeout_decorator.timeout(1200, timeout_exception=TimeoutError)
  2374. def unique_temp_file_process(stream, _type):
  2375. logging.info("into unique_temp_file_process")
  2376. try:
  2377. # 每个调用在temp中创建一个唯一空间
  2378. uid1 = uuid.uuid1().hex
  2379. unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
  2380. # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
  2381. # 判断冲突
  2382. if not os.path.exists(unique_space_path):
  2383. if not os.path.exists(_path + os.sep + "temp"):
  2384. os.mkdir(_path + os.sep + "temp" + os.sep)
  2385. os.mkdir(unique_space_path)
  2386. else:
  2387. uid2 = uuid.uuid1().hex
  2388. if not os.path.exists(_path + os.sep + "temp"):
  2389. os.mkdir(_path + os.sep + "temp" + os.sep)
  2390. os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
  2391. # os.mkdir("/mnt/" + "temp/" + uid2 + "/")
  2392. # 在唯一空间中,对传入的文件也保存为唯一
  2393. uid3 = uuid.uuid1().hex
  2394. file_path = unique_space_path + uid3 + "." + _type
  2395. with open(file_path, "wb") as ff:
  2396. ff.write(stream)
  2397. # 跳过一些编号
  2398. pass_md5 = getMDFFromFile(file_path)
  2399. print("getMDFFromFile", pass_md5)
  2400. if pass_md5 == '84dba5a65339f338d3ebdf9f33fae13e'\
  2401. or pass_md5 == '3d9f9f4354582d85b21b060ebd5786db'\
  2402. or pass_md5 == 'b52da40f24c6b29dfc2ebeaefe4e41f1' \
  2403. or pass_md5 == 'eefb925b7ccec1467be20b462fde2a09':
  2404. raise Exception
  2405. text = getText(_type, file_path)
  2406. return text
  2407. except Exception as e:
  2408. # print("Convert error! Delete temp file. ", e, global_type)
  2409. logging.info("unique_temp_file_process")
  2410. print("unique_temp_file_process:", traceback.print_exc())
  2411. return [-1]
  2412. finally:
  2413. print("======================================")
  2414. print("File md5:", getMDFFromFile(file_path))
  2415. try:
  2416. if get_platform() == "Linux":
  2417. # 删除该唯一空间下所有文件
  2418. if os.path.exists(unique_space_path):
  2419. shutil.rmtree(unique_space_path)
  2420. print()
  2421. except Exception as e:
  2422. logging.info("Delete Files Failed!")
  2423. # print("Delete Files Failed!")
  2424. return [-1]
  2425. print("Finally")
  2426. # to_html(_path + "6.html", text[0])
  2427. # to_html(unique_space_path + "result.html", text[0])
  2428. # return text
  2429. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2430. logger = logging.getLogger(__name__)
  2431. def log(msg):
  2432. """
  2433. @summary:打印信息
  2434. """
  2435. logger.info(msg)
  2436. def cut_str(text_list, only_text_list, max_bytes_length=2000000):
  2437. logging.info("into cut_str")
  2438. try:
  2439. # 计算有格式总字节数
  2440. bytes_length = 0
  2441. for text in text_list:
  2442. bytes_length += len(bytes(text, encoding='utf-8'))
  2443. print("text_list", bytes_length)
  2444. # 小于直接返回
  2445. if bytes_length < max_bytes_length:
  2446. print("return text_list no cut")
  2447. return text_list
  2448. # 全部文件连接,重新计算无格式字节数
  2449. all_text = ""
  2450. bytes_length = 0
  2451. for text in only_text_list:
  2452. bytes_length += len(bytes(text, encoding='utf-8'))
  2453. all_text += text
  2454. # print("only_text_list", bytes_length)
  2455. # 小于直接返回
  2456. if bytes_length < max_bytes_length:
  2457. print("return only_text_list no cut")
  2458. return only_text_list
  2459. # 截取字符
  2460. all_text = all_text[:int(max_bytes_length/3)]
  2461. print("text bytes ", len(bytes(all_text, encoding='utf-8')))
  2462. print("return only_text_list has cut")
  2463. return [all_text]
  2464. except Exception as e:
  2465. logging.info("cut_str " + str(e))
  2466. return ["-1"]
  2467. @get_memory_info.memory_decorator
  2468. def convert(data, ocr_model, otr_model):
  2469. """
  2470. 接口返回值:
  2471. {[str], 1}: 处理成功
  2472. {[-1], 0}: 逻辑处理错误
  2473. {[-2], 0}: 接口调用错误
  2474. {[-3], 1}: 文件格式错误,无法打开
  2475. {[-4], 0}: 各类文件调用第三方包读取超时
  2476. {[-5], 0}: 整个转换过程超时
  2477. {[-6], 0}: 阿里云UDF队列超时
  2478. {[-7], 1}: 文件需密码,无法打开
  2479. :return: {"result": [], "is_success": int}
  2480. """
  2481. # 控制内存
  2482. # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
  2483. # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
  2484. logging.info("into convert")
  2485. start_time = time.time()
  2486. try:
  2487. # 模型加入全局变量
  2488. globals().update({"global_ocr_model": ocr_model})
  2489. globals().update({"global_otr_model": otr_model})
  2490. stream = base64.b64decode(data.get("file"))
  2491. _type = data.get("type")
  2492. if get_platform() == "Windows":
  2493. # 解除超时装饰器,直接访问原函数
  2494. origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
  2495. text = origin_unique_temp_file_process(stream, _type)
  2496. else:
  2497. # Linux 通过装饰器设置整个转换超时时间
  2498. try:
  2499. text = unique_temp_file_process(stream, _type)
  2500. except TimeoutError:
  2501. logging.info("convert time out! 1200 sec")
  2502. text = [-5]
  2503. # if text == [-1]:
  2504. # print({"failed result": [-1], "is_success": 0}, time.time() - start_time)
  2505. # return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
  2506. # if text == [-2]:
  2507. # print({"failed result": [-2], "is_success": 0}, time.time() - start_time)
  2508. # return {"result_html": ["-2"], "result_text": ["-2"], "is_success": 0}
  2509. # if text == [-3]:
  2510. # print({"failed result": [-3], "is_success": 1}, time.time() - start_time)
  2511. # return {"result_html": ["-3"], "result_text": ["-3"], "is_success": 1}
  2512. # if text == [-4]:
  2513. # print({"failed result": [-4], "is_success": 0}, time.time() - start_time)
  2514. # return {"result_html": ["-4"], "result_text": ["-4"], "is_success": 0}
  2515. # if text == [-5]:
  2516. # print({"failed result": [-5], "is_success": 0}, time.time() - start_time)
  2517. # return {"result_html": ["-5"], "result_text": ["-5"], "is_success": 0}
  2518. # if text == [-7]:
  2519. # print({"failed result": [-7], "is_success": 1}, time.time() - start_time)
  2520. # return {"result_html": ["-7"], "result_text": ["-7"], "is_success": 1}
  2521. # if text == [-8]:
  2522. # print({"failed result": [-8], "is_success": 0}, time.time() - start_time)
  2523. # return {"result_html": ["-8"], "result_text": ["-8"], "is_success": 1}
  2524. error_code = [[-x] for x in range(1, 9)]
  2525. still_success_code = [[-3], [-7]]
  2526. if text in error_code:
  2527. if text in still_success_code:
  2528. print({"failed result": text, "is_success": 1}, time.time() - start_time)
  2529. return {"result_html": [str(text[0])], "result_text": [str(text[0])],
  2530. "is_success": 1}
  2531. else:
  2532. print({"failed result": text, "is_success": 0}, time.time() - start_time)
  2533. return {"result_html": [str(text[0])], "result_text": [str(text[0])],
  2534. "is_success": 0}
  2535. # 结果保存result.html
  2536. # if get_platform() == "Windows":
  2537. text_str = ""
  2538. for t in text:
  2539. text_str += t
  2540. to_html("../result.html", text_str)
  2541. # 取纯文本
  2542. only_text = []
  2543. for t in text:
  2544. new_t = BeautifulSoup(t, "lxml").get_text()
  2545. new_t = re.sub("\n", "", new_t)
  2546. only_text.append(new_t)
  2547. # 判断长度,过长截取
  2548. text = cut_str(text, only_text)
  2549. only_text = cut_str(only_text, only_text)
  2550. if len(only_text) == 0:
  2551. only_text = [""]
  2552. if only_text[0] == '' and len(only_text) <= 1:
  2553. print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
  2554. else:
  2555. print({"finished result": [str(only_text)[:20], len(str(text))],
  2556. "is_success": 1}, time.time() - start_time)
  2557. return {"result_html": text, "result_text": only_text, "is_success": 1}
  2558. except Exception as e:
  2559. print({"failed result": [-1], "is_success": 0}, time.time() - start_time)
  2560. print("convert", traceback.print_exc())
  2561. return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
# Module-level state shared by the conversion helpers.
global_type = ""  # current file type being processed (reassigned elsewhere — TODO confirm)
local_url = "http://127.0.0.1"
if get_platform() == "Windows":
    # On Windows, work next to this source file.
    _path = os.path.abspath(os.path.dirname(__file__))
else:
    # On Linux, prefer the deployment directory; fall back to the source
    # directory when it does not exist.
    _path = "/home/admin"
    if not os.path.exists(_path):
        _path = os.path.dirname(os.path.abspath(__file__))
if __name__ == '__main__':
    # Manual smoke test: read one local file, run the full conversion
    # pipeline, and rely on convert()'s own prints for the outcome.
    print(os.path.abspath(__file__) + "/../../")
    # Pick a sample file per platform (hard-coded developer paths).
    if get_platform() == "Windows":
        file_path = "C:/Users/Administrator/Desktop/error3.pdf"
    else:
        file_path = "1.doc"
        file_path = "files/error3.pdf"  # overrides the line above; last assignment wins
    with open(file_path, "rb") as f:
        file_bytes = f.read()
    file_base64 = base64.b64encode(file_bytes)
    # Payload shaped like the service request; "filemd5" appears unused here
    # — TODO confirm against the real caller.
    data = {"file": file_base64, "type": file_path.split(".")[-1], "filemd5": 100}
    # Load the OCR / table-recognition models once, then convert.
    ocr_model = ocr_interface.OcrModels().get_model()
    otr_model = otr_interface.OtrModels().get_model()
    result = convert(data, ocr_model, otr_model)