convert.py

# -*- coding: utf-8 -*-
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
from format_convert.convert_doc import doc2text, DocConvert
from format_convert.convert_docx import docx2text, DocxConvert
from format_convert.convert_image import picture2text, ImageConvert
from format_convert.convert_pdf import pdf2text, PDFConvert
from format_convert.convert_rar import rar2text, RarConvert
from format_convert.convert_swf import swf2text, SwfConvert
from format_convert.convert_txt import txt2text, TxtConvert
from format_convert.convert_xls import xls2text, XlsConvert
from format_convert.convert_xlsx import xlsx2text, XlsxConvert
from format_convert.convert_zip import zip2text, ZipConvert
import hashlib
from format_convert import get_memory_info
from format_convert.judge_platform import get_platform
from ocr import ocr_interface
from otr import otr_interface
import re
import shutil
import base64
import time
import uuid
import logging
from bs4 import BeautifulSoup
logging.getLogger("pdfminer").setLevel(logging.WARNING)
from format_convert.table_correct import *
from format_convert import timeout_decorator
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Supported formats: txt doc docx xls xlsx pdf zip rar swf jpg jpeg png
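
# --- Editor's sketch: a minimal, hypothetical illustration of how the format
# list above routes to the imported converters. The real dispatch is this
# package's getText/convert entry point; the signatures used here follow the
# commented-out legacy code below and are assumptions, not the live API.
def _route_by_extension_example(_type, path, unique_type_dir):
    if _type in ("jpg", "jpeg", "png"):
        return picture2text(path)
    routes = {
        "txt": lambda: txt2text(path),
        "doc": lambda: doc2text(path, unique_type_dir),
        "docx": lambda: docx2text(path, unique_type_dir),
        "xls": lambda: xls2text(path, unique_type_dir),
        "xlsx": lambda: xlsx2text(path, unique_type_dir),
        "pdf": lambda: pdf2text(path, unique_type_dir),
        "zip": lambda: zip2text(path, unique_type_dir),
        "rar": lambda: rar2text(path, unique_type_dir),
        "swf": lambda: swf2text(path, unique_type_dir),
    }
    handler = routes.get(_type)
    return handler() if handler else [-3]  # unknown type -> format error code
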
# def judge_error_code(_list, code=[-1, -2, -3, -4, -5, -7]):
#     for c in code:
#         if _list == [c]:
#             return True
#     return False
#
#
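
# Editor's note on the error-code convention, inferred from the handlers in
# this file: converters return a one-element list, [text] on success or
# [code] on failure, where -1 = unexpected error, -3 = unreadable/bad format,
# -4 = timeout and -7 = password-protected (other negatives such as -2 follow
# the same sentinel pattern). judge_error_code above is the shared test:
#     judge_error_code([-3])      # -> True
#     judge_error_code(["text"])  # -> False
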
# def set_timeout(signum, frame):
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#     print("=======================set_timeout")
#
#     raise TimeoutError
#
#
# def log_traceback(func_name):
#     logging.info(func_name)
#     etype, value, tb = sys.exc_info()
#     for line in traceback.TracebackException(
#             type(value), value, tb, limit=None).format(chain=True):
#         logging.info(line)
#
#
# def judge_format(path):
#     guess1 = mimetypes.guess_type(path)
#     _type = None
#     if guess1[0]:
#         _type = guess1[0]
#     else:
#         guess2 = filetype.guess(path)
#         if guess2:
#             _type = guess2.mime
#
#     if _type == "application/pdf":
#         return "pdf"
#     if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
#         return "docx"
#     if _type == "application/x-zip-compressed" or _type == "application/zip":
#         return "zip"
#     if _type == "application/x-rar-compressed" or _type == "application/rar":
#         return "rar"
#     if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
#         return "xlsx"
#     if _type == "application/msword":
#         return "doc"
#     if _type == "image/png":
#         return "png"
#     if _type == "image/jpeg":
#         return "jpg"
#
#     # Cannot guess, return None
#     return None
#
#
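
# A runnable sketch of the MIME sniffing in judge_format above (assumes the
# `filetype` package is installed): try the filename first, then the file's
# magic bytes. The mapping here is abridged for illustration.
def _guess_format_example(path):
    import mimetypes
    import filetype
    mime = mimetypes.guess_type(path)[0]  # guess from the extension
    if mime is None:
        kind = filetype.guess(path)  # guess from the magic bytes
        mime = kind.mime if kind else None
    return {"application/pdf": "pdf", "image/png": "png",
            "image/jpeg": "jpg"}.get(mime)
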
# @get_memory_info.memory_decorator
# def txt2text(path):
#     logging.info("into txt2text")
#     try:
#         # Detect the character encoding
#         with open(path, "rb") as ff:
#             data = ff.read()
#             encode = chardet.detect(data).get("encoding")
#         print("txt2text judge code is", encode)
#
#         try:
#             if encode is None:
#                 logging.info("txt2text cannot judge file code!")
#                 return [-3]
#             with open(path, "r", encoding=encode) as ff:
#                 txt_text = ff.read()
#             return [txt_text]
#         except:
#             logging.info("txt2text cannot open file with code " + encode)
#             return [-3]
#     except Exception as e:
#         print("txt2text", traceback.print_exc())
#         logging.info("txt2text error!")
#         return [-1]
#
#
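
# A self-contained sketch of the chardet-based reading above: detect the
# encoding from the raw bytes, then decode with it (assumes `chardet` is
# installed; the module itself returns [-3] when detection fails).
def _read_text_example(path):
    import chardet
    with open(path, "rb") as f:
        raw = f.read()
    encoding = chardet.detect(raw).get("encoding")
    if encoding is None:
        return None
    return raw.decode(encoding, errors="replace")
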
# @get_memory_info.memory_decorator
# def doc2text(path, unique_type_dir):
#     logging.info("into doc2text")
#     try:
#         # Call the office format conversion
#         file_path = from_office_interface(path, unique_type_dir, 'docx')
#         # if file_path == [-3]:
#         #     return [-3]
#         if judge_error_code(file_path):
#             return file_path
#
#         text = docx2text(file_path, unique_type_dir)
#         return text
#     except Exception as e:
#         logging.info("doc2text error!")
#         print("doc2text", traceback.print_exc())
#         # log_traceback("doc2text")
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# def read_xml_order(path, save_path):
#     logging.info("into read_xml_order")
#     try:
#         try:
#             f = zipfile.ZipFile(path)
#             for file in f.namelist():
#                 if "word/document.xml" == str(file):
#                     f.extract(file, save_path)
#             f.close()
#         except Exception as e:
#             # print("docx format error!", e)
#             logging.info("docx format error!")
#             return [-3]
#
#         # DOMTree = xml.dom.minidom.parse(save_path + "word/document.xml")
#         # collection = DOMTree.documentElement
#
#         try:
#             collection = xml_analyze(save_path + "word/document.xml")
#         except TimeoutError:
#             logging.info("read_xml_order timeout")
#             return [-4]
#
#         body = collection.getElementsByTagName("w:body")[0]
#         order_list = []
#         for line in body.childNodes:
#             # print(str(line))
#             if "w:p" in str(line):
#                 text = line.getElementsByTagName("w:t")
#                 picture = line.getElementsByTagName("wp:docPr")
#                 if text:
#                     order_list.append("w:t")
#                 if picture:
#                     order_list.append("wp:docPr")
#
#                 for line1 in line.childNodes:
#                     if "w:r" in str(line1):
#                         # print("read_xml_order", "w:r")
#                         picture1 = line1.getElementsByTagName("w:pict")
#                         if picture1:
#                             order_list.append("wp:docPr")
#
#             if "w:tbl" in str(line):
#                 order_list.append("w:tbl")
#                 read_xml_table(path, save_path)
#         return order_list
#     except Exception as e:
#         logging.info("read_xml_order error!")
#         print("read_xml_order", traceback.print_exc())
#         # log_traceback("read_xml_order")
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# def read_xml_table(path, save_path):
#     logging.info("into read_xml_table")
#     try:
#         # print("into read_xml_table")
#         try:
#             f = zipfile.ZipFile(path)
#             for file in f.namelist():
#                 if "word/document.xml" == str(file):
#                     f.extract(file, save_path)
#             f.close()
#         except Exception as e:
#             # print("docx format error!", e)
#             logging.info("docx format error!")
#             return [-3]
#
#         # DOMTree = xml.dom.minidom.parse(save_path + "word/document.xml")
#         # collection = DOMTree.documentElement
#
#         try:
#             collection = xml_analyze(save_path + "word/document.xml")
#         except TimeoutError:
#             logging.info("read_xml_table timeout")
#             return [-4]
#
#         body = collection.getElementsByTagName("w:body")[0]
#         table_text_list = []
#         # print("body.childNodes", body.childNodes)
#         for line in body.childNodes:
#             if "w:tbl" in str(line):
#                 # print("str(line)", str(line))
#                 table_text = '<table border="1">' + "\n"
#                 tr_list = line.getElementsByTagName("w:tr")
#                 # print("line.childNodes", line.childNodes)
#                 tr_index = 0
#                 tr_text_list = []
#                 tr_text_list_colspan = []
#                 for tr in tr_list:
#                     table_text = table_text + "<tr rowspan=1>" + "\n"
#                     tc_list = tr.getElementsByTagName("w:tc")
#                     tc_index = 0
#                     tc_text_list = []
#                     for tc in tc_list:
#                         tc_text = ""
#
#                         # How many columns this cell spans
#                         col_span = tc.getElementsByTagName("w:gridSpan")
#                         if col_span:
#                             col_span = int(col_span[0].getAttribute("w:val"))
#                         else:
#                             col_span = 1
#
#                         # Whether this is the empty continuation cell of a vertically merged cell
#                         is_merge = tc.getElementsByTagName("w:vMerge")
#                         if is_merge:
#                             is_merge = is_merge[0].getAttribute("w:val")
#                             if is_merge == "continue":
#                                 col_span_index = 0
#                                 real_tc_index = 0
#
#                                 # if get_platform() == "Windows":
#                                 #     print("read_xml_table tr_text_list", tr_text_list)
#                                 #     print("read_xml_table tr_index", tr_index)
#
#                                 if 0 <= tr_index - 1 < len(tr_text_list):
#                                     for tc_colspan in tr_text_list[tr_index - 1]:
#                                         if col_span_index < tc_index:
#                                             col_span_index += tc_colspan[1]
#                                             real_tc_index += 1
#
#                                     # print("tr_index-1, real_tc_index", tr_index-1, real_tc_index)
#                                     # print(tr_text_list[tr_index-1])
#                                     if real_tc_index < len(tr_text_list[tr_index - 1]):
#                                         tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
#
#                         table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
#                         p_list = tc.getElementsByTagName("w:p")
#
#                         for p in p_list:
#                             t = p.getElementsByTagName("w:t")
#                             if t:
#                                 for tt in t:
#                                     # print("tt", tt.childNodes)
#                                     if len(tt.childNodes) > 0:
#                                         tc_text += tt.childNodes[0].nodeValue
#                                 tc_text += "\n"
#
#                         table_text = table_text + tc_text + "</td>" + "\n"
#                         tc_index += 1
#                         tc_text_list.append([tc_text, col_span])
#                     table_text += "</tr>" + "\n"
#                     tr_index += 1
#                     tr_text_list.append(tc_text_list)
#                 table_text += "</table>" + "\n"
#                 table_text_list.append(table_text)
#         return table_text_list
#
#     except Exception as e:
#         logging.info("read_xml_table error")
#         print("read_xml_table", traceback.print_exc())
#         # log_traceback("read_xml_table")
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
# def xml_analyze(path):
#     # Parse the xml
#     DOMTree = xml.dom.minidom.parse(path)
#     collection = DOMTree.documentElement
#     return collection
#
#
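
# A short runnable sketch of the parsing that xml_analyze wraps: a .docx is a
# zip archive, so pull out word/document.xml and walk its w:t text runs with
# xml.dom.minidom, exactly as read_xml_order/read_xml_table do above.
def _docx_xml_example(docx_path, out_dir):
    import os
    import zipfile
    import xml.dom.minidom
    with zipfile.ZipFile(docx_path) as zf:
        zf.extract("word/document.xml", out_dir)
    dom = xml.dom.minidom.parse(os.path.join(out_dir, "word", "document.xml"))
    root = dom.documentElement
    # Collect the raw text runs in document order.
    return [t.childNodes[0].nodeValue
            for t in root.getElementsByTagName("w:t") if t.childNodes]
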
# def read_docx_table(document):
#     table_text_list = []
#     for table in document.tables:
#         table_text = "<table>\n"
#         print("==================")
#         for row in table.rows:
#             table_text += "<tr>\n"
#             for cell in row.cells:
#                 table_text += "<td>" + cell.text + "</td>\n"
#             table_text += "</tr>\n"
#         table_text += "</table>\n"
#         print(table_text)
#         table_text_list.append(table_text)
#     return table_text_list
#
#
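
# Usage sketch for the python-docx reader above (assumes the `python-docx`
# package). Note that unlike the XML-level reader, python-docx repeats the
# text of merged cells in every row they span.
def _docx_tables_example(path):
    import docx
    document = docx.Document(path)
    return [[[cell.text for cell in row.cells] for row in table.rows]
            for table in document.tables]
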
# @get_memory_info.memory_decorator
# def docx2text(path, unique_type_dir):
#     logging.info("into docx2text")
#     try:
#         try:
#             doc = docx.Document(path)
#         except Exception as e:
#             print("docx format error!", e)
#             print(traceback.print_exc())
#             logging.info("docx format error!")
#             return [-3]
#
#         # Iterate over the paragraphs
#         # print("docx2text extract paragraph")
#         paragraph_text_list = []
#         for paragraph in doc.paragraphs:
#             if paragraph.text != "":
#                 paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
#                 # print("paragraph_text", paragraph.text)
#
#         # Iterate over the tables
#         try:
#             table_text_list = read_xml_table(path, unique_type_dir)
#         except TimeoutError:
#             return [-4]
#
#         if judge_error_code(table_text_list):
#             return table_text_list
#
#         # Iterate over the images in document order
#         # print("docx2text extract image")
#         image_text_list = []
#         temp_image_path = unique_type_dir + "temp_image.png"
#         pattern = re.compile('rId\d+')
#         for graph in doc.paragraphs:
#             for run in graph.runs:
#                 if run.text == '':
#                     try:
#                         if not pattern.search(run.element.xml):
#                             continue
#                         content_id = pattern.search(run.element.xml).group(0)
#                         content_type = doc.part.related_parts[content_id].content_type
#                     except Exception as e:
#                         print("docx no image!", e)
#                         continue
#                     if not content_type.startswith('image'):
#                         continue
#
#                     # Write the image data to a temporary file
#                     img_data = doc.part.related_parts[content_id].blob
#                     with open(temp_image_path, 'wb') as f:
#                         f.write(img_data)
#
#                     # if get_platform() == "Windows":
#                     #     print("img_data", img_data)
#
#                     if img_data is None:
#                         continue
#
#                     # OCR the image text
#                     image_text = picture2text(temp_image_path)
#                     if image_text == [-2]:
#                         return [-2]
#                     if image_text == [-1]:
#                         return [-1]
#                     if image_text == [-3]:
#                         continue
#
#                     image_text = image_text[0]
#                     image_text_list.append(add_div(image_text))
#
#         # Parse document.xml to get the text order
#         # print("docx2text extract order")
#         order_list = read_xml_order(path, unique_type_dir)
#         if order_list == [-2]:
#             return [-2]
#         if order_list == [-1]:
#             return [-1]
#
#         text = ""
#         print("len(order_list)", len(order_list))
#         print("len(paragraph_text_list)", len(paragraph_text_list))
#         print("len(image_text_list)", len(image_text_list))
#         print("len(table_text_list)", len(table_text_list))
#
#         # log("docx2text output in order")
#         for tag in order_list:
#             if tag == "w:t":
#                 if len(paragraph_text_list) > 0:
#                     text += paragraph_text_list.pop(0)
#             if tag == "wp:docPr":
#                 if len(image_text_list) > 0:
#                     text += image_text_list.pop(0)
#             if tag == "w:tbl":
#                 if len(table_text_list) > 0:
#                     text += table_text_list.pop(0)
#         return [text]
#     except Exception as e:
#         # print("docx2text", e, global_type)
#         logging.info("docx2text error!")
#         print("docx2text", traceback.print_exc())
#         # log_traceback("docx2text")
#         return [-1]
#
#
# def add_div(text):
#     if text == "" or text is None:
#         return text
#
#     if get_platform() == "Windows":
#         print("add_div", text)
#     if re.findall("<div>", text):
#         return text
#
#     text = "<div>" + text + "\n"
#     text = re.sub("\n", "</div>\n<div>", text)
#     # text += "</div>"
#     if text[-5:] == "<div>":
#         print("add_div has cut", text[-30:])
#         text = text[:-5]
#     return text
#
#
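
# A condensed, behavior-equivalent sketch of add_div's wrapping rule: every
# newline-terminated line gets wrapped in <div>...</div>, and a dangling
# trailing "<div>" left by the substitution is trimmed.
def _add_div_example(text):
    import re
    if not text or "<div>" in text:
        return text
    text = re.sub("\n", "</div>\n<div>", "<div>" + text + "\n")
    return text[:-5] if text.endswith("<div>") else text
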
# @get_memory_info.memory_decorator
# def pdf2Image(path, save_dir):
#     logging.info("into pdf2Image")
#     try:
#         try:
#             doc = fitz.open(path)
#         except Exception as e:
#             logging.info("pdf format error!")
#             # print("pdf format error!", e)
#             return [-3]
#
#         # output_image_list = []
#         output_image_dict = {}
#         page_count = doc.page_count
#         for page_no in range(page_count):
#             # Limit the pdf page count: only take the first 10 and last 10 pages
#             if page_count > 20:
#                 if 10 <= page_no < page_count-10:
#                     # logging.info("pdf2Image: pdf pages count " + str(doc.page_count)
#                     #              + ", only get 70 pages")
#                     continue
#
#             try:
#                 page = doc.loadPage(page_no)
#                 output = save_dir + "_page" + str(page_no) + ".png"
#                 rotate = int(0)
#                 # A zoom factor of 1.3 per dimension yields an image with 2.6x the resolution.
#                 # Without these settings the default image size is 792x612 at dpi=96.
#                 # (1.33333333 --> 1056x816) (2 --> 1584x1224)
#                 # (1.183, 2.28 --> 1920x1080)
#                 zoom_x = 3.
#                 zoom_y = 3.
#                 mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
#                 pix = page.getPixmap(matrix=mat, alpha=False)
#                 pix.writePNG(output)
#                 pdf_image = cv2.imread(output)
#                 print("pdf_image", page_no, pdf_image.shape)
#                 # output_image_list.append([page_no, output])
#                 output_image_dict[int(page_no)] = output
#             except ValueError as e:
#                 traceback.print_exc()
#                 if str(e) == "page not in document":
#                     logging.info("pdf2Image page not in document! continue..." + str(page_no))
#                     continue
#                 elif "encrypted" in str(e):
#                     logging.info("pdf2Image document need password " + str(page_no))
#                     return [-7]
#             except RuntimeError as e:
#                 if "cannot find page" in str(e):
#                     logging.info("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
#                     continue
#                 else:
#                     traceback.print_exc()
#                     return [-3]
#         return [output_image_dict]
#
#     except Exception as e:
#         logging.info("pdf2Image error!")
#         print("pdf2Image", traceback.print_exc())
#         return [-1]
#
#
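
# A compact sketch of the PyMuPDF rasterisation above, written against the
# current snake_case API (load_page/get_pixmap/save); the legacy code above
# uses the older camelCase names (loadPage/getPixmap/writePNG).
def _pdf_page_to_png_example(pdf_path, page_no, out_path, zoom=3.0):
    import fitz  # PyMuPDF
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_no)
    # zoom=3 renders at three times the default resolution in each dimension.
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
    pix.save(out_path)
    doc.close()
    return out_path
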
# ocr_result_flag = 0
# def image_preprocess(image_np, image_path, use_ocr=True):
#     logging.info("into image_preprocess")
#     try:
#         # height, width
#         # resize_size = (1024, 768)
#         # Limit the image size
#         # resize_image(image_path, resize_size)
#
#         # Deskew the image and write it back to the original path
#         g_r_i = get_rotated_image(image_np, image_path)
#         if g_r_i == [-1]:
#             return [-1], [], [], 0
#
#         # otr needs the image resized; write it to a separate path
#         image_np = cv2.imread(image_path)
#         best_h, best_w = get_best_predict_size(image_np)
#         image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
#         image_resize_path = image_path[:-4] + "_resize" + image_path[-4:]
#         cv2.imwrite(image_resize_path, image_resize)
#
#         # Call the otr (table recognition) model interface
#         with open(image_resize_path, "rb") as f:
#             image_bytes = f.read()
#         points, split_lines, bboxes, outline_points = from_otr_interface(image_bytes)
#         if judge_error_code(points):
#             return points, [], [], 0
#
#         # Scale the bboxes from the resized image back to the original size
#         ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
#         for i in range(len(bboxes)):
#             bbox = bboxes[i]
#             bboxes[i] = [(int(bbox[0][0]*ratio[1]), int(bbox[0][1]*ratio[0])),
#                          (int(bbox[1][0]*ratio[1]), int(bbox[1][1]*ratio[0]))]
#         for i in range(len(split_lines)):
#             line = split_lines[i]
#             split_lines[i] = [(int(line[0][0]*ratio[1]), int(line[0][1]*ratio[0])),
#                               (int(line[1][0]*ratio[1]), int(line[1][1]*ratio[0]))]
#         for i in range(len(points)):
#             point = points[i]
#             points[i] = (int(point[0]*ratio[1]), int(point[1]*ratio[0]))
#
#         for i in range(len(outline_points)):
#             point = outline_points[i]
#             outline_points[i] = [(int(point[0][0]*ratio[1]), int(point[0][1]*ratio[0])),
#                                  (int(point[1][0]*ratio[1]), int(point[1][1]*ratio[0]))]
#
#         # Check visually whether the boxes come out right
#         for box in bboxes:
#             cv2.rectangle(image_np, box[0], box[1], (0, 255, 0), 2)
#         # cv2.namedWindow('bbox', 0)
#         # cv2.imshow("bbox", image_np)
#         # cv2.waitKey(0)
#
#         # Call the ocr model interface
#         with open(image_path, "rb") as f:
#             image_bytes = f.read()
#         # Has a table
#         if len(bboxes) >= 2:
#             text_list, bbox_list = from_ocr_interface(image_bytes, True)
#             if judge_error_code(text_list):
#                 return text_list, [], [], 0
#
#             # for i in range(len(text_list)):
#             #     print(text_list[i], bbox_list[i])
#             # Check visually whether the boxes come out right
#             # for box in bbox_list:
#             #     cv2.rectangle(image_np, (int(box[0][0]), int(box[0][1])),
#             #                   (int(box[2][0]), int(box[2][1])), (255, 0, 0), 1)
#             # cv2.namedWindow('bbox', 0)
#             # cv2.imshow("bbox", image_np)
#             # cv2.waitKey(0)
#
#             text, column_list = get_formatted_table(text_list, bbox_list, bboxes, split_lines)
#             if judge_error_code(text):
#                 return text, [], [], 0
#             is_table = 1
#             return text, column_list, outline_points, is_table
#
#         # No table
#         else:
#             if use_ocr:
#                 text = from_ocr_interface(image_bytes)
#                 if judge_error_code(text):
#                     return text, [], [], 0
#
#                 is_table = 0
#                 return text, [], [], is_table
#             else:
#                 is_table = 0
#                 return None, [], [], is_table
#
#     except Exception as e:
#         logging.info("image_preprocess error")
#         print("image_preprocess", traceback.print_exc())
#         return [-1], [], [], 0
#
#
# def get_best_predict_size2(image_np):
#     sizes = [1280, 1152, 1024, 896, 768, 640, 512, 384, 256, 128]
#
#     min_len = 10000
#     best_height = sizes[0]
#     for height in sizes:
#         if abs(image_np.shape[0] - height) < min_len:
#             min_len = abs(image_np.shape[0] - height)
#             best_height = height
#
#     min_len = 10000
#     best_width = sizes[0]
#     for width in sizes:
#         if abs(image_np.shape[1] - width) < min_len:
#             min_len = abs(image_np.shape[1] - width)
#             best_width = width
#
#     return best_height, best_width
#
#
# def get_best_predict_size(image_np, times=64):
#     sizes = []
#     for i in range(1, 100):
#         if i*times <= 3000:
#             sizes.append(i*times)
#     sizes.sort(key=lambda x: x, reverse=True)
#
#     min_len = 10000
#     best_height = sizes[0]
#     for height in sizes:
#         if abs(image_np.shape[0] - height) < min_len:
#             min_len = abs(image_np.shape[0] - height)
#             best_height = height
#
#     min_len = 10000
#     best_width = sizes[0]
#     for width in sizes:
#         if abs(image_np.shape[1] - width) < min_len:
#             min_len = abs(image_np.shape[1] - width)
#             best_width = width
#
#     return best_height, best_width
#
#
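
# The selector above snaps each image dimension to the nearest multiple of
# `times` (default 64) no larger than 3000; an equivalent one-pass version
# (tie-breaking on equidistant sizes may differ slightly from the loop above):
def _best_predict_size_example(height, width, times=64, limit=3000):
    candidates = [i * times for i in range(1, limit // times + 1)]
    snap = lambda v: min(candidates, key=lambda s: abs(v - s))
    return snap(height), snap(width)
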
# @get_memory_info.memory_decorator
# def pdf2text(path, unique_type_dir):
#     logging.info("into pdf2text")
#     try:
#         # pymupdf pdf to image
#         save_dir = path.split(".")[-2] + "_" + path.split(".")[-1]
#         output_image_dict = pdf2Image(path, save_dir)
#         if judge_error_code(output_image_dict):
#             return output_image_dict
#         output_image_dict = output_image_dict[0]
#         output_image_no_list = list(output_image_dict.keys())
#         output_image_no_list.sort(key=lambda x: x)
#
#         # For each pdf page collect: extracted text, table column counts,
#         # outline points, whether it contains a table, and the page size
#         # page_info_list = []
#         page_info_dict = {}
#         has_table_dict = {}
#         no_table_dict = {}
#         for page_no in output_image_no_list:
#             img_path = output_image_dict.get(page_no)
#             print("pdf page", page_no, "in total", output_image_no_list[-1])
#             # Skip pages whose image cannot be read
#             try:
#                 img = cv2.imread(img_path)
#                 img_size = img.shape
#             except:
#                 logging.info("pdf2text read image in page fail! continue...")
#                 continue
#
#             # Process each page image
#             text, column_list, outline_points, is_table = image_preprocess(img, img_path,
#                                                                            use_ocr=False)
#             if judge_error_code(text):
#                 return text
#
#             # page_info_list.append([text, column_list, outline_points, is_table,
#             #                        page_no, img_size])
#             page_info = [text, column_list, outline_points, is_table, img_size]
#             page_info_dict[int(page_no)] = page_info
#             # Split into pages with and without tables
#             if is_table:
#                 has_table_dict[int(page_no)] = page_info
#             else:
#                 no_table_dict[int(page_no)] = page_info
#
#         has_table_no_list = list(has_table_dict.keys())
#         has_table_no_list.sort(key=lambda x: x)
#         page_no_list = list(page_info_dict.keys())
#         page_no_list.sort(key=lambda x: x)
#
#         # Connect tables that continue across page boundaries
#         table_connect_list, connect_text_list = page_table_connect(has_table_dict)
#         if judge_error_code(table_connect_list):
#             return table_connect_list
#
#         # The connected page numbers
#         table_connect_page_no_list = []
#         for area in connect_text_list:
#             table_connect_page_no_list.append(area[1])
#         print("pdf2text table_connect_list", table_connect_list)
#         print("connect_text_list", connect_text_list)
#
#         # pdfminer approach
#         try:
#             fp = open(path, 'rb')
#             # Create a PDF parser from the file object
#             parser = PDFParser(fp)
#             # Create a PDF document
#             doc = PDFDocument(parser)
#             # Wire the resource manager, device and interpreter together
#             rsrcmgr = PDFResourceManager()
#             device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
#             interpreter = PDFPageInterpreter(rsrcmgr, device)
#
#             # Check whether the pdf can be read at all
#             for page in PDFPage.create_pages(doc):
#                 break
#         except pdfminer.psparser.PSEOF as e:
#             # pdfminer cannot read blank-page objects; OCR the images produced by pymupdf instead
#             logging.info("pdf2text " + str(e) + " use ocr read pdf!")
#             text_list = []
#             for page_no in page_no_list:
#                 logging.info("pdf2text ocr page_no " + str(page_no))
#                 page_info = page_info_dict.get(page_no)
#                 # Table page
#                 if page_info[3]:
#                     # Check whether the table continues across pages
#                     area_no = 0
#                     jump_page = 0
#                     for area in table_connect_list:
#                         if page_no in area:
#                             # Record the text only once
#                             if page_no == area[0]:
#                                 image_text = connect_text_list[area_no][0]
#                                 text_list.append([image_text, page_no, 0])
#                             jump_page = 1
#                         area_no += 1
#
#                     # Connected pages skip the remaining steps
#                     if jump_page:
#                         continue
#
#                     # Take the text directly
#                     image_text = page_info_dict.get(page_no)[0]
#                     text_list.append([image_text, page_no, 0])
#                 # Non-table page
#                 else:
#                     with open(output_image_dict.get(page_no), "rb") as ff:
#                         image_stream = ff.read()
#                     image_text = from_ocr_interface(image_stream)
#                     text_list.append([image_text, page_no, 0])
#
#             text_list.sort(key=lambda z: z[1])
#             text = ""
#             for t in text_list:
#                 text += t[0]
#             return [text]
#         except Exception as e:
#             logging.info("pdf format error!")
#             traceback.print_exc()
#             return [-3]
#
#         text_list = []
#         page_no = 0
#         pages = PDFPage.create_pages(doc)
#         pages = list(pages)
#         page_count = len(pages)
#         for page in pages:
#             logging.info("pdf2text pymupdf page_no " + str(page_no))
#             # Limit the pdf page count, only take the first pages
#             # if page_no >= 70:
#             #     logging.info("pdf2text: pdf pages only get 70 pages")
#             #     break
#             if page_count > 20:
#                 if 10 <= page_no < page_count-10:
#                     page_no += 1
#                     continue
#
#             # If the page is among the table pages, use the already generated text
#             if page_no in has_table_no_list:
#                 # Check whether the table continues across pages
#                 area_no = 0
#                 jump_page = 0
#                 for area in table_connect_list:
#                     if page_no in area:
#                         # Record the text only once
#                         if page_no == area[0]:
#                             image_text = connect_text_list[area_no][0]
#                             text_list.append([image_text, page_no, 0])
#                         jump_page = 1
#                     area_no += 1
#
#                 # Connected pages skip the remaining steps
#                 if jump_page:
#                     page_no += 1
#                     continue
#
#                 # Take the text directly
#                 image_text = has_table_dict.get(page_no)[0]
#                 text_list.append([image_text, page_no, 0])
#                 page_no += 1
#                 continue
#
#             # Pages without tables are parsed from the pdf itself
#             else:
#                 if get_platform() == "Windows":
#                     try:
#                         interpreter.process_page(page)
#                         layout = device.get_result()
#                     except Exception:
#                         logging.info("pdf2text pdfminer read pdf page error! continue...")
#                         continue
#
#                 else:
#                     # Apply a timeout
#                     try:
#                         # Parse the pdf pages that contain no tables
#                         if get_platform() == "Windows":
#                             origin_pdf_analyze = pdf_analyze.__wrapped__
#                             layout = origin_pdf_analyze(interpreter, page, device)
#                         else:
#                             layout = pdf_analyze(interpreter, page, device)
#                     except TimeoutError as e:
#                         logging.info("pdf2text pdfminer read pdf page time out!")
#                         return [-4]
#                     except Exception:
#                         logging.info("pdf2text pdfminer read pdf page error! continue...")
#                         continue
#
#             # Check whether the page has any text objects; if not it may only hold a watermark
#             only_image = 1
#             image_count = 0
#             for x in layout:
#                 if isinstance(x, LTTextBoxHorizontal):
#                     only_image = 0
#                 if isinstance(x, LTFigure):
#                     image_count += 1
#
#             # If the page contains too many images, OCR the whole page directly
#             logging.info("pdf2text image_count " + str(image_count))
#             if image_count >= 3:
#                 image_text = page_info_dict.get(page_no)[0]
#                 if image_text is None:
#                     with open(output_image_dict.get(page_no), "rb") as ff:
#                         image_stream = ff.read()
#                     image_text = from_ocr_interface(image_stream)
#                     if judge_error_code(image_text):
#                         return image_text
#                     page_info_dict[page_no][0] = image_text
#
#                 text_list.append([image_text, page_no, 0])
#                 page_no += 1
#                 continue
#
#             order_list = []
#             for x in layout:
#                 # Whether this object was recognized via ocr
#                 ocr_flag = 0
#
#                 if get_platform() == "Windows":
#                     # print("x", page_no, x)
#                     print()
#
#                 if isinstance(x, LTTextBoxHorizontal):
#                     image_text = x.get_text()
#
#                     # The encoding cannot be decoded (cid escapes): use ocr
#                     if re.search('[(]cid:[0-9]+[)]', image_text):
#                         print(re.search('[(]cid:[0-9]+[)]', image_text))
#                         image_text = page_info_dict.get(page_no)[0]
#                         if image_text is None:
#                             with open(output_image_dict.get(page_no), "rb") as ff:
#                                 image_stream = ff.read()
#                             image_text = from_ocr_interface(image_stream)
#                             if judge_error_code(image_text):
#                                 return image_text
#                             page_info_dict[page_no][0] = image_text
#                         image_text = add_div(image_text)
#                         # order_list.append([image_text, page_no, x.bbox[1]])
#                         order_list = [[image_text, page_no, x.bbox[1]]]
#                         break
#                     else:
#                         image_text = add_div(image_text)
#                         order_list.append([image_text, page_no, x.bbox[1]])
#                     continue
#
#                 if isinstance(x, LTFigure):
#                     for image in x:
#                         if isinstance(image, LTImage):
#                             try:
#                                 print("pdf2text LTImage size", page_no, image.width, image.height)
#                                 image_stream = image.stream.get_data()
#
#                                 # Ignore small images
#                                 if image.width <= 300 and image.height <= 300:
#                                     continue
#
#                                 # Some watermarks make pdf splitting/reading fail
#                                 # if image.width <= 200 and image.height <= 200:
#                                 #     continue
#
#                                 # img_test = Image.open(io.BytesIO(image_stream))
#                                 # img_test.save('temp/LTImage.jpg')
#
#                                 # Check the extracted image size; if too large, raise and
#                                 # OCR the page image rendered by pymupdf instead
#                                 img_test = Image.open(io.BytesIO(image_stream))
#                                 if img_test.size[1] > 2000 or img_test.size[0] > 1500:
#                                     print("pdf2text LTImage stream output size", img_test.size)
#                                     raise Exception
#                                 # Smaller images are saved directly and OCRed
#                                 else:
#                                     img_test.save('temp/LTImage.jpg')
#                                     with open('temp/LTImage.jpg', "rb") as ff:
#                                         image_stream = ff.read()
#                                     image_text = from_ocr_interface(image_stream)
#                                     if judge_error_code(image_text):
#                                         return image_text
#                             # except pdfminer.pdftypes.PDFNotImplementedError:
#                             #     with open(output_image_list[page_no], "rb") as ff:
#                             #         image_stream = ff.read()
#                             except Exception:
#                                 logging.info("pdf2text pdfminer read image in page " + str(page_no) +
#                                              " fail! use pymupdf read image...")
#                                 print(traceback.print_exc())
#                                 image_text = page_info_dict.get(page_no)[0]
#                                 if image_text is None:
#                                     with open(output_image_dict.get(page_no), "rb") as ff:
#                                         image_stream = ff.read()
#                                     image_text = from_ocr_interface(image_stream)
#                                     if judge_error_code(image_text):
#                                         return image_text
#                                     page_info_dict[page_no][0] = image_text
#                                 ocr_flag = 1
#
#                             # Detect a watermark-only page: no text output and only image objects
#                             if image_text == "" and only_image:
#                                 # Split this page out of the pdf
#                                 try:
#                                     logging.info("pdf2text guess pdf has watermark")
#                                     split_path = get_single_pdf(path, page_no)
#                                 except:
#                                     # If splitting raises, it is probably not a watermark; OCR the image
#                                     logging.info("pdf2text guess pdf has no watermark")
#                                     image_text = page_info_dict.get(page_no)[0]
#                                     if image_text is None:
#                                         with open(output_image_dict.get(page_no), "rb") as ff:
#                                             image_stream = ff.read()
#                                         image_text = from_ocr_interface(image_stream)
#                                     order_list.append([image_text, page_no, -1])
#                                     page_info_dict[page_no][0] = image_text
#                                     ocr_flag = 1
#                                     continue
#                                 if judge_error_code(split_path):
#                                     return split_path
#
#                                 # Call the office format conversion
#                                 file_path = from_office_interface(split_path, unique_type_dir, 'html', 3)
#                                 # if file_path == [-3]:
#                                 #     return [-3]
#                                 if judge_error_code(file_path):
#                                     return file_path
#
#                                 # Get the html text
#                                 image_text = get_html_p(file_path)
#                                 if judge_error_code(image_text):
#                                     return image_text
#
#                             if get_platform() == "Windows":
#                                 print("image_text", page_no, x.bbox[1], image_text)
#                                 with open("temp" + str(x.bbox[0]) + ".jpg", "wb") as ff:
#                                     ff.write(image_stream)
#                             image_text = add_div(image_text)
#                             if ocr_flag:
#                                 order_list.append([image_text, page_no, -1])
#                             else:
#                                 order_list.append([image_text, page_no, x.bbox[1]])
#
#             order_list.sort(key=lambda z: z[2], reverse=True)
#
#             # OCR took part in recognition
#             if order_list[-1][2] == -1:
#                 ocr_order_list = [order_list[-1]]
#                 not_ocr_order_list = []
#                 not_ocr_text = ""
#                 # Deduplicate: text may be fetched twice after read failures
#                 for order in order_list:
#                     if order[2] != -1:
#                         not_ocr_order_list.append(order)
#                         not_ocr_text += order[0]
#                 if string_similarity(ocr_order_list[0][0], not_ocr_text) >= 0.85:
#                     order_list = not_ocr_order_list
#                 else:
#                     order_list = ocr_order_list
#
#             for order in order_list:
#                 text_list.append(order)
#             page_no += 1
#
#         text = ""
#         for t in text_list:
#             # text += add_div(t[0])
#             if t[0] is not None:
#                 text += t[0]
#         return [text]
#     except UnicodeDecodeError as e:
#         logging.info("pdf2text pdfminer create pages failed! " + str(e))
#         return [-3]
#     except Exception as e:
#         logging.info("pdf2text error!")
#         print("pdf2text", traceback.print_exc())
#         return [-1]
#
#
# def string_similarity(str1, str2):
#     # Strip the <div> tags and newlines
#     str1 = re.sub("<div>", "", str1)
#     str1 = re.sub("</div>", "", str1)
#     str1 = re.sub("\n", "", str1)
#     str2 = re.sub("<div>", "", str2)
#     str2 = re.sub("</div>", "", str2)
#     str2 = re.sub("\n", "", str2)
#     # print("********************************")
#     # print("str1", str1)
#     # print("********************************")
#     # print("str2", str2)
#     # print("********************************")
#     score = difflib.SequenceMatcher(None, str1, str2).ratio()
#     print("string_similarity", score)
#     return score
#
#
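
# difflib usage sketch: ratio() is 2*M/T over the matched blocks, so the 0.85
# threshold used in pdf2text treats OCR output and pdfminer output as the
# same page content when they share most of their characters.
def _similarity_example(a="abcdef", b="abcxef"):
    import difflib
    return difflib.SequenceMatcher(None, a, b).ratio()  # -> 0.8333...
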
# @get_memory_info.memory_decorator
# @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
# def pdf_analyze(interpreter, page, device):
#     logging.info("into pdf_analyze")
#     # Parse the pdf pages that contain no tables
#     pdf_time = time.time()
#     print("pdf_analyze interpreter process...")
#     interpreter.process_page(page)
#     print("pdf_analyze device get_result...")
#     layout = device.get_result()
#     logging.info("pdf2text read time " + str(time.time()-pdf_time))
#     return layout
#
#
# def get_html_p(html_path):
#     logging.info("into get_html_p")
#     try:
#         with open(html_path, "r") as ff:
#             html_str = ff.read()
#
#         soup = BeautifulSoup(html_str, 'lxml')
#         text = ""
#         for p in soup.find_all("p"):
#             p_text = p.text
#             p_text = p_text.strip()
#             if p.string != "":
#                 text += p_text
#                 text += "\n"
#         return text
#     except Exception as e:
#         logging.info("get_html_p error!")
#         print("get_html_p", traceback.print_exc())
#         return [-1]
#
#
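
# A minimal runnable version of the <p> extraction above (assumes bs4 with
# the lxml parser installed).
def _html_paragraphs_example(html_str="<html><p> a </p><p>b</p></html>"):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_str, "lxml")
    return "\n".join(p.get_text().strip() for p in soup.find_all("p"))
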
# def get_single_pdf(path, page_no):
#     logging.info("into get_single_pdf")
#     try:
#         # print("path, ", path)
#         pdf_origin = PdfFileReader(path, strict=False)
#
#         pdf_new = PdfFileWriter()
#         pdf_new.addPage(pdf_origin.getPage(page_no))
#
#         path_new = path.split(".")[0] + "_split.pdf"
#         with open(path_new, "wb") as ff:
#             pdf_new.write(ff)
#         return path_new
#     except PyPDF2.utils.PdfReadError as e:
#         raise e
#     except Exception as e:
#         logging.info("get_single_pdf error! page " + str(page_no))
#         print("get_single_pdf", traceback.print_exc())
#         raise e
#
#
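
# The same single-page split using the current PyPDF2 3.x names; the legacy
# code above uses the 1.x PdfFileReader/PdfFileWriter API.
def _split_page_example(path, page_no, out_path):
    from PyPDF2 import PdfReader, PdfWriter
    writer = PdfWriter()
    writer.add_page(PdfReader(path, strict=False).pages[page_no])
    with open(out_path, "wb") as f:
        writer.write(f)
    return out_path
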
# def page_table_connect2(has_table_list, page_info_list):
#     logging.info("into page_table_connect")
#     try:
#         # Check whether tables on consecutive pages are connected
#         table_connect_list = []
#         temp_list = []
#         # Allowed distance to the image top/bottom: 1/7 of the page height
#         threshold = 7
#
#         for i in range(1, len(has_table_list)):
#             page_info = has_table_list[i]
#             last_page_info = has_table_list[i - 1]
#
#             # The page numbers must be consecutive
#             if page_info[4] - last_page_info[4] == 1:
#
#                 # The column counts of the previous page's last area and this
#                 # page's first area are both 0 and equal
#                 if not last_page_info[1][-1] and not page_info[1][0] and \
#                         last_page_info[1][-1] == page_info[1][0]:
#
#                     # The previous page's outline must be close enough to the bottom,
#                     # and this page's outline close enough to the top
#                     if last_page_info[5][0] - last_page_info[2][-1][1][1] \
#                             <= int(last_page_info[5][0]/threshold) \
#                             and page_info[2][0][0][1] - 0 \
#                             <= int(page_info[5][0]/threshold):
#                         temp_list.append(last_page_info[4])
#                         temp_list.append(page_info[4])
#                         continue
#
#             # Condition not met: store the connected page numbers collected so far
#             if len(temp_list) > 1:
#                 temp_list = list(set(temp_list))
#                 temp_list.sort(key=lambda x: x)
#                 table_connect_list.append(temp_list)
#             temp_list = []
#         if len(temp_list) > 1:
#             temp_list = list(set(temp_list))
#             temp_list.sort(key=lambda x: x)
#             table_connect_list.append(temp_list)
#             temp_list = []
#
#         # Join the content of the connected pages
#         connect_text_list = []
#         for area in table_connect_list:
#             first_page_no = area[0]
#             for page in page_info_list:
#                 if page[4] == first_page_no:
#                     area_page_text = str(page[0])
#                     break
#             for i in range(1, len(area)):
#                 current_page_no = area[i]
#                 for page in page_info_list:
#                     if page[4] == current_page_no:
#                         current_page_text = str(page[0])
#                         break
#
#                 # Join the two tables
#                 table_prefix = re.finditer('<table border="1">', current_page_text)
#                 index_list = []
#                 for t in table_prefix:
#                     index_list.append(t.span())
#
#                 delete_index = index_list[0]
#                 current_page_text = current_page_text[:delete_index[0]] \
#                                     + current_page_text[delete_index[1]:]
#
#                 table_suffix = re.finditer('</table>', area_page_text)
#                 index_list = []
#                 for t in table_suffix:
#                     index_list.append(t.span())
#
#                 delete_index = index_list[-1]
#                 area_page_text = area_page_text[:delete_index[0]] \
#                                  + area_page_text[delete_index[1]:]
#                 area_page_text = area_page_text + current_page_text
#             connect_text_list.append([area_page_text, area])
#
#         return table_connect_list, connect_text_list
#     except Exception as e:
#         # print("page_table_connect", e)
#         logging.info("page_table_connect error!")
#         print("page_table_connect", traceback.print_exc())
#         return [-1], [-1]
#
#
# def page_table_connect(has_table_dict):
#     logging.info("into page_table_connect")
#     if not has_table_dict:
#         return [], []
#
#     try:
#         # Check whether tables on consecutive pages are connected
#         table_connect_list = []
#         temp_list = []
#         # Allowed distance to the image top/bottom: 1/7 of the page height
#         threshold = 7
#         page_no_list = list(has_table_dict.keys())
#         page_no_list.sort(key=lambda x: x)
#         for i in range(1, len(page_no_list)):
#             page_info = has_table_dict.get(page_no_list[i])
#             last_page_info = has_table_dict.get(page_no_list[i-1])
#             # The page numbers must be consecutive
#             if page_no_list[i] - page_no_list[i-1] == 1:
#                 # The column counts of the previous page's last area and this
#                 # page's first area are both 0 and equal
#                 if not last_page_info[1][-1] and not page_info[1][0] and \
#                         last_page_info[1][-1] == page_info[1][0]:
#
#                     # The previous page's outline must be close enough to the bottom,
#                     # and this page's outline close enough to the top
#                     if last_page_info[4][0] - last_page_info[2][-1][1][1] \
#                             <= int(last_page_info[4][0]/threshold) \
#                             and page_info[2][0][0][1] - 0 \
#                             <= int(page_info[4][0]/threshold):
#                         temp_list.append(page_no_list[i-1])
#                         temp_list.append(page_no_list[i])
#                         continue
#
#             # Condition not met: store the connected page numbers collected so far
#             if len(temp_list) > 1:
#                 temp_list = list(set(temp_list))
#                 temp_list.sort(key=lambda x: x)
#                 table_connect_list.append(temp_list)
#             temp_list = []
#         if len(temp_list) > 1:
#             temp_list = list(set(temp_list))
#             temp_list.sort(key=lambda x: x)
#             table_connect_list.append(temp_list)
#             temp_list = []
#
#         # Join the content of the connected pages
#         connect_text_list = []
#         for area in table_connect_list:
#             first_page_no = area[0]
#             area_page_text = str(has_table_dict.get(first_page_no)[0])
#             for i in range(1, len(area)):
#                 current_page_no = area[i]
#                 current_page_text = str(has_table_dict.get(current_page_no)[0])
#
#                 # Join the two tables
#                 table_prefix = re.finditer('<table border="1">', current_page_text)
#                 index_list = []
#                 for t in table_prefix:
#                     index_list.append(t.span())
#
#                 delete_index = index_list[0]
#                 current_page_text = current_page_text[:delete_index[0]] \
#                                     + current_page_text[delete_index[1]:]
#
#                 table_suffix = re.finditer('</table>', area_page_text)
#                 index_list = []
#                 for t in table_suffix:
#                     index_list.append(t.span())
#
#                 delete_index = index_list[-1]
#                 area_page_text = area_page_text[:delete_index[0]] \
#                                  + area_page_text[delete_index[1]:]
#                 area_page_text = area_page_text + current_page_text
#             connect_text_list.append([area_page_text, area])
#
#         return table_connect_list, connect_text_list
#     except Exception as e:
#         # print("page_table_connect", e)
#         logging.info("page_table_connect error!")
#         print("page_table_connect", traceback.print_exc())
#         return [-1], [-1]
#
#
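
# A focused sketch of the joining step shared by both connectors above: drop
# the first '<table border="1">' of the continuation page and the last
# '</table>' of the preceding page, then concatenate the two fragments.
def _join_tables_example(prev_html, next_html):
    import re
    first_open = next(re.finditer('<table border="1">', next_html)).span()
    next_html = next_html[:first_open[0]] + next_html[first_open[1]:]
    last_close = [m.span() for m in re.finditer('</table>', prev_html)][-1]
    prev_html = prev_html[:last_close[0]] + prev_html[last_close[1]:]
    return prev_html + next_html
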
# @get_memory_info.memory_decorator
# def zip2text(path, unique_type_dir):
#     logging.info("into zip2text")
#     try:
#         zip_path = unique_type_dir
#
#         try:
#             zip_file = zipfile.ZipFile(path)
#             zip_list = zip_file.namelist()
#             # print("zip list namelist", zip_list)
#
#             if get_platform() == "Windows":
#                 if os.path.exists(zip_list[0]):
#                     print("zip2text exists")
#
#             # Extract each file into the target directory
#             file_list = []
#             for f in zip_list:
#                 file_list.append(zip_file.extract(f, path=zip_path))
#             # zip_file.extractall(path=zip_path)
#             zip_file.close()
#
#             # Collect the file names
#             # file_list = []
#             # for root, dirs, files in os.walk(zip_path, topdown=False):
#             #     for name in dirs:
#             #         file_list.append(os.path.join(root, name) + os.sep)
#             #     for name in files:
#             #         file_list.append(os.path.join(root, name))
#             #
#             # # if get_platform() == "Windows":
#             # #     print("file_list", file_list)
#             #
#             # # Filter out doc cache files
#             # temp_list = []
#             # for f in file_list:
#             #     if re.search("~\$", f):
#             #         continue
#             #     else:
#             #         temp_list.append(f)
#             # file_list = temp_list
#
#         except Exception as e:
#             logging.info("zip format error!")
#             print("zip format error!", traceback.print_exc())
#             return [-3]
#
#         # Rename the files inside the archive
#         # file_list = inner_file_rename(file_list)
#         file_list = rename_inner_files(zip_path)
#         if judge_error_code(file_list):
#             return file_list
#
#         if get_platform() == "Windows":
#             print("============= zip file list")
#             # print(file_list)
#
#         text = []
#         for file in file_list:
#             if os.path.isdir(file):
#                 continue
#
#             # No file extension: guess the format
#             if len(file.split(".")) <= 1:
#                 logging.info(str(file) + " has no type! Guess type...")
#                 _type = judge_format(file)
#                 if _type is None:
#                     logging.info(str(file) + " cannot guess type!")
#                     sub_text = [""]
#                 else:
#                     logging.info(str(file) + " guess type: " + _type)
#                     new_file = str(file) + "." + _type
#                     os.rename(file, new_file)
#                     file = new_file
#                     sub_text = getText(_type, file)
#             # Has a file extension: take it from the name
#             else:
#                 _type = file.split(".")[-1]
#                 sub_text = getText(_type, file)
#
#             if judge_error_code(sub_text, code=[-3]):
#                 continue
#             if judge_error_code(sub_text):
#                 return sub_text
#
#             text = text + sub_text
#         return text
#     except Exception as e:
#         logging.info("zip2text error!")
#         print("zip2text", traceback.print_exc())
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# def rar2text(path, unique_type_dir):
#     logging.info("into rar2text")
#     try:
#         rar_path = unique_type_dir
#
#         try:
#             # Shell out to unrar for extraction
#             _signal = os.system("unrar x " + path + " " + rar_path)
#             print("rar2text _signal", _signal)
#             # A return code of 0 means extraction succeeded
#             if _signal != 0:
#                 raise Exception
#         except Exception as e:
#             logging.info("rar format error!")
#             print("rar format error!", e)
#             return [-3]
#
#         # Collect the file names
#         # file_list = []
#         # for root, dirs, files in os.walk(rar_path, topdown=False):
#         #     for name in dirs:
#         #         file_list.append(os.path.join(root, name) + os.sep)
#         #     for name in files:
#         #         file_list.append(os.path.join(root, name))
#
#         if get_platform() == "Windows":
#             print("============= rar file list")
#
#         # Rename the extracted inner files
#         # file_list = inner_file_rename(file_list)
#         file_list = rename_inner_files(rar_path)
#         if judge_error_code(file_list):
#             return file_list
#
#         text = []
#         for file in file_list:
#             if os.path.isdir(file):
#                 continue
#
#             # No file extension: guess the format
#             if len(file.split(".")) <= 1:
#                 logging.info(str(file) + " has no type! Guess type...")
#                 _type = judge_format(file)
#                 if _type is None:
#                     logging.info(str(file) + " cannot guess type!")
#                     sub_text = [""]
#                 else:
#                     logging.info(str(file) + " guess type: " + _type)
#                     new_file = str(file) + "." + _type
#                     os.rename(file, new_file)
#                     file = new_file
#                     sub_text = getText(_type, file)
#             # Has a file extension: take it from the name
#             else:
#                 _type = file.split(".")[-1]
#                 sub_text = getText(_type, file)
#
#             if judge_error_code(sub_text, code=[-3]):
#                 continue
#             if judge_error_code(sub_text):
#                 return sub_text
#
#             # print("sub text", sub_text, file, _type)
#             text = text + sub_text
#         return text
#     except Exception as e:
#         logging.info("rar2text error!")
#         print("rar2text", traceback.print_exc())
#         return [-1]
#
#
# def inner_file_rename(path_list):
#     logging.info("into inner_file_rename")
#     try:
#         # First strip dots '.' out of directory names
#         path_list.sort(key=lambda x: len(x), reverse=True)
#         for i in range(len(path_list)):
#             old_path = path_list[i]
#             # For directories, check whether the last segment needs filtering, then rename
#             if os.path.isdir(old_path):
#                 ps = old_path.split(os.sep)
#                 old_p = ps[-2]
#                 if '.' in old_p:
#                     new_p = re.sub("\\.", "", old_p)
#                     new_path = ""
#                     for p in ps[:-2]:
#                         new_path += p + os.sep
#                     new_path += new_p + os.sep
#
#                     # Rename, then update the list
#                     # print("has .", path_list[i], new_path)
#                     os.rename(old_path, new_path)
#                     for j in range(len(path_list)):
#                         if old_path in path_list[j]:
#                             path_list[j] = re.sub(old_p, new_p, path_list[j]) + os.sep
#
#         # Split each path and rank by the number of segments
#         path_len_list = []
#         for p in path_list:
#             p_ss = p.split(os.sep)
#             temp_p_ss = []
#             for pp in p_ss:
#                 if pp == "":
#                     continue
#                 temp_p_ss.append(pp)
#             p_ss = temp_p_ss
#             path_len_list.append([p, p_ss, len(p_ss)])
#
#         # Rename paths with the fewest segments first, i.e. start from the root directory
#         path_len_list.sort(key=lambda x: x[2])
#
#         # for p in path_len_list:
#         #     print("---", p[1])
#
#         # Work out at which level the directories that stay unchanged end
#         no_change_level = 0
#         loop = 0
#         for p_s in path_len_list[0][1]:
#             if p_s[-4:] == "_rar" or p_s[-4:] == "_zip":
#                 no_change_level += loop
#                 loop = 0
#             loop += 1
#         no_change_level += 1
#
#         # Process each path
#         new_path_list = []
#         for path_len in path_len_list:
#             # The first n segments are the fixed prefix
#             new_path = ""
#             for i in range(no_change_level):
#                 new_path += path_len[1][i] + os.sep
#             old_path = new_path
#
#             if not get_platform() == "Windows":
#                 old_path = os.sep + old_path
#                 new_path = os.sep + new_path
#             # print("path_len[1][3:]", path_len[1][3:])
#
#             count = 0
#             for p in path_len[1][no_change_level:]:
#                 # Every segment of the new path is replaced by its hash
#                 new_path += str(hash(p))
#
#                 # No os.sep after the last segment, and the last segment
#                 # of the old path is not hashed
#                 if count < len(path_len[1][no_change_level:]) - 1:
#                     old_path += str(hash(p)) + os.sep
#                     new_path += os.sep
#                 else:
#                     old_path += p
#                 count += 1
#
#             # If the path is a directory, append os.sep
#             if os.path.isdir(path_len[0]):
#                 new_path += os.sep
#                 old_path += os.sep
#             # If the path is a file, append the file extension
#             else:
#                 p_ss = path_len[1][-1].split(".")
#                 if len(p_ss) > 1:
#                     path_suffix = "." + p_ss[-1]
#                     new_path += path_suffix
#
#             print("inner_file_rename", old_path, "to", new_path)
#             os.rename(old_path, new_path)
#             new_path_list.append(new_path)
#
#         return new_path_list
#     except Exception as e:
#         logging.info("inner_file_rename error!")
#         print("inner_file_rename", traceback.print_exc())
#         return [-1]
#
#
# def rename_inner_files(root_path):
#     try:
#         logging.info("into rename_inner_files")
#         # Collect every file and directory under the extraction folder,
#         # without the root path prefix
#         path_list = []
#         for root, dirs, files in os.walk(root_path, topdown=False):
#             for name in dirs:
#                 p = os.path.join(root, name) + os.sep
#                 p = re.sub(root_path, "", p)
#                 path_list.append(p)
#             for name in files:
#                 p = os.path.join(root, name)
#                 p = re.sub(root_path, "", p)
#                 path_list.append(p)
#
#         # Sort by path length
#         path_list.sort(key=lambda x: len(x), reverse=True)
#
#         # Rename each path in turn
#         for old_path in path_list:
#             # Split on the path separator
#             ss = old_path.split(os.sep)
#             # Check whether it is a directory
#             is_dir = 0
#             file_type = ""
#             if os.path.isdir(root_path + old_path):
#                 ss = ss[:-1]
#                 is_dir = 1
#             else:
#                 if "." in old_path:
#                     file_type = "." + old_path.split(".")[-1]
#                 else:
#                     file_type = ""
#
#             # The last segment is renamed to its hash
#             new_path = ""
#             # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
#             current_level = 0
#             for s in ss:
#                 # Join the path back together
#                 if current_level < len(ss) - 1:
#                     new_path += s + os.sep
#                 else:
#                     new_path += str(hash(s)) + file_type
#                 current_level += 1
#
#             new_ab_path = root_path + new_path
#             old_ab_path = root_path + old_path
#             os.rename(old_ab_path, new_ab_path)
#
#         # Re-collect all files and directories under the extraction folder
#         new_path_list = []
#         for root, dirs, files in os.walk(root_path, topdown=False):
#             for name in dirs:
#                 new_path_list.append(os.path.join(root, name) + os.sep)
#             for name in files:
#                 new_path_list.append(os.path.join(root, name))
#         # print("new_path_list", new_path_list)
#         return new_path_list
#     except:
#         traceback.print_exc()
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# def xls2text(path, unique_type_dir):
#     logging.info("into xls2text")
#     try:
#         # Convert the format via LibreOffice
#         file_path = from_office_interface(path, unique_type_dir, 'xlsx')
#         # if file_path == [-3]:
#         #     return [-3]
#         if judge_error_code(file_path):
#             return file_path
#
#         text = xlsx2text(file_path, unique_type_dir)
#         # if text == [-1]:
#         #     return [-1]
#         # if text == [-3]:
#         #     return [-3]
#         if judge_error_code(text):
#             return text
#
#         return text
#     except Exception as e:
#         logging.info("xls2text error!")
#         print("xls2text", traceback.print_exc())
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# def xlsx2text(path, unique_type_dir):
#     logging.info("into xlsx2text")
#     try:
#         try:
#             # sheet_name=None fetches every sheet, returned as a dict
#             df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
#         except Exception as e:
#             logging.info("xlsx format error!")
#             # print("xlsx format error!", e)
#             return [-3]
#
#         df_list = [sheet for sheet in df_dict.values()]
#         sheet_text = ""
#         for df in df_list:
#             text = '<table border="1">' + "\n"
#             for index, row in df.iterrows():
#                 text = text + "<tr>"
#                 for r in row:
#                     text = text + "<td>" + str(r) + "</td>" + "\n"
#                     # print(text)
#                 text = text + "</tr>" + "\n"
#             text = text + "</table>" + "\n"
#             sheet_text += text
#
#         return [sheet_text]
#     except Exception as e:
#         logging.info("xlsx2text error!")
#         print("xlsx2text", traceback.print_exc())
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# def swf2text(path, unique_type_dir):
#     logging.info("into swf2text")
#     try:
#         try:
#             with open(path, 'rb') as f:
#                 swf_file = SWF(f)
#                 svg_exporter = SVGExporter()
#                 svg = swf_file.export(svg_exporter)
#                 # with open('swf_export.jpg', 'wb') as f:
#                 #     f.write(svg.read())
#                 swf_str = str(svg.getvalue(), encoding='utf-8')
#         except Exception as e:
#             logging.info("swf format error!")
#             traceback.print_exc()
#             return [-3]
#
#         # Regex-match where each image's info sits
#         result0 = re.finditer('<image id=(.[^>]*)', swf_str)
#         image_bytes_list = []
#         i = 0
#         image_path_prefix = path.split(".")[-2] + "_" + path.split(".")[-1]
#         image_path_list = []
#         for r in result0:
#             # Slice out the span holding the image info
#             swf_str0 = swf_str[r.span()[0]:r.span()[1] + 1]
#
#             # Regex out the image's base64 encoding
#             result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
#             swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
#             reg1_prefix = 'b\''
#             result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
#             swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
#
#             # base64_str -> base64_bytes -> no "\\" base64_bytes -> bytes -> image
#             base64_bytes_with_double = bytes(swf_str1, "utf-8")
#             base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
#             image_bytes = base64.b64decode(base64_bytes)
#             image_bytes_list.append(image_bytes)
#             image_path = image_path_prefix + "_page_" + str(i) + ".png"
#             with open(image_path, 'wb') as f:
#                 f.write(image_bytes)
#
#             image_path_list.append(image_path)
#             # Regex out the image's width and height
#             # reg2_prefix = 'width="'
#             # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
#             # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
#             # width = swf_str2
#             # reg2_prefix = 'height="'
#             # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
#             # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
#             # height = swf_str2
#             i += 1
#
#         text_list = []
#         # print("image_path_list", image_path_list)
#         for image_path in image_path_list:
#             text = picture2text(image_path)
#             # print("text", text)
#
#             if judge_error_code(text, code=[-3]):
#                 continue
#             if judge_error_code(text):
#                 return text
#
#             text = text[0]
#             text_list.append(text)
#
#         text = ""
#         for t in text_list:
#             text += t
#
#         return [text]
#     except Exception as e:
#         logging.info("swf2text error!")
#         print("swf2text", traceback.print_exc())
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# def picture2text(path, html=False):
#     logging.info("into picture2text")
#     try:
#         # Detect tables in the image
#         img = cv2.imread(path)
#         if img is None:
#             return [-3]
#
#         # if get_platform() == "Windows":
#         #     print("picture2text img", img)
#
#         text, column_list, outline_points, is_table = image_preprocess(img, path)
#         if judge_error_code(text):
#             return text
#         # if text == [-5]:
#         #     return [-5]
#         # if text == [-2]:
#         #     return [-2]
#         # if text == [-1]:
#         #     return [-1]
#
#         if html:
#             text = add_div(text)
#         return [text]
#     except Exception as e:
#         logging.info("picture2text error!")
#         print("picture2text", traceback.print_exc())
#         return [-1]
#
#
# @get_memory_info.memory_decorator
# def from_ocr_interface(image_stream, is_table=False):
#     logging.info("into from_ocr_interface")
#     try:
#         base64_stream = base64.b64encode(image_stream)
#
#         # Call the OCR interface
#         try:
#             r = ocr(data=base64_stream, ocr_model=globals().get("global_ocr_model"))
#         except TimeoutError:
#             if is_table:
#                 return [-5], [-5]
#             else:
#                 return [-5]
#         except requests.exceptions.ConnectionError as e:
#             if is_table:
#                 return [-2], [-2]
#             else:
#                 return [-2]
#
#         _dict = r
#         text_list = eval(_dict.get("text"))
#         bbox_list = eval(_dict.get("bbox"))
#         if text_list is None:
#             text_list = []
#         if bbox_list is None:
#             bbox_list = []
#
#         if is_table:
#             return text_list, bbox_list
#         else:
#             if text_list and bbox_list:
#                 text = get_sequential_data(text_list, bbox_list, html=True)
#                 if judge_error_code(text):
#                     return text
#                 # if text == [-1]:
#                 #     return [-1]
#             else:
#                 text = ""
#             return text
#     except Exception as e:
#         logging.info("from_ocr_interface error!")
#         # print("from_ocr_interface", e, global_type)
#         if is_table:
#             return [-1], [-1]
#         else:
#             return [-1]
#
#
# @get_memory_info.memory_decorator
# def from_otr_interface(image_stream):
#     logging.info("into from_otr_interface")
#     try:
#         base64_stream = base64.b64encode(image_stream)
#
#         # Call the OTR interface
#         try:
#             r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
#         except TimeoutError:
#             return [-5], [-5], [-5], [-5]
#         except requests.exceptions.ConnectionError as e:
#             logging.info("from_otr_interface")
#             print("from_otr_interface", traceback.print_exc())
#             return [-2], [-2], [-2], [-2]
#
#         # Process the result
#         _dict = r
#         points = eval(_dict.get("points"))
#         split_lines = eval(_dict.get("split_lines"))
#         bboxes = eval(_dict.get("bboxes"))
#         outline_points = eval(_dict.get("outline_points"))
#         # print("from_otr_interface len(bboxes)", len(bboxes))
#         if points is None:
#             points = []
#         if split_lines is None:
#             split_lines = []
#         if bboxes is None:
#             bboxes = []
#         if outline_points is None:
#             outline_points = []
#         return points, split_lines, bboxes, outline_points
#     except Exception as e:
#         logging.info("from_otr_interface error!")
#         print("from_otr_interface", traceback.print_exc())
#         return [-1], [-1], [-1], [-1]
#
#
# def from_office_interface(src_path, dest_path, target_format, retry_times=1):
#     try:
#         # On Windows, bypass the timeout decorator
#         if get_platform() == "Windows":
#             # origin_office_convert = office_convert.__wrapped__
#             # file_path = origin_office_convert(src_path, dest_path, target_format, retry_times)
#             file_path = office_convert(src_path, dest_path, target_format, retry_times)
#         else:
#             # Wrap the decorator in a class, otherwise multiprocess pickling fails
#             # with "it's not the same object as xxx":
#             # timeout_decorator_obj = my_timeout_decorator.TimeoutClass(office_convert, 180, TimeoutError)
#             # file_path = timeout_decorator_obj.run(src_path, dest_path, target_format, retry_times)
#
#             file_path = office_convert(src_path, dest_path, target_format, retry_times)
#
#         if judge_error_code(file_path):
#             return file_path
#         return file_path
#     except TimeoutError:
#         logging.info("from_office_interface timeout error!")
#         return [-5]
#     except:
#         logging.info("from_office_interface error!")
#         print("from_office_interface", traceback.print_exc())
#         return [-1]
#
#
# def get_sequential_data(text_list, bbox_list, html=False):
#     logging.info("into get_sequential_data")
#     try:
#         text = ""
#         order_list = []
#         for i in range(len(text_list)):
#             length_start = bbox_list[i][0][0]
#             length_end = bbox_list[i][1][0]
#             height_start = bbox_list[i][0][1]
#             height_end = bbox_list[i][-1][1]
#             # print([length_start, length_end, height_start, height_end])
#             order_list.append([text_list[i], length_start, length_end, height_start, height_end])
#             # text = text + infomation['text'] + "\n"
#
#         if get_platform() == "Windows":
#             print("get_sequential_data", order_list)
#         if not order_list:
#             if get_platform() == "Windows":
#                 print("get_sequential_data", "no order list")
#             return ""
#
#         # Sort the output by bbox coordinates
#         order_list.sort(key=lambda x: (x[3], x[1]))
#
#         # Split into rows and columns by bbox
#         # col_list = []
#         # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
#         # for i in range(len(order_list)):
#         #     if height_end - threshold <= order_list[i][3] <= height_end + threshold:
#         #         col_list.append(order_list[i])
#         #     else:
#         #         row_list.append(col_list)
#         #         col_list = []
#         #         height_end = int((order_list[i][4] + order_list[i][3]) / 2)
#         #         col_list.append(order_list[i])
#         #     if i == len(order_list) - 1:
#         #         row_list.append(col_list)
#
#         row_list = []
#         used_box = []
#         threshold = 5
#         for box in order_list:
#             if box in used_box:
#                 continue
#
#             height_center = (box[4] + box[3]) / 2
#             row = []
#             for box2 in order_list:
#                 if box2 in used_box:
#                     continue
#                 height_center2 = (box2[4] + box2[3]) / 2
#                 if height_center - threshold <= height_center2 <= height_center + threshold:
#                     if box2 not in row:
#                         row.append(box2)
#                         used_box.append(box2)
#             row.sort(key=lambda x: x[0])
#             row_list.append(row)
#
#         for row in row_list:
#             if not row:
#                 continue
#             if len(row) <= 1:
#                 text = text + row[0][0] + "\n"
#             else:
#                 sub_text = ""
#                 row.sort(key=lambda x: x[1])
#                 for col in row:
#                     sub_text = sub_text + col[0] + " "
#                 sub_text = sub_text + "\n"
#                 text += sub_text
#
#         if html:
#             text = "<div>" + text
#             text = re.sub("\n", "</div>\n<div>", text)
#             text += "</div>"
#             # if text[-5:] == "<div>":
#             #     text = text[:-5]
#         return text
#
#     except Exception as e:
#         logging.info("get_sequential_data error!")
#         print("get_sequential_data", traceback.print_exc())
#         return [-1]
#
#
# def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
#     logging.info("into get_formatted_table")
#     try:
#         # Redefine text_bbox_list as [point, point, text]
#         text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
#                           range(len(text_bbox_list))]
#         # Sort by y coordinate
#         text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
#         table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
#
#         # print("text_bbox_list", text_bbox_list)
#         # print("table_bbox_list", table_bbox_list)
#
#         # Threshold for bbox positions
#         threshold = 5
#
#         # Partition by split_line; one area may hold several tables [(), ()]
#         area_text_bbox_list = []
#         area_table_bbox_list = []
#         # print("get_formatted_table, split_line", split_line)
#         for j in range(1, len(split_line)):
#             last_y = split_line[j - 1][0][1]
#             current_y = split_line[j][0][1]
#             temp_text_bbox_list = []
#             temp_table_bbox_list = []
#
#             # Find the text bboxes inside this area
#             for text_bbox in text_bbox_list:
#                 # Compute the centre of the text bbox
#                 text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
#                                     (text_bbox[1][1] + text_bbox[0][1]) / 2)
#                 if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
#                     temp_text_bbox_list.append(text_bbox)
#             area_text_bbox_list.append(temp_text_bbox_list)
#
#             # Find the table bboxes inside this area
#             for table_bbox in table_bbox_list:
#                 # Compute the centre of the table bbox
#                 table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
#                                     (table_bbox[1][1] + table_bbox[0][1]) / 2)
#                 if last_y < table_bbox_center[1] < current_y:
#                     temp_table_bbox_list.append(table_bbox)
#             area_table_bbox_list.append(temp_table_bbox_list)
#
#         # for j in range(len(area_text_bbox_list)):
#         #     print("area_text_bbox_list", j, area_text_bbox_list[j])
#
#         # Match the two bbox sets area by area and build the tables
#         area_text_list = []
#         area_column_list = []
#         for j in range(len(area_text_bbox_list)):
#             # This area's table bboxes and text bboxes
#             temp_table_bbox_list = area_table_bbox_list[j]
#             temp_text_bbox_list = area_text_bbox_list[j]
#
#             # Check whether the area contains any table bbox;
#             # if there is no table, just join the area's text
#             if not temp_table_bbox_list:
#                 # Collect all text bboxes of the area
#                 only_text_list = []
#                 only_bbox_list = []
#                 for text_bbox in temp_text_bbox_list:
#                     only_text_list.append(text_bbox[2])
#                     only_bbox_list.append([text_bbox[0], text_bbox[1]])
#                 only_text = get_sequential_data(only_text_list, only_bbox_list, True)
#                 if only_text == [-1]:
#                     return [-1], [-1]
#                 area_text_list.append(only_text)
#                 area_column_list.append(0)
#                 continue
#
#             # There is a table: map each text to its table cell
#             text_in_table = {}
#             for i in range(len(temp_text_bbox_list)):
#                 text_bbox = temp_text_bbox_list[i]
#
#                 # Compute the centre of the text bbox
#                 text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
#                                     (text_bbox[1][1] + text_bbox[0][1]) / 2)
#
#                 # Decide which table bbox the centre point falls in
#                 for table_bbox in temp_table_bbox_list:
#                     # Centre point inside the table bbox: record the text in the dict
#                     if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
#                             table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
#                         if str(table_bbox) in text_in_table.keys():
#                             text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
#                         else:
#                             text_in_table[str(table_bbox)] = text_bbox[2]
#                         break
#
#                     # If no table bbox matched the text bbox, retry with a larger threshold
#                     # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
#                     #       table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
#                     #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
#                     #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
#                     #         (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
#                     #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
#                     #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
#                     #          table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
#                     #     if str(table_bbox) in text_in_table.keys():
#                     #         text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
#                     #     else:
#                     #         text_in_table[str(table_bbox)] = text_bbox[2]
#                     #     break
#
#             # Split the cells into rows/columns and count the total sub-columns;
#             # collect the coordinates
#             all_col_list = []
#             all_row_list = []
#             for i in range(len(temp_table_bbox_list)):
#                 table_bbox = temp_table_bbox_list[i]
#
#                 # Collect every x coordinate
#                 if table_bbox[0][0] not in all_col_list:
#                     all_col_list.append(table_bbox[0][0])
#                 if table_bbox[1][0] not in all_col_list:
#                     all_col_list.append(table_bbox[1][0])
#
#                 # Collect every y coordinate
#                 if table_bbox[0][1] not in all_row_list:
#                     all_row_list.append(table_bbox[0][1])
#                 if table_bbox[1][1] not in all_row_list:
#                     all_row_list.append(table_bbox[1][1])
#             all_col_list.sort(key=lambda x: x)
#             all_row_list.sort(key=lambda x: x)
#
#             # Split into rows
#             row_list = []
#             rows = []
#             temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
#             y_row = temp_table_bbox_list[0][0][1]
#             for i in range(len(temp_table_bbox_list)):
#                 table_bbox = temp_table_bbox_list[i]
#
#                 if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
#                     rows.append(table_bbox)
#                 else:
#                     y_row = table_bbox[0][1]
#                     if rows:
#                         rows.sort(key=lambda x: x[0][0])
#                         row_list.append(rows)
#                     rows = []
#                     rows.append(table_bbox)
#                 # print("*" * 30)
#                 # print(row_list)
#
#                 if i == len(temp_table_bbox_list) - 1:
#                     if rows:
#                         rows.sort(key=lambda x: x[0][0])
#                         row_list.append(rows)
#
#             # Build the table, including text and cell spans
#             area_column = []
#             text = '<table border="1">' + "\n"
#             for row in row_list:
#                 text += "<tr>" + "\n"
#                 for col in row:
#                     # Count how many other points lie between the bbox's y coordinates;
#                     # +1 gives the number of rows spanned
#                     row_span = 1
#                     for y in all_row_list:
#                         if col[0][1] < y < col[1][1]:
#                             if y - col[0][1] >= 2 and col[1][1] - y >= 2:
#                                 row_span += 1
#
#                     # Count how many other points lie between the bbox's x coordinates;
#                     # +1 gives the number of columns spanned
#                     col_span = 1
#                     for x in all_col_list:
#                         if col[0][0] < x < col[1][0]:
#                             if x - col[0][0] >= 2 and col[1][0] - x >= 2:
#                                 col_span += 1
#
#                     text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
#
#                     if str(col) in text_in_table.keys():
#                         text += text_in_table.get(str(col))
#                     else:
#                         text += ""
#                     text += "</td>" + "\n"
#                 text += "</tr>" + "\n"
#             text += "</table>" + "\n"
#
#             # Compute the maximum column count
#             max_col_num = 0
#             for row in row_list:
#                 col_num = 0
#                 for col in row:
#                     col_num += 1
#                 if max_col_num < col_num:
#                     max_col_num = col_num
#
#             area_text_list.append(text)
#             area_column_list.append(max_col_num)
#
#         text = ""
#         if get_platform() == "Windows":
#             print("get_formatted_table area_text_list", area_text_list)
#         for area_text in area_text_list:
#             text += area_text
#         return text, area_column_list
#     except Exception as e:
#         logging.info("get_formatted_table error!")
#         print("get_formatted_table", traceback.print_exc())
#         return [-1], [-1]


port_num = [0]


def choose_port():
    # Round-robin over the four local service ports, one per worker process
    process_num = 4
    if port_num[0] % process_num == 0:
        _url = local_url + ":15011"
    elif port_num[0] % process_num == 1:
        _url = local_url + ":15012"
    elif port_num[0] % process_num == 2:
        _url = local_url + ":15013"
    else:
        _url = local_url + ":15014"
    port_num[0] = port_num[0] + 1
    return _url


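# A hedged usage sketch (hypothetical helper, not called anywhere in this
# module): successive calls to choose_port() cycle through ports 15011-15014,
# spreading requests across the four worker processes assumed to listen there.
def _demo_choose_port():
    # Expect :15011, :15012, :15013, :15014, then :15011 again
    for _ in range(5):
        print(choose_port())

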
def getText(_type, path_or_stream):
    print("file type - " + _type)
    logging.info("file type - " + _type)
    try:
        ss = path_or_stream.split(".")
        unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
    except:
        unique_type_dir = path_or_stream + "_" + _type + os.sep

    if _type == "pdf":
        # return pdf2text(path_or_stream, unique_type_dir)
        return PDFConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "docx":
        # return docx2text(path_or_stream, unique_type_dir)
        return DocxConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "zip":
        # return zip2text(path_or_stream, unique_type_dir)
        return ZipConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "rar":
        # return rar2text(path_or_stream, unique_type_dir)
        return RarConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "xlsx":
        # return xlsx2text(path_or_stream, unique_type_dir)
        return XlsxConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "xls":
        # return xls2text(path_or_stream, unique_type_dir)
        return XlsConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "doc":
        # return doc2text(path_or_stream, unique_type_dir)
        return DocConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "jpg" or _type == "png" or _type == "jpeg":
        # return picture2text(path_or_stream)
        return ImageConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "swf":
        # return swf2text(path_or_stream, unique_type_dir)
        return SwfConvert(path_or_stream, unique_type_dir).get_html()
    if _type == "txt":
        # return txt2text(path_or_stream)
        return TxtConvert(path_or_stream, unique_type_dir).get_html()
    return [""]


def to_html(path, text):
    with open(path, 'w', encoding="utf8") as f:
        f.write("<!DOCTYPE HTML>")
        f.write('<head><meta charset="UTF-8"></head>')
        f.write("<body>")
        f.write(text)
        f.write("</body>")


def resize_image(image_path, size):
    # Shrink the image in place so it fits within size = (height, width),
    # keeping the aspect ratio; images already within bounds stay unchanged
    try:
        image_np = cv2.imread(image_path)
        # print(image_np.shape)
        width = image_np.shape[1]
        height = image_np.shape[0]
        h_w_rate = height / width
        # width_standard = 900
        # height_standard = 1400
        width_standard = size[1]
        height_standard = size[0]
        width_new = int(height_standard / h_w_rate)
        height_new = int(width_standard * h_w_rate)
        if width > width_standard:
            image_np = cv2.resize(image_np, (width_standard, height_new))
        elif height > height_standard:
            image_np = cv2.resize(image_np, (width_new, height_standard))
        cv2.imwrite(image_path, image_np)
        # print("resize_image", image_np.shape)
        return
    except Exception as e:
        logging.info("resize_image")
        print("resize_image", e, global_type)
        return


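# Hedged usage sketch for resize_image(); note size is (height, width) and the
# image file is overwritten in place. The path is a made-up example.
def _demo_resize_image():
    resize_image("scanned_page.png", (1400, 900))

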
def remove_red_seal(image_np):
    """
    Remove red seals (stamps) from an image.
    """
    # Split out the red channel
    blue_c, green_c, red_c = cv2.split(image_np)
    # With cv2.THRESH_OTSU and thresh set to 0, OpenCV picks the optimal threshold itself
    thresh, ret = cv2.threshold(red_c, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # print("remove_red_seal thresh", thresh)
    # Empirically a slightly lower threshold works better (0.98 here;
    # the original note suggested 95%)
    filter_condition = int(thresh * 0.98)
    thresh1, red_thresh = cv2.threshold(red_c, filter_condition, 255, cv2.THRESH_BINARY)
    # Turn the mask back into a 3-channel image
    image_and = np.expand_dims(red_thresh, axis=2)
    image_and = np.concatenate((image_and, image_and, image_and), axis=-1)
    # print(image_and.shape)
    # Erode the mask to thicken the remaining strokes
    gray = cv2.cvtColor(image_and, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    erode = cv2.erode(gray, kernel)
    cv2.imshow("erode", erode)
    cv2.waitKey(0)
    # Combine the inverted blue channel with the inverted mask, then invert back
    image_and = np.bitwise_and(cv2.bitwise_not(blue_c), cv2.bitwise_not(erode))
    result_img = cv2.bitwise_not(image_and)
    cv2.imshow("remove_red_seal", result_img)
    cv2.waitKey(0)
    return result_img


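# Hedged usage sketch for remove_red_seal(); the file names are made-up
# examples. The function opens cv2.imshow debug windows, so it blocks until a
# key is pressed.
def _demo_remove_red_seal():
    img = cv2.imread("stamped_page.png")
    if img is not None:
        cv2.imwrite("stamped_page_clean.png", remove_red_seal(img))

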
def remove_underline(image_np):
    """
    Remove underlines beneath text.
    """
    # Convert to grayscale
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Binarize with an adaptive threshold
    binary = cv2.adaptiveThreshold(~gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                                   15, 10)
    # Sobel kernels for horizontal and vertical edges
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)

    rows, cols = binary.shape
    # Detect horizontal lines: erode with a wide, 1-pixel-tall kernel, then dilate
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return


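# Hedged usage sketch: remove_underline() is debug-oriented like
# remove_red_seal() (imshow windows) and expects a BGR array from cv2.imread.
def _demo_remove_underline():
    img = cv2.imread("underlined_text.png")  # made-up example path
    if img is not None:
        remove_underline(img)

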
def getMDFFromFile(path):
    # Stream the file in 4 KB chunks; returns (md5 hexdigest, byte length)
    _length = 0
    try:
        _md5 = hashlib.md5()
        with open(path, "rb") as ff:
            while True:
                data = ff.read(4096)
                if not data:
                    break
                _length += len(data)
                _md5.update(data)
        return _md5.hexdigest(), _length
    except Exception as e:
        traceback.print_exc()
        return None, _length


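# Hedged usage sketch: getMDFFromFile() returns (md5_hexdigest, byte_length),
# or (None, partial_length) if reading fails. The path is a made-up example.
def _demo_getMDFFromFile():
    digest, length = getMDFFromFile("sample.pdf")
    print(digest, length)

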
def add_html_format(text_list):
    new_text_list = []
    for t in text_list:
        html_t = "<!DOCTYPE HTML>\n"
        html_t += '<head><meta charset="UTF-8"></head>\n'
        html_t += "<body>\n"
        html_t += t
        html_t += "\n</body>\n"
        new_text_list.append(html_t)
    return new_text_list


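# Hedged sketch: add_html_format() wraps each fragment between <body> tags of
# a standalone HTML document, mirroring what to_html() writes to disk.
def _demo_add_html_format():
    pages = add_html_format(["<div>page one</div>"])
    print(pages[0])  # starts with "<!DOCTYPE HTML>"

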
@timeout_decorator.timeout(1200, timeout_exception=TimeoutError)
def unique_temp_file_process(stream, _type):
    logging.info("into unique_temp_file_process")
    try:
        # Each call gets a unique workspace under temp
        uid1 = uuid.uuid1().hex
        unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
        # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
        # Guard against collisions
        if not os.path.exists(unique_space_path):
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(unique_space_path)
        else:
            uid2 = uuid.uuid1().hex
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
            # os.mkdir("/mnt/" + "temp/" + uid2 + "/")

        # Inside the workspace, save the incoming file under a unique name too
        uid3 = uuid.uuid1().hex
        file_path = unique_space_path + uid3 + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)

        # Skip a few known-bad files by md5
        # (getMDFFromFile returns an (md5, length) tuple, so unpack the digest)
        pass_md5, _ = getMDFFromFile(file_path)
        print("getMDFFromFile", pass_md5)
        if pass_md5 == '84dba5a65339f338d3ebdf9f33fae13e' \
                or pass_md5 == '3d9f9f4354582d85b21b060ebd5786db' \
                or pass_md5 == 'b52da40f24c6b29dfc2ebeaefe4e41f1' \
                or pass_md5 == 'eefb925b7ccec1467be20b462fde2a09':
            raise Exception

        text = getText(_type, file_path)
        return text
    except Exception as e:
        # print("Convert error! Delete temp file. ", e, global_type)
        logging.info("unique_temp_file_process")
        print("unique_temp_file_process:", traceback.print_exc())
        return [-1]
    finally:
        print("======================================")
        print("File md5:", getMDFFromFile(file_path))
        try:
            if get_platform() == "Linux":
                # Delete everything under the unique workspace
                if os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
            print()
        except Exception as e:
            logging.info("Delete Files Failed!")
            # print("Delete Files Failed!")
            return [-1]
        print("Finally")
        # to_html(_path + "6.html", text[0])
        # to_html(unique_space_path + "result.html", text[0])
        # return text


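# Hedged usage sketch: feed raw bytes plus their type through the temp-file
# pipeline; on Linux the 1200 s timeout decorator applies. Made-up path.
def _demo_unique_temp_file_process():
    with open("sample.pdf", "rb") as f:
        print(unique_temp_file_process(f.read(), "pdf")[:1])

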
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def log(msg):
    """
    @summary: log an info-level message
    """
    logger.info(msg)


def cut_str(text_list, only_text_list, max_bytes_length=2000000):
    logging.info("into cut_str")
    try:
        # Total byte length of the formatted text
        bytes_length = 0
        for text in text_list:
            bytes_length += len(bytes(text, encoding='utf-8'))
        print("text_list", bytes_length)
        # Under the limit: return unchanged
        if bytes_length < max_bytes_length:
            print("return text_list no cut")
            return text_list

        # Join everything and recompute the byte length without formatting
        all_text = ""
        bytes_length = 0
        for text in only_text_list:
            bytes_length += len(bytes(text, encoding='utf-8'))
            all_text += text
        # print("only_text_list", bytes_length)
        # Under the limit: return unchanged
        if bytes_length < max_bytes_length:
            print("return only_text_list no cut")
            return only_text_list

        # Truncate by characters; dividing by 3 budgets up to 3 bytes per UTF-8 character
        all_text = all_text[:int(max_bytes_length / 3)]
        print("text bytes ", len(bytes(all_text, encoding='utf-8')))
        print("return only_text_list has cut")
        return [all_text]
    except Exception as e:
        logging.info("cut_str " + str(e))
        return ["-1"]


@get_memory_info.memory_decorator
def convert(data, ocr_model, otr_model):
    """
    Interface return values:
    {[str], 1}: processed successfully
    {[-1], 0}: internal processing error
    {[-2], 0}: interface call error
    {[-3], 1}: file format error, file cannot be opened
    {[-4], 0}: a third-party reader timed out on the file
    {[-5], 0}: the whole conversion timed out
    {[-6], 0}: Aliyun UDF queue timed out
    {[-7], 1}: file requires a password and cannot be opened
    :return: {"result": [], "is_success": int}
    """
    # Cap memory usage
    # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
    logging.info("into convert")
    start_time = time.time()
    try:
        # Register the models as globals
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")

        if get_platform() == "Windows":
            # Unwrap the timeout decorator and call the original function directly
            origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            text = origin_unique_temp_file_process(stream, _type)
        else:
            # On Linux the decorator caps the time of the whole conversion
            try:
                text = unique_temp_file_process(stream, _type)
            except TimeoutError:
                logging.info("convert time out! 1200 sec")
                text = [-5]

        # if text == [-1]:
        #     print({"failed result": [-1], "is_success": 0}, time.time() - start_time)
        #     return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
        # if text == [-2]:
        #     print({"failed result": [-2], "is_success": 0}, time.time() - start_time)
        #     return {"result_html": ["-2"], "result_text": ["-2"], "is_success": 0}
        # if text == [-3]:
        #     print({"failed result": [-3], "is_success": 1}, time.time() - start_time)
        #     return {"result_html": ["-3"], "result_text": ["-3"], "is_success": 1}
        # if text == [-4]:
        #     print({"failed result": [-4], "is_success": 0}, time.time() - start_time)
        #     return {"result_html": ["-4"], "result_text": ["-4"], "is_success": 0}
        # if text == [-5]:
        #     print({"failed result": [-5], "is_success": 0}, time.time() - start_time)
        #     return {"result_html": ["-5"], "result_text": ["-5"], "is_success": 0}
        # if text == [-7]:
        #     print({"failed result": [-7], "is_success": 1}, time.time() - start_time)
        #     return {"result_html": ["-7"], "result_text": ["-7"], "is_success": 1}
        # if text == [-8]:
        #     print({"failed result": [-8], "is_success": 0}, time.time() - start_time)
        #     return {"result_html": ["-8"], "result_text": ["-8"], "is_success": 1}

        error_code = [[-x] for x in range(1, 9)]
        still_success_code = [[-3], [-7]]
        if text in error_code:
            if text in still_success_code:
                print({"failed result": text, "is_success": 1}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 1}
            else:
                print({"failed result": text, "is_success": 0}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 0}

        # Save the result as result.html
        if get_platform() == "Windows":
            text_str = ""
            for t in text:
                text_str += t
            to_html("../result.html", text_str)

        # Extract the plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)

        # Check the length; truncate when it is too long
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
        else:
            print({"finished result": [str(only_text)[:20], len(str(text))],
                   "is_success": 1}, time.time() - start_time)
        return {"result_html": text, "result_text": only_text, "is_success": 1}
    except Exception as e:
        print({"failed result": [-1], "is_success": 0}, time.time() - start_time)
        print("convert", traceback.print_exc())
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}


global_type = ""
local_url = "http://127.0.0.1"
if get_platform() == "Windows":
    _path = os.path.abspath(os.path.dirname(__file__))
else:
    _path = "/home/admin"
    if not os.path.exists(_path):
        _path = os.path.dirname(os.path.abspath(__file__))


if __name__ == '__main__':
    if get_platform() == "Windows":
        file_path = "C:/Users/Administrator/Desktop/error15.png"
        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/1622529434414.rar"
        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
    else:
        file_path = "error14.pdf"
    with open(file_path, "rb") as f:
        file_bytes = f.read()
    file_base64 = base64.b64encode(file_bytes)
    data = {"file": file_base64, "type": file_path.split(".")[-1], "filemd5": 100}
    ocr_model = ocr_interface.OcrModels().get_model()
    otr_model = otr_interface.OtrModels().get_model()
    result = convert(data, ocr_model, otr_model)