convert_docx.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. import inspect
  2. import os
  3. import sys
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
  6. import logging
  7. import re
  8. import traceback
  9. import xml
  10. import zipfile
  11. import docx
  12. import timeout_decorator
  13. from format_convert import get_memory_info
  14. from format_convert.convert_image import picture2text
  15. from format_convert.utils import judge_error_code, add_div, get_logger, log
  16. @get_memory_info.memory_decorator
  17. def docx2text(path, unique_type_dir):
  18. log("into docx2text")
  19. try:
  20. try:
  21. doc = docx.Document(path)
  22. except Exception as e:
  23. print("docx format error!", e)
  24. print(traceback.print_exc())
  25. log("docx format error!")
  26. return [-3]
  27. # 遍历段落
  28. # print("docx2text extract paragraph")
  29. paragraph_text_list = []
  30. for paragraph in doc.paragraphs:
  31. if paragraph.text != "":
  32. paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
  33. # print("paragraph_text", paragraph.text)
  34. # 遍历表
  35. try:
  36. table_text_list = read_xml_table(path, unique_type_dir)
  37. except TimeoutError:
  38. return [-4]
  39. if judge_error_code(table_text_list):
  40. return table_text_list
  41. # 顺序遍历图片
  42. # print("docx2text extract image")
  43. image_text_list = []
  44. temp_image_path = unique_type_dir + "temp_image.png"
  45. pattern = re.compile('rId\d+')
  46. for graph in doc.paragraphs:
  47. for run in graph.runs:
  48. if run.text == '':
  49. try:
  50. if not pattern.search(run.element.xml):
  51. continue
  52. content_id = pattern.search(run.element.xml).group(0)
  53. content_type = doc.part.related_parts[content_id].content_type
  54. except Exception as e:
  55. print("docx no image!", e)
  56. continue
  57. if not content_type.startswith('image'):
  58. continue
  59. # 写入临时文件
  60. img_data = doc.part.related_parts[content_id].blob
  61. with open(temp_image_path, 'wb') as f:
  62. f.write(img_data)
  63. # if get_platform() == "Windows":
  64. # print("img_data", img_data)
  65. if img_data is None:
  66. continue
  67. # 识别图片文字
  68. image_text = picture2text(temp_image_path)
  69. if image_text == [-2]:
  70. return [-2]
  71. if image_text == [-1]:
  72. return [-1]
  73. if image_text == [-3]:
  74. continue
  75. image_text = image_text[0]
  76. image_text_list.append(add_div(image_text))
  77. # 解析document.xml,获取文字顺序
  78. order_list = read_xml_order(path, unique_type_dir)
  79. if order_list == [-2]:
  80. return [-2]
  81. if order_list == [-1]:
  82. return [-1]
  83. text = ""
  84. # print("len(order_list)", len(order_list))
  85. # print("len(paragraph_text_list)", len(paragraph_text_list))
  86. # print("len(image_text_list)", len(image_text_list))
  87. # print("len(table_text_list)", len(table_text_list))
  88. for tag in order_list:
  89. if tag == "w:t":
  90. if len(paragraph_text_list) > 0:
  91. text += paragraph_text_list.pop(0)
  92. if tag == "wp:docPr":
  93. if len(image_text_list) > 0:
  94. text += image_text_list.pop(0)
  95. if tag == "w:tbl":
  96. if len(table_text_list) > 0:
  97. text += table_text_list.pop(0)
  98. return [text]
  99. except Exception as e:
  100. log("docx2text error!")
  101. print("docx2text", traceback.print_exc())
  102. return [-1]
  103. @get_memory_info.memory_decorator
  104. def read_xml_order(path, save_path):
  105. log("into read_xml_order")
  106. try:
  107. try:
  108. f = zipfile.ZipFile(path)
  109. for file in f.namelist():
  110. if "word/document.xml" == str(file):
  111. f.extract(file, save_path)
  112. f.close()
  113. except Exception as e:
  114. log("docx format error!")
  115. return [-3]
  116. try:
  117. collection = xml_analyze(save_path + "word/document.xml")
  118. except TimeoutError:
  119. log("read_xml_order timeout")
  120. return [-4]
  121. body = collection.getElementsByTagName("w:body")[0]
  122. order_list = []
  123. text_list = []
  124. for line in body.childNodes:
  125. # print(str(line))
  126. if "w:p" in str(line):
  127. text = line.getElementsByTagName("w:t")
  128. picture = line.getElementsByTagName("wp:docPr")
  129. if text:
  130. order_list.append("w:t")
  131. temp_text = ""
  132. for t in text:
  133. if len(t.childNodes) > 0:
  134. temp_text += t.childNodes[0].nodeValue
  135. else:
  136. continue
  137. text_list.append(temp_text)
  138. if picture:
  139. order_list.append("wp:docPr")
  140. for line1 in line.childNodes:
  141. if "w:r" in str(line1):
  142. # print("read_xml_order", "w:r")
  143. picture1 = line1.getElementsByTagName("w:pict")
  144. if picture1:
  145. order_list.append("wp:docPr")
  146. if "w:tbl" in str(line):
  147. order_list.append("w:tbl")
  148. read_xml_table(path, save_path)
  149. return [order_list, text_list]
  150. except Exception as e:
  151. log("read_xml_order error!")
  152. print("read_xml_order", traceback.print_exc())
  153. # log_traceback("read_xml_order")
  154. return [-1]
  155. @get_memory_info.memory_decorator
  156. def read_xml_table(path, save_path):
  157. log("into read_xml_table")
  158. try:
  159. try:
  160. f = zipfile.ZipFile(path)
  161. for file in f.namelist():
  162. if "word/document.xml" == str(file):
  163. f.extract(file, save_path)
  164. f.close()
  165. except Exception as e:
  166. # print("docx format error!", e)
  167. log("docx format error!")
  168. return [-3]
  169. try:
  170. collection = xml_analyze(save_path + "word/document.xml")
  171. except TimeoutError:
  172. log("read_xml_table timeout")
  173. return [-4]
  174. body = collection.getElementsByTagName("w:body")[0]
  175. table_text_list = []
  176. # print("body.childNodes", body.childNodes)
  177. for line in body.childNodes:
  178. if "w:tbl" in str(line):
  179. # print("str(line)", str(line))
  180. table_text = '<table border="1">' + "\n"
  181. tr_list = line.getElementsByTagName("w:tr")
  182. # print("line.childNodes", line.childNodes)
  183. tr_index = 0
  184. tr_text_list = []
  185. tr_text_list_colspan = []
  186. for tr in tr_list:
  187. table_text = table_text + "<tr rowspan=1>" + "\n"
  188. tc_list = tr.getElementsByTagName("w:tc")
  189. tc_index = 0
  190. tc_text_list = []
  191. for tc in tc_list:
  192. tc_text = ""
  193. # 获取一格占多少列
  194. col_span = tc.getElementsByTagName("w:gridSpan")
  195. if col_span:
  196. col_span = int(col_span[0].getAttribute("w:val"))
  197. else:
  198. col_span = 1
  199. # 获取是否是合并单元格的下一个空单元格
  200. is_merge = tc.getElementsByTagName("w:vMerge")
  201. if is_merge:
  202. is_merge = is_merge[0].getAttribute("w:val")
  203. if is_merge == "continue":
  204. col_span_index = 0
  205. real_tc_index = 0
  206. # if get_platform() == "Windows":
  207. # print("read_xml_table tr_text_list", tr_text_list)
  208. # print("read_xml_table tr_index", tr_index)
  209. if 0 <= tr_index - 1 < len(tr_text_list):
  210. for tc_colspan in tr_text_list[tr_index - 1]:
  211. if col_span_index < tc_index:
  212. col_span_index += tc_colspan[1]
  213. real_tc_index += 1
  214. # print("tr_index-1, real_tc_index", tr_index-1, real_tc_index)
  215. # print(tr_text_list[tr_index-1])
  216. if real_tc_index < len(tr_text_list[tr_index - 1]):
  217. tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
  218. table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
  219. p_list = tc.getElementsByTagName("w:p")
  220. for p in p_list:
  221. t = p.getElementsByTagName("w:t")
  222. if t:
  223. for tt in t:
  224. # print("tt", tt.childNodes)
  225. if len(tt.childNodes) > 0:
  226. tc_text += tt.childNodes[0].nodeValue
  227. tc_text += "\n"
  228. table_text = table_text + tc_text + "</td>" + "\n"
  229. tc_index += 1
  230. tc_text_list.append([tc_text, col_span])
  231. table_text += "</tr>" + "\n"
  232. tr_index += 1
  233. tr_text_list.append(tc_text_list)
  234. table_text += "</table>" + "\n"
  235. table_text_list.append(table_text)
  236. return table_text_list
  237. except Exception as e:
  238. log("read_xml_table error")
  239. print("read_xml_table", traceback.print_exc())
  240. return [-1]
  241. @get_memory_info.memory_decorator
  242. @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
  243. def xml_analyze(path):
  244. # 解析xml
  245. DOMTree = xml.dom.minidom.parse(path)
  246. collection = DOMTree.documentElement
  247. return collection
  248. def read_docx_table(document):
  249. table_text_list = []
  250. for table in document.tables:
  251. table_text = "<table>\n"
  252. # print("==================")
  253. for row in table.rows:
  254. table_text += "<tr>\n"
  255. for cell in row.cells:
  256. table_text += "<td>" + cell.text + "</td>\n"
  257. table_text += "</tr>\n"
  258. table_text += "</table>\n"
  259. # print(table_text)
  260. table_text_list.append(table_text)
  261. return table_text_list
  262. class DocxConvert:
  263. def __init__(self, path, unique_type_dir):
  264. self._doc = _Document(path)
  265. self.path = path
  266. self.unique_type_dir = unique_type_dir
  267. def init_package(self):
  268. # 各个包初始化
  269. try:
  270. self.docx = docx.Document(self.path)
  271. self.zip = zipfile.ZipFile(self.path)
  272. except:
  273. log("cannot open docx!")
  274. traceback.print_exc()
  275. self._doc.error_code = [-3]
  276. def convert(self):
  277. self.init_package()
  278. if self._doc.error_code is not None:
  279. return
  280. order_and_text_list = self.get_orders()
  281. if judge_error_code(order_and_text_list):
  282. self._doc.error_code = order_and_text_list
  283. return
  284. order_list, text_list = order_and_text_list
  285. table_list = self.get_tables()
  286. if judge_error_code(table_list):
  287. self._doc.error_code = table_list
  288. return
  289. # paragraph_list = self.get_paragraphs()
  290. image_list = self.get_images()
  291. self._page = _Page(None, 0)
  292. order_y = 0
  293. doc_pr_cnt = 0
  294. for tag in order_list:
  295. bbox = (0, order_y, 0, 0)
  296. if tag == "w:t":
  297. if len(text_list) > 0:
  298. _para = text_list.pop(0)
  299. self._page.add_child(_Sentence(_para, bbox))
  300. if tag == "wp:docPr":
  301. if len(image_list) > 0:
  302. temp_image_path = self.unique_type_dir + "docpr" + str(doc_pr_cnt) + ".png"
  303. _image = image_list.pop(0)
  304. with open(temp_image_path, "wb") as f:
  305. f.write(_image)
  306. _img = _Image(_image, temp_image_path, bbox)
  307. _img.is_from_docx = True
  308. self._page.add_child(_img)
  309. doc_pr_cnt += 1
  310. if tag == "w:tbl":
  311. if len(table_list) > 0:
  312. _table = table_list.pop(0)
  313. _table = _Table(_table, bbox)
  314. _table.is_html = True
  315. self._page.add_child(_table)
  316. order_y += 1
  317. if self._doc.error_code is None and self._page.error_code is not None:
  318. self._doc.error_code = self._page.error_code
  319. self._doc.add_child(self._page)
  320. def get_paragraphs(self):
  321. # 遍历段落
  322. paragraph_list = []
  323. for paragraph in self.docx.paragraphs:
  324. if paragraph.text != "":
  325. paragraph_list.append(paragraph.text)
  326. return paragraph_list
  327. def get_tables(self):
  328. # 遍历表
  329. table_list = read_xml_table(self.path, self.unique_type_dir)
  330. return table_list
  331. def get_images(self):
  332. # 顺序遍历图片
  333. image_list = []
  334. pattern = re.compile('rId\d+')
  335. for graph in self.docx.paragraphs:
  336. for run in graph.runs:
  337. if run.text == '':
  338. try:
  339. if not pattern.search(run.element.xml):
  340. continue
  341. content_id = pattern.search(run.element.xml).group(0)
  342. content_type = self.docx.part.related_parts[content_id].content_type
  343. except Exception as e:
  344. print("docx no image!", e)
  345. continue
  346. if not content_type.startswith('image'):
  347. continue
  348. img_data = self.docx.part.related_parts[content_id].blob
  349. if img_data is not None:
  350. image_list.append(img_data)
  351. return image_list
  352. def get_orders(self):
  353. # 解析document.xml,获取文字顺序
  354. order_and_text_list = read_xml_order(self.path, self.unique_type_dir)
  355. return order_and_text_list
  356. def get_doc_object(self):
  357. return self._doc
  358. def get_html(self):
  359. try:
  360. self.convert()
  361. except:
  362. traceback.print_exc()
  363. self._doc.error_code = [-1]
  364. if self._doc.error_code is not None:
  365. return self._doc.error_code
  366. return self._doc.get_html()