convert_docx.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
  5. import logging
  6. import re
  7. import traceback
  8. import xml
  9. import zipfile
  10. import docx
  11. import timeout_decorator
  12. from format_convert import get_memory_info
  13. from format_convert.convert_image import picture2text
  14. from format_convert.utils import judge_error_code, add_div
  15. @get_memory_info.memory_decorator
  16. def docx2text(path, unique_type_dir):
  17. logging.info("into docx2text")
  18. try:
  19. try:
  20. doc = docx.Document(path)
  21. except Exception as e:
  22. print("docx format error!", e)
  23. print(traceback.print_exc())
  24. logging.info("docx format error!")
  25. return [-3]
  26. # 遍历段落
  27. # print("docx2text extract paragraph")
  28. paragraph_text_list = []
  29. for paragraph in doc.paragraphs:
  30. if paragraph.text != "":
  31. paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
  32. # print("paragraph_text", paragraph.text)
  33. # 遍历表
  34. try:
  35. table_text_list = read_xml_table(path, unique_type_dir)
  36. except TimeoutError:
  37. return [-4]
  38. if judge_error_code(table_text_list):
  39. return table_text_list
  40. # 顺序遍历图片
  41. # print("docx2text extract image")
  42. image_text_list = []
  43. temp_image_path = unique_type_dir + "temp_image.png"
  44. pattern = re.compile('rId\d+')
  45. for graph in doc.paragraphs:
  46. for run in graph.runs:
  47. if run.text == '':
  48. try:
  49. if not pattern.search(run.element.xml):
  50. continue
  51. content_id = pattern.search(run.element.xml).group(0)
  52. content_type = doc.part.related_parts[content_id].content_type
  53. except Exception as e:
  54. print("docx no image!", e)
  55. continue
  56. if not content_type.startswith('image'):
  57. continue
  58. # 写入临时文件
  59. img_data = doc.part.related_parts[content_id].blob
  60. with open(temp_image_path, 'wb') as f:
  61. f.write(img_data)
  62. # if get_platform() == "Windows":
  63. # print("img_data", img_data)
  64. if img_data is None:
  65. continue
  66. # 识别图片文字
  67. image_text = picture2text(temp_image_path)
  68. if image_text == [-2]:
  69. return [-2]
  70. if image_text == [-1]:
  71. return [-1]
  72. if image_text == [-3]:
  73. continue
  74. image_text = image_text[0]
  75. image_text_list.append(add_div(image_text))
  76. # 解析document.xml,获取文字顺序
  77. order_list = read_xml_order(path, unique_type_dir)
  78. if order_list == [-2]:
  79. return [-2]
  80. if order_list == [-1]:
  81. return [-1]
  82. text = ""
  83. # print("len(order_list)", len(order_list))
  84. # print("len(paragraph_text_list)", len(paragraph_text_list))
  85. # print("len(image_text_list)", len(image_text_list))
  86. # print("len(table_text_list)", len(table_text_list))
  87. for tag in order_list:
  88. if tag == "w:t":
  89. if len(paragraph_text_list) > 0:
  90. text += paragraph_text_list.pop(0)
  91. if tag == "wp:docPr":
  92. if len(image_text_list) > 0:
  93. text += image_text_list.pop(0)
  94. if tag == "w:tbl":
  95. if len(table_text_list) > 0:
  96. text += table_text_list.pop(0)
  97. return [text]
  98. except Exception as e:
  99. logging.info("docx2text error!")
  100. print("docx2text", traceback.print_exc())
  101. return [-1]
  102. @get_memory_info.memory_decorator
  103. def read_xml_order(path, save_path):
  104. logging.info("into read_xml_order")
  105. try:
  106. try:
  107. f = zipfile.ZipFile(path)
  108. for file in f.namelist():
  109. if "word/document.xml" == str(file):
  110. f.extract(file, save_path)
  111. f.close()
  112. except Exception as e:
  113. logging.info("docx format error!")
  114. return [-3]
  115. try:
  116. collection = xml_analyze(save_path + "word/document.xml")
  117. except TimeoutError:
  118. logging.info("read_xml_order timeout")
  119. return [-4]
  120. body = collection.getElementsByTagName("w:body")[0]
  121. order_list = []
  122. for line in body.childNodes:
  123. # print(str(line))
  124. if "w:p" in str(line):
  125. text = line.getElementsByTagName("w:t")
  126. picture = line.getElementsByTagName("wp:docPr")
  127. if text:
  128. order_list.append("w:t")
  129. if picture:
  130. order_list.append("wp:docPr")
  131. for line1 in line.childNodes:
  132. if "w:r" in str(line1):
  133. # print("read_xml_order", "w:r")
  134. picture1 = line1.getElementsByTagName("w:pict")
  135. if picture1:
  136. order_list.append("wp:docPr")
  137. if "w:tbl" in str(line):
  138. order_list.append("w:tbl")
  139. read_xml_table(path, save_path)
  140. return order_list
  141. except Exception as e:
  142. logging.info("read_xml_order error!")
  143. print("read_xml_order", traceback.print_exc())
  144. # log_traceback("read_xml_order")
  145. return [-1]
  146. @get_memory_info.memory_decorator
  147. def read_xml_table(path, save_path):
  148. logging.info("into read_xml_table")
  149. try:
  150. try:
  151. f = zipfile.ZipFile(path)
  152. for file in f.namelist():
  153. if "word/document.xml" == str(file):
  154. f.extract(file, save_path)
  155. f.close()
  156. except Exception as e:
  157. # print("docx format error!", e)
  158. logging.info("docx format error!")
  159. return [-3]
  160. try:
  161. collection = xml_analyze(save_path + "word/document.xml")
  162. except TimeoutError:
  163. logging.info("read_xml_table timeout")
  164. return [-4]
  165. body = collection.getElementsByTagName("w:body")[0]
  166. table_text_list = []
  167. # print("body.childNodes", body.childNodes)
  168. for line in body.childNodes:
  169. if "w:tbl" in str(line):
  170. # print("str(line)", str(line))
  171. table_text = '<table border="1">' + "\n"
  172. tr_list = line.getElementsByTagName("w:tr")
  173. # print("line.childNodes", line.childNodes)
  174. tr_index = 0
  175. tr_text_list = []
  176. tr_text_list_colspan = []
  177. for tr in tr_list:
  178. table_text = table_text + "<tr rowspan=1>" + "\n"
  179. tc_list = tr.getElementsByTagName("w:tc")
  180. tc_index = 0
  181. tc_text_list = []
  182. for tc in tc_list:
  183. tc_text = ""
  184. # 获取一格占多少列
  185. col_span = tc.getElementsByTagName("w:gridSpan")
  186. if col_span:
  187. col_span = int(col_span[0].getAttribute("w:val"))
  188. else:
  189. col_span = 1
  190. # 获取是否是合并单元格的下一个空单元格
  191. is_merge = tc.getElementsByTagName("w:vMerge")
  192. if is_merge:
  193. is_merge = is_merge[0].getAttribute("w:val")
  194. if is_merge == "continue":
  195. col_span_index = 0
  196. real_tc_index = 0
  197. # if get_platform() == "Windows":
  198. # print("read_xml_table tr_text_list", tr_text_list)
  199. # print("read_xml_table tr_index", tr_index)
  200. if 0 <= tr_index - 1 < len(tr_text_list):
  201. for tc_colspan in tr_text_list[tr_index - 1]:
  202. if col_span_index < tc_index:
  203. col_span_index += tc_colspan[1]
  204. real_tc_index += 1
  205. # print("tr_index-1, real_tc_index", tr_index-1, real_tc_index)
  206. # print(tr_text_list[tr_index-1])
  207. if real_tc_index < len(tr_text_list[tr_index - 1]):
  208. tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
  209. table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
  210. p_list = tc.getElementsByTagName("w:p")
  211. for p in p_list:
  212. t = p.getElementsByTagName("w:t")
  213. if t:
  214. for tt in t:
  215. # print("tt", tt.childNodes)
  216. if len(tt.childNodes) > 0:
  217. tc_text += tt.childNodes[0].nodeValue
  218. tc_text += "\n"
  219. table_text = table_text + tc_text + "</td>" + "\n"
  220. tc_index += 1
  221. tc_text_list.append([tc_text, col_span])
  222. table_text += "</tr>" + "\n"
  223. tr_index += 1
  224. tr_text_list.append(tc_text_list)
  225. table_text += "</table>" + "\n"
  226. table_text_list.append(table_text)
  227. return table_text_list
  228. except Exception as e:
  229. logging.info("read_xml_table error")
  230. print("read_xml_table", traceback.print_exc())
  231. return [-1]
  232. @get_memory_info.memory_decorator
  233. @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
  234. def xml_analyze(path):
  235. # 解析xml
  236. DOMTree = xml.dom.minidom.parse(path)
  237. collection = DOMTree.documentElement
  238. return collection
  239. def read_docx_table(document):
  240. table_text_list = []
  241. for table in document.tables:
  242. table_text = "<table>\n"
  243. # print("==================")
  244. for row in table.rows:
  245. table_text += "<tr>\n"
  246. for cell in row.cells:
  247. table_text += "<td>" + cell.text + "</td>\n"
  248. table_text += "</tr>\n"
  249. table_text += "</table>\n"
  250. # print(table_text)
  251. table_text_list.append(table_text)
  252. return table_text_list
  253. class DocxConvert:
  254. def __init__(self, path, unique_type_dir):
  255. self._doc = _Document(path)
  256. self.path = path
  257. self.unique_type_dir = unique_type_dir
  258. def init_package(self, package_name):
  259. # 各个包初始化
  260. try:
  261. self.docx = docx.Document(self.path)
  262. self.zip = zipfile.ZipFile(self.path)
  263. except:
  264. logging.info(package_name + " cannot open docx!")
  265. traceback.print_exc()
  266. self._doc.error_code = [-3]
  267. def convert(self):
  268. self.init_package("docx")
  269. if self._doc.error_code is not None:
  270. return
  271. order_list = self.get_orders()
  272. if judge_error_code(order_list):
  273. self._doc.error_code = order_list
  274. return
  275. table_list = self.get_tables()
  276. if judge_error_code(table_list):
  277. self._doc.error_code = table_list
  278. return
  279. paragraph_list = self.get_paragraphs()
  280. image_list = self.get_images()
  281. self._page = _Page(None, 0)
  282. order_y = 0
  283. for tag in order_list:
  284. bbox = (0, order_y, 0, 0)
  285. if tag == "w:t":
  286. if len(paragraph_list) > 0:
  287. _para = paragraph_list.pop(0)
  288. self._page.add_child(_Sentence(_para, bbox))
  289. if tag == "wp:docPr":
  290. if len(image_list) > 0:
  291. _image = image_list.pop(0)
  292. self._page.add_child(_Image(_image, bbox))
  293. if tag == "w:tbl":
  294. if len(table_list) > 0:
  295. _table = table_list.pop(0)
  296. self._page.add_child(_Table(_table, bbox))
  297. order_y += 1
  298. if self._doc.error_code is None and self._page.error_code is not None:
  299. self._doc.error_code = self._page.error_code
  300. self._doc.add_child(self._page)
  301. def get_paragraphs(self):
  302. # 遍历段落
  303. paragraph_list = []
  304. for paragraph in self.docx.paragraphs:
  305. if paragraph.text != "":
  306. paragraph_list.append(paragraph.text)
  307. return paragraph_list
  308. def get_tables(self):
  309. # 遍历表
  310. table_list = read_xml_table(self.path, self.unique_type_dir)
  311. return table_list
  312. def get_images(self):
  313. # 顺序遍历图片
  314. image_list = []
  315. pattern = re.compile('rId\d+')
  316. for graph in self.docx.paragraphs:
  317. for run in graph.runs:
  318. if run.text == '':
  319. try:
  320. if not pattern.search(run.element.xml):
  321. continue
  322. content_id = pattern.search(run.element.xml).group(0)
  323. content_type = self.docx.part.related_parts[content_id].content_type
  324. except Exception as e:
  325. print("docx no image!", e)
  326. continue
  327. if not content_type.startswith('image'):
  328. continue
  329. img_data = self.docx.part.related_parts[content_id].blob
  330. if img_data is not None:
  331. image_list.append(img_data)
  332. return image_list
  333. def get_orders(self):
  334. # 解析document.xml,获取文字顺序
  335. order_list = read_xml_order(self.path, self.unique_type_dir)
  336. return order_list
  337. def get_doc_object(self):
  338. return self._doc
  339. def get_html(self):
  340. self.convert()
  341. if self._doc.error_code is not None:
  342. return self._doc.error_code
  343. return self._doc.get_html()