convert_docx.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. import logging
  5. import re
  6. import traceback
  7. import xml
  8. import zipfile
  9. import docx
  10. import timeout_decorator
  11. from format_convert import get_memory_info
  12. from format_convert.convert_image import picture2text
  13. from format_convert.utils import judge_error_code, add_div
  14. @get_memory_info.memory_decorator
  15. def docx2text(path, unique_type_dir):
  16. logging.info("into docx2text")
  17. try:
  18. try:
  19. doc = docx.Document(path)
  20. except Exception as e:
  21. print("docx format error!", e)
  22. print(traceback.print_exc())
  23. logging.info("docx format error!")
  24. return [-3]
  25. # 遍历段落
  26. # print("docx2text extract paragraph")
  27. paragraph_text_list = []
  28. for paragraph in doc.paragraphs:
  29. if paragraph.text != "":
  30. paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
  31. # print("paragraph_text", paragraph.text)
  32. # 遍历表
  33. try:
  34. table_text_list = read_xml_table(path, unique_type_dir)
  35. except TimeoutError:
  36. return [-4]
  37. if judge_error_code(table_text_list):
  38. return table_text_list
  39. # 顺序遍历图片
  40. # print("docx2text extract image")
  41. image_text_list = []
  42. temp_image_path = unique_type_dir + "temp_image.png"
  43. pattern = re.compile('rId\d+')
  44. for graph in doc.paragraphs:
  45. for run in graph.runs:
  46. if run.text == '':
  47. try:
  48. if not pattern.search(run.element.xml):
  49. continue
  50. content_id = pattern.search(run.element.xml).group(0)
  51. content_type = doc.part.related_parts[content_id].content_type
  52. except Exception as e:
  53. print("docx no image!", e)
  54. continue
  55. if not content_type.startswith('image'):
  56. continue
  57. # 写入临时文件
  58. img_data = doc.part.related_parts[content_id].blob
  59. with open(temp_image_path, 'wb') as f:
  60. f.write(img_data)
  61. # if get_platform() == "Windows":
  62. # print("img_data", img_data)
  63. if img_data is None:
  64. continue
  65. # 识别图片文字
  66. image_text = picture2text(temp_image_path)
  67. if image_text == [-2]:
  68. return [-2]
  69. if image_text == [-1]:
  70. return [-1]
  71. if image_text == [-3]:
  72. continue
  73. image_text = image_text[0]
  74. image_text_list.append(add_div(image_text))
  75. # 解析document.xml,获取文字顺序
  76. order_list = read_xml_order(path, unique_type_dir)
  77. if order_list == [-2]:
  78. return [-2]
  79. if order_list == [-1]:
  80. return [-1]
  81. text = ""
  82. # print("len(order_list)", len(order_list))
  83. # print("len(paragraph_text_list)", len(paragraph_text_list))
  84. # print("len(image_text_list)", len(image_text_list))
  85. # print("len(table_text_list)", len(table_text_list))
  86. for tag in order_list:
  87. if tag == "w:t":
  88. if len(paragraph_text_list) > 0:
  89. text += paragraph_text_list.pop(0)
  90. if tag == "wp:docPr":
  91. if len(image_text_list) > 0:
  92. text += image_text_list.pop(0)
  93. if tag == "w:tbl":
  94. if len(table_text_list) > 0:
  95. text += table_text_list.pop(0)
  96. return [text]
  97. except Exception as e:
  98. logging.info("docx2text error!")
  99. print("docx2text", traceback.print_exc())
  100. return [-1]
  101. @get_memory_info.memory_decorator
  102. def read_xml_order(path, save_path):
  103. logging.info("into read_xml_order")
  104. try:
  105. try:
  106. f = zipfile.ZipFile(path)
  107. for file in f.namelist():
  108. if "word/document.xml" == str(file):
  109. f.extract(file, save_path)
  110. f.close()
  111. except Exception as e:
  112. logging.info("docx format error!")
  113. return [-3]
  114. try:
  115. collection = xml_analyze(save_path + "word/document.xml")
  116. except TimeoutError:
  117. logging.info("read_xml_order timeout")
  118. return [-4]
  119. body = collection.getElementsByTagName("w:body")[0]
  120. order_list = []
  121. for line in body.childNodes:
  122. # print(str(line))
  123. if "w:p" in str(line):
  124. text = line.getElementsByTagName("w:t")
  125. picture = line.getElementsByTagName("wp:docPr")
  126. if text:
  127. order_list.append("w:t")
  128. if picture:
  129. order_list.append("wp:docPr")
  130. for line1 in line.childNodes:
  131. if "w:r" in str(line1):
  132. # print("read_xml_order", "w:r")
  133. picture1 = line1.getElementsByTagName("w:pict")
  134. if picture1:
  135. order_list.append("wp:docPr")
  136. if "w:tbl" in str(line):
  137. order_list.append("w:tbl")
  138. read_xml_table(path, save_path)
  139. return order_list
  140. except Exception as e:
  141. logging.info("read_xml_order error!")
  142. print("read_xml_order", traceback.print_exc())
  143. # log_traceback("read_xml_order")
  144. return [-1]
  145. @get_memory_info.memory_decorator
  146. def read_xml_table(path, save_path):
  147. logging.info("into read_xml_table")
  148. try:
  149. try:
  150. f = zipfile.ZipFile(path)
  151. for file in f.namelist():
  152. if "word/document.xml" == str(file):
  153. f.extract(file, save_path)
  154. f.close()
  155. except Exception as e:
  156. # print("docx format error!", e)
  157. logging.info("docx format error!")
  158. return [-3]
  159. try:
  160. collection = xml_analyze(save_path + "word/document.xml")
  161. except TimeoutError:
  162. logging.info("read_xml_table timeout")
  163. return [-4]
  164. body = collection.getElementsByTagName("w:body")[0]
  165. table_text_list = []
  166. # print("body.childNodes", body.childNodes)
  167. for line in body.childNodes:
  168. if "w:tbl" in str(line):
  169. # print("str(line)", str(line))
  170. table_text = '<table border="1">' + "\n"
  171. tr_list = line.getElementsByTagName("w:tr")
  172. # print("line.childNodes", line.childNodes)
  173. tr_index = 0
  174. tr_text_list = []
  175. tr_text_list_colspan = []
  176. for tr in tr_list:
  177. table_text = table_text + "<tr rowspan=1>" + "\n"
  178. tc_list = tr.getElementsByTagName("w:tc")
  179. tc_index = 0
  180. tc_text_list = []
  181. for tc in tc_list:
  182. tc_text = ""
  183. # 获取一格占多少列
  184. col_span = tc.getElementsByTagName("w:gridSpan")
  185. if col_span:
  186. col_span = int(col_span[0].getAttribute("w:val"))
  187. else:
  188. col_span = 1
  189. # 获取是否是合并单元格的下一个空单元格
  190. is_merge = tc.getElementsByTagName("w:vMerge")
  191. if is_merge:
  192. is_merge = is_merge[0].getAttribute("w:val")
  193. if is_merge == "continue":
  194. col_span_index = 0
  195. real_tc_index = 0
  196. # if get_platform() == "Windows":
  197. # print("read_xml_table tr_text_list", tr_text_list)
  198. # print("read_xml_table tr_index", tr_index)
  199. if 0 <= tr_index - 1 < len(tr_text_list):
  200. for tc_colspan in tr_text_list[tr_index - 1]:
  201. if col_span_index < tc_index:
  202. col_span_index += tc_colspan[1]
  203. real_tc_index += 1
  204. # print("tr_index-1, real_tc_index", tr_index-1, real_tc_index)
  205. # print(tr_text_list[tr_index-1])
  206. if real_tc_index < len(tr_text_list[tr_index - 1]):
  207. tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
  208. table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
  209. p_list = tc.getElementsByTagName("w:p")
  210. for p in p_list:
  211. t = p.getElementsByTagName("w:t")
  212. if t:
  213. for tt in t:
  214. # print("tt", tt.childNodes)
  215. if len(tt.childNodes) > 0:
  216. tc_text += tt.childNodes[0].nodeValue
  217. tc_text += "\n"
  218. table_text = table_text + tc_text + "</td>" + "\n"
  219. tc_index += 1
  220. tc_text_list.append([tc_text, col_span])
  221. table_text += "</tr>" + "\n"
  222. tr_index += 1
  223. tr_text_list.append(tc_text_list)
  224. table_text += "</table>" + "\n"
  225. table_text_list.append(table_text)
  226. return table_text_list
  227. except Exception as e:
  228. logging.info("read_xml_table error")
  229. print("read_xml_table", traceback.print_exc())
  230. return [-1]
  231. @get_memory_info.memory_decorator
  232. @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
  233. def xml_analyze(path):
  234. # 解析xml
  235. DOMTree = xml.dom.minidom.parse(path)
  236. collection = DOMTree.documentElement
  237. return collection
  238. def read_docx_table(document):
  239. table_text_list = []
  240. for table in document.tables:
  241. table_text = "<table>\n"
  242. # print("==================")
  243. for row in table.rows:
  244. table_text += "<tr>\n"
  245. for cell in row.cells:
  246. table_text += "<td>" + cell.text + "</td>\n"
  247. table_text += "</tr>\n"
  248. table_text += "</table>\n"
  249. # print(table_text)
  250. table_text_list.append(table_text)
  251. return table_text_list