convert_docx.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. import inspect
  2. import os
  3. import sys
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
  6. import logging
  7. import re
  8. import traceback
  9. import xml
  10. import zipfile
  11. import docx
  12. from format_convert.convert_image import picture2text
  13. from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
  14. from format_convert.wrapt_timeout_decorator import timeout
  15. @memory_decorator
  16. def docx2text(path, unique_type_dir):
  17. log("into docx2text")
  18. try:
  19. try:
  20. doc = docx.Document(path)
  21. except Exception as e:
  22. print("docx format error!", e)
  23. print(traceback.print_exc())
  24. log("docx format error!")
  25. return [-3]
  26. # 遍历段落
  27. # print("docx2text extract paragraph")
  28. paragraph_text_list = []
  29. for paragraph in doc.paragraphs:
  30. if paragraph.text != "":
  31. paragraph_text_list.append("<div>" + paragraph.text + "</div>" )
  32. # print("paragraph_text", paragraph.text)
  33. # 遍历表
  34. try:
  35. table_text_list = read_xml_table(path, unique_type_dir)
  36. except TimeoutError:
  37. return [-4]
  38. if judge_error_code(table_text_list):
  39. return table_text_list
  40. # 顺序遍历图片
  41. # print("docx2text extract image")
  42. image_text_list = []
  43. temp_image_path = unique_type_dir + "temp_image.png"
  44. pattern = re.compile('rId\d+')
  45. for graph in doc.paragraphs:
  46. for run in graph.runs:
  47. if run.text == '':
  48. try:
  49. if not pattern.search(run.element.xml):
  50. continue
  51. content_id = pattern.search(run.element.xml).group(0)
  52. content_type = doc.part.related_parts[content_id].content_type
  53. except Exception as e:
  54. print("docx no image!", e)
  55. continue
  56. if not content_type.startswith('image'):
  57. continue
  58. # 写入临时文件
  59. img_data = doc.part.related_parts[content_id].blob
  60. with open(temp_image_path, 'wb') as f:
  61. f.write(img_data)
  62. # if get_platform() == "Windows":
  63. # print("img_data", img_data)
  64. if img_data is None:
  65. continue
  66. # 识别图片文字
  67. image_text = picture2text(temp_image_path)
  68. if image_text == [-2]:
  69. return [-2]
  70. if image_text == [-1]:
  71. return [-1]
  72. if image_text == [-3]:
  73. continue
  74. image_text = image_text[0]
  75. image_text_list.append(add_div(image_text))
  76. # 解析document.xml,获取文字顺序
  77. order_list = read_xml_order(path, unique_type_dir)
  78. if order_list == [-2]:
  79. return [-2]
  80. if order_list == [-1]:
  81. return [-1]
  82. text = ""
  83. # print("len(order_list)", len(order_list))
  84. # print("len(paragraph_text_list)", len(paragraph_text_list))
  85. # print("len(image_text_list)", len(image_text_list))
  86. # print("len(table_text_list)", len(table_text_list))
  87. for tag in order_list:
  88. if tag == "w:t":
  89. if len(paragraph_text_list) > 0:
  90. text += paragraph_text_list.pop(0)
  91. if tag == "wp:docPr":
  92. if len(image_text_list) > 0:
  93. text += image_text_list.pop(0)
  94. if tag == "w:tbl":
  95. if len(table_text_list) > 0:
  96. text += table_text_list.pop(0)
  97. return [text]
  98. except Exception as e:
  99. log("docx2text error!")
  100. print("docx2text", traceback.print_exc())
  101. return [-1]
  102. @timeout(50, timeout_exception=TimeoutError)
  103. def read_xml_order(path, save_path):
  104. log("into read_xml_order")
  105. try:
  106. try:
  107. f = zipfile.ZipFile(path)
  108. for file in f.namelist():
  109. if "word/document.xml" == str(file):
  110. f.extract(file, save_path)
  111. f.close()
  112. except Exception as e:
  113. log("docx format error!")
  114. return [-3]
  115. try:
  116. collection = xml_analyze(save_path + "word/document.xml")
  117. except TimeoutError:
  118. log("xml_analyze timeout")
  119. return [-4]
  120. body = collection.getElementsByTagName("w:body")[0]
  121. order_list = []
  122. text_list = []
  123. # 编号组记录
  124. num_pr_dict = {}
  125. last_node_level = 0
  126. for line in body.childNodes:
  127. # print(str(line))
  128. if "w:p" in str(line):
  129. # 文本的编号(如果有编号的话)
  130. text_no = ''
  131. # 提取编号 组-层级-序号
  132. num_pr = line.getElementsByTagName("w:numPr")
  133. if num_pr:
  134. num_pr = num_pr[0]
  135. group_id = int(num_pr.getElementsByTagName("w:numId")[0].getAttribute("w:val"))
  136. if group_id >= 1:
  137. node_level = num_pr.getElementsByTagName("w:ilvl")
  138. if node_level:
  139. node_level = int(node_level[0].getAttribute("w:val"))
  140. # print('node_level', node_level, 'last_node_level', last_node_level)
  141. if group_id in num_pr_dict.keys():
  142. if last_node_level != 0 and node_level < last_node_level:
  143. # print('重置', 'group_id', group_id, 'last_node_level', last_node_level)
  144. # 需循环重置node_level到last_node_level之间的level
  145. for l in range(node_level+1, last_node_level+1):
  146. num_pr_dict[group_id][l] = 0
  147. num_pr_dict[group_id][node_level] += 1
  148. elif node_level in num_pr_dict[group_id].keys():
  149. num_pr_dict[group_id][node_level] += 1
  150. else:
  151. num_pr_dict[group_id][node_level] = 1
  152. else:
  153. num_pr_dict[group_id] = {node_level: 1}
  154. # print(num_pr_dict[group_id])
  155. for level in range(node_level+1):
  156. # 当前level下有多少个node
  157. level_node_cnt = num_pr_dict[group_id][level]
  158. # print('level_node_cnt', level_node_cnt)
  159. text_no += str(level_node_cnt) + '.'
  160. last_node_level = node_level
  161. # print('read_xml_order text_no', text_no)
  162. text = line.getElementsByTagName("w:t")
  163. picture = line.getElementsByTagName("wp:docPr")
  164. if text:
  165. order_list.append("w:t")
  166. temp_text = ""
  167. for t in text:
  168. if len(t.childNodes) > 0:
  169. temp_text += t.childNodes[0].nodeValue
  170. else:
  171. continue
  172. if text_no:
  173. temp_text = text_no + ' ' + temp_text
  174. text_list.append(temp_text)
  175. if picture:
  176. order_list.append("wp:docPr")
  177. for line1 in line.childNodes:
  178. if "w:r" in str(line1):
  179. # print("read_xml_order", "w:r")
  180. picture1 = line1.getElementsByTagName("w:pict")
  181. if picture1:
  182. order_list.append("wp:docPr")
  183. if "w:tbl" in str(line):
  184. order_list.append("w:tbl")
  185. read_xml_table(path, save_path)
  186. return [order_list, text_list]
  187. except Exception as e:
  188. log("read_xml_order error!")
  189. print("read_xml_order", traceback.print_exc())
  190. # log_traceback("read_xml_order")
  191. return [-1]
  192. @timeout(50, timeout_exception=TimeoutError)
  193. def read_xml_table(path, save_path):
  194. log("into read_xml_table")
  195. try:
  196. try:
  197. f = zipfile.ZipFile(path)
  198. for file in f.namelist():
  199. if "word/document.xml" == str(file):
  200. f.extract(file, save_path)
  201. f.close()
  202. except Exception as e:
  203. # print("docx format error!", e)
  204. log("docx format error!")
  205. return [-3]
  206. log("xml_analyze%s"%(save_path))
  207. try:
  208. collection = xml_analyze(save_path + "word/document.xml")
  209. except TimeoutError:
  210. log("xml_analyze timeout")
  211. return [-4]
  212. log("xml_analyze done")
  213. body = collection.getElementsByTagName("w:body")[0]
  214. table_text_list = []
  215. # print("body.childNodes", body.childNodes)
  216. for line in body.childNodes:
  217. if "w:tbl" in str(line):
  218. # print("str(line)", str(line))
  219. table_text = '<table border="1">'
  220. tr_list = line.getElementsByTagName("w:tr")
  221. # print("line.childNodes", line.childNodes)
  222. tr_index = 0
  223. tr_text_list = []
  224. tr_text_list_colspan = []
  225. for tr in tr_list:
  226. table_text = table_text + "<tr>"
  227. tc_list = tr.getElementsByTagName("w:tc")
  228. tc_index = 0
  229. tc_text_list = []
  230. for tc in tc_list:
  231. tc_text = ""
  232. # 获取一格占多少列
  233. col_span = tc.getElementsByTagName("w:gridSpan")
  234. if col_span:
  235. col_span = int(col_span[0].getAttribute("w:val"))
  236. else:
  237. col_span = 1
  238. # 获取是否是合并单元格的下一个空单元格
  239. is_merge = tc.getElementsByTagName("w:vMerge")
  240. if is_merge:
  241. is_merge = is_merge[0].getAttribute("w:val")
  242. if is_merge == "continue":
  243. col_span_index = 0
  244. real_tc_index = 0
  245. # if get_platform() == "Windows":
  246. # print("read_xml_table tr_text_list", tr_text_list)
  247. # print("read_xml_table tr_index", tr_index)
  248. if 0 <= tr_index - 1 < len(tr_text_list):
  249. for tc_colspan in tr_text_list[tr_index - 1]:
  250. if col_span_index < tc_index:
  251. col_span_index += tc_colspan[1]
  252. real_tc_index += 1
  253. # print("tr_index-1, real_tc_index", tr_index-1, real_tc_index)
  254. # print(tr_text_list[tr_index-1])
  255. if real_tc_index < len(tr_text_list[tr_index - 1]):
  256. tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
  257. table_text = table_text + "<td colspan=" + str(col_span) + ">"
  258. p_list = tc.getElementsByTagName("w:p")
  259. for p in p_list:
  260. t = p.getElementsByTagName("w:t")
  261. if t:
  262. for tt in t:
  263. # print("tt", tt.childNodes)
  264. if len(tt.childNodes) > 0:
  265. tc_text += tt.childNodes[0].nodeValue
  266. table_text = table_text + tc_text + "</td>"
  267. tc_index += 1
  268. tc_text_list.append([tc_text, col_span])
  269. table_text += "</tr>"
  270. tr_index += 1
  271. tr_text_list.append(tc_text_list)
  272. table_text += "</table>"
  273. table_text_list.append(table_text)
  274. return table_text_list
  275. except Exception as e:
  276. log("read_xml_table error")
  277. print("read_xml_table", traceback.print_exc())
  278. return [-1]
  279. @timeout(25, timeout_exception=TimeoutError)
  280. def xml_analyze(path):
  281. # 解析xml
  282. DOMTree = xml.dom.minidom.parse(path)
  283. collection = DOMTree.documentElement
  284. return collection
  285. def read_docx_table(document):
  286. table_text_list = []
  287. for table in document.tables:
  288. table_text = "<table>"
  289. # print("==================")
  290. for row in table.rows:
  291. table_text += "<tr>"
  292. for cell in row.cells:
  293. table_text += "<td>" + re.sub("\s","",str(cell.text)) + "</td>"
  294. table_text += "</tr>"
  295. table_text += "</table>"
  296. # print(table_text)
  297. table_text_list.append(table_text)
  298. return table_text_list
  299. class DocxConvert:
  300. def __init__(self, path, unique_type_dir):
  301. self._doc = _Document(path)
  302. self.path = path
  303. self.unique_type_dir = unique_type_dir
  304. @memory_decorator
  305. def init_package(self):
  306. # 各个包初始化
  307. try:
  308. self.docx = docx.Document(self.path)
  309. self.zip = zipfile.ZipFile(self.path)
  310. except:
  311. log("cannot open docx!")
  312. traceback.print_exc()
  313. self._doc.error_code = [-3]
  314. def convert(self):
  315. self.init_package()
  316. if self._doc.error_code is not None:
  317. return
  318. order_and_text_list = self.get_orders()
  319. if judge_error_code(order_and_text_list):
  320. self._doc.error_code = order_and_text_list
  321. return
  322. order_list, text_list = order_and_text_list
  323. self._page = _Page(None, 0)
  324. # 乱码返回文件格式错误
  325. match1 = re.findall(get_garble_code(), ''.join(text_list))
  326. if len(match1) > 10:
  327. log("doc/docx garbled code!")
  328. # self._doc.error_code = [-3]
  329. _sen = _Sentence('文件乱码!', (0, 0, 0, 0))
  330. self._page.add_child(_sen)
  331. self._doc.add_child(self._page)
  332. return
  333. # test
  334. # for i in range(len(text_list)):
  335. # print(order_list[i], text_list[i])
  336. table_list = self.get_tables()
  337. if judge_error_code(table_list):
  338. self._doc.error_code = table_list
  339. return
  340. # paragraph_list = self.get_paragraphs()
  341. image_list = self.get_images()
  342. order_y = 0
  343. doc_pr_cnt = 0
  344. for tag in order_list:
  345. bbox = (0, order_y, 0, 0)
  346. if tag == "w:t":
  347. if len(text_list) > 0:
  348. _para = text_list.pop(0)
  349. _sen = _Sentence(_para, bbox)
  350. _sen.combine=False
  351. self._page.add_child(_sen)
  352. if tag == "wp:docPr":
  353. if len(image_list) > 0:
  354. temp_image_path = self.unique_type_dir + "docpr" + str(doc_pr_cnt) + ".png"
  355. _image = image_list.pop(0)
  356. with open(temp_image_path, "wb") as f:
  357. f.write(_image)
  358. _img = _Image(_image, temp_image_path, bbox)
  359. _img.is_from_docx = True
  360. self._page.add_child(_img)
  361. doc_pr_cnt += 1
  362. if tag == "w:tbl":
  363. if len(table_list) > 0:
  364. _table = table_list.pop(0)
  365. _table = _Table(_table, bbox)
  366. _table.is_html = True
  367. self._page.add_child(_table)
  368. order_y += 1
  369. if self._doc.error_code is None and self._page.error_code is not None:
  370. self._doc.error_code = self._page.error_code
  371. self._doc.add_child(self._page)
  372. def get_paragraphs(self):
  373. # 遍历段落
  374. paragraph_list = []
  375. for paragraph in self.docx.paragraphs:
  376. if paragraph.text != "":
  377. paragraph_list.append(paragraph.text)
  378. return paragraph_list
  379. @memory_decorator
  380. def get_tables(self):
  381. # 遍历表
  382. table_list = read_xml_table(self.path, self.unique_type_dir)
  383. return table_list
  384. def get_images(self):
  385. # 顺序遍历图片
  386. image_list = []
  387. pattern = re.compile('rId\d+')
  388. for graph in self.docx.paragraphs:
  389. for run in graph.runs:
  390. if run.text == '':
  391. try:
  392. if not pattern.search(run.element.xml):
  393. continue
  394. content_id = pattern.search(run.element.xml).group(0)
  395. content_type = self.docx.part.related_parts[content_id].content_type
  396. except Exception as e:
  397. print("docx no image!", e)
  398. continue
  399. if not content_type.startswith('image'):
  400. continue
  401. img_data = self.docx.part.related_parts[content_id].blob
  402. if img_data is not None:
  403. image_list.append(img_data)
  404. return image_list
  405. @memory_decorator
  406. def get_orders(self):
  407. # 解析document.xml,获取文字顺序
  408. order_and_text_list = read_xml_order(self.path, self.unique_type_dir)
  409. return order_and_text_list
  410. def get_doc_object(self):
  411. return self._doc
  412. def get_html(self):
  413. try:
  414. self.convert()
  415. except:
  416. traceback.print_exc()
  417. self._doc.error_code = [-1]
  418. if self._doc.error_code is not None:
  419. return self._doc.error_code
  420. return self._doc.get_html()
  421. if __name__ == '__main__':
  422. c = DocxConvert("C:/Users/Administrator/Downloads/1631944542835.docx", "C:/Users/Administrator/Downloads/1/")
  423. print(c.get_html())