convert_docx.py

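"""
convert_docx.py

Convert a .docx file into the project's internal document tree (_Document /
_Page / _Sentence / _Image / _Table) and render it as HTML. The file is
unzipped, then word/document.xml, word/numbering.xml and
word/_rels/document.xml.rels are parsed, and paragraphs, list numbering,
tables and embedded images are read in document order.
"""
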
import os
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
import re
import traceback
import xml.dom.minidom
import xml.etree.ElementTree
import zipfile
import docx
from bs4 import BeautifulSoup
from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
from format_convert.wrapt_timeout_decorator import timeout
from format_convert.convert_image import ImageConvert


def docx2text():
    return


def read_rel_image(document_xml_rels):
    if not document_xml_rels:
        return {}
    # Collect the Id -> Target relations from the relationship file
    image_rel_dict = {}
    for rel in document_xml_rels:
        if 'Relationship' in str(rel):
            _id = rel.get("Id")
            _target = rel.get("Target")
            _type = rel.get("Type")
            if 'image' in _type:
                image_rel_dict[_id] = _target
    return image_rel_dict
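

# word/numbering.xml defines list numbering in two layers: each w:num maps a
# real w:numId (referenced from paragraphs) to a w:abstractNumId, and each
# w:abstractNum holds the per-level (w:ilvl) definitions such as the start
# value (w:start), the display pattern (w:lvlText, e.g. "%1.%2.") and the
# number format (w:numFmt). read_no_start() below flattens this into
# {numId: {level: start}} and {numId: {level: lvlText}} dictionaries.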
def read_no_start(numbering_xml):
    """
    Read the start value of each numbering group.
    :return:
    """
    if not numbering_xml:
        return {}, {}
    # Get the abstract-id -> real-id mapping
    w_num_list = numbering_xml.getElementsByTagName("w:num")
    abstract_real_id_dict = {}
    for w_num in w_num_list:
        w_num_id = w_num.getAttribute("w:numId")
        w_abstract_num_id = w_num.getElementsByTagName('w:abstractNumId')[0].getAttribute("w:val")
        abstract_real_id_dict[w_abstract_num_id] = w_num_id
    # Get the start number of each level for every abstract id
    w_abstract_num_list = numbering_xml.getElementsByTagName("w:abstractNum")
    abstract_id_level_dict = {}
    abstract_id_level_text_dict = {}
    for w_abstract_num in w_abstract_num_list:
        w_abstract_num_id = w_abstract_num.getAttribute("w:abstractNumId")
        w_lvl_list = w_abstract_num.getElementsByTagName("w:lvl")
        level_start_dict = {}
        level_text_dict = {}
        for w_lvl in w_lvl_list:
            w_ilvl_value = w_lvl.getAttribute('w:ilvl')
            if w_lvl.getElementsByTagName("w:start"):
                w_ilvl_start_num = w_lvl.getElementsByTagName("w:start")[0].getAttribute("w:val")
                level_start_dict[int(w_ilvl_value)] = int(w_ilvl_start_num)
            if w_lvl.getElementsByTagName("w:lvlText") and w_lvl.getElementsByTagName("w:numFmt"):
                w_lvl_text = w_lvl.getElementsByTagName("w:lvlText")[0].getAttribute("w:val")
                w_lvl_format = w_lvl.getElementsByTagName("w:numFmt")[0].getAttribute("w:val")
                if w_lvl_format == 'upperLetter':
                    w_lvl_text = re.sub(r'%\d', '%A', w_lvl_text)
                elif w_lvl_format == 'lowerLetter':
                    w_lvl_text = re.sub(r'%\d', '%a', w_lvl_text)
                level_text_dict[int(w_ilvl_value)] = w_lvl_text
        abstract_id_level_dict[w_abstract_num_id] = level_start_dict
        abstract_id_level_text_dict[w_abstract_num_id] = level_text_dict
    # Map back to the real ids
    real_id_level_start_dict = {}
    for abstract_id in abstract_real_id_dict.keys():
        real_id = abstract_real_id_dict.get(abstract_id)
        level_start_dict = abstract_id_level_dict.get(abstract_id)
        if level_start_dict:
            real_id_level_start_dict[int(real_id)] = level_start_dict
    real_id_level_text_dict = {}
    for abstract_id in abstract_real_id_dict.keys():
        real_id = abstract_real_id_dict.get(abstract_id)
        level_text_dict = abstract_id_level_text_dict.get(abstract_id)
        if level_text_dict:
            real_id_level_text_dict[int(real_id)] = level_text_dict
    return real_id_level_start_dict, real_id_level_text_dict
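

# For illustration only (hypothetical values): for a two-level list whose
# numId is 1, read_no_start() might return
#     {1: {0: 1, 1: 1}}              # start value per level
#     {1: {0: '%1.', 1: '%1.%2.'}}   # display pattern per level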
def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numbering_xml, document_xml_rels,
                is_sdt=False):
    """
    Read the text under a w:p node, including its list numbering.
    :param unique_type_dir:
    :param p_node:
    :param _last_node_level:
    :param _num_pr_dict:
    :param numbering_xml:
    :param document_xml_rels:
    :param is_sdt:
    :return:
    """
    _text_list = []
    _order_list = []
    # Numbering prefix of the text (if the paragraph is numbered)
    text_no = ''
    # Get the start value and display pattern of every numbering group
    id_level_start_dict, id_level_text_dict = read_no_start(numbering_xml)
    # print('_num_pr_dict', _num_pr_dict)
    # Extract the numbering: group -> level -> index
    num_pr = p_node.getElementsByTagName("w:numPr")
    if num_pr:
        num_pr = num_pr[0]
        if num_pr.getElementsByTagName("w:numId"):
            group_id = int(num_pr.getElementsByTagName("w:numId")[0].getAttribute("w:val"))
            if group_id >= 1:
                node_level = num_pr.getElementsByTagName("w:ilvl")
                if node_level:
                    node_level = int(node_level[0].getAttribute("w:val"))
                    # print('group_id', group_id, 'node_level', node_level, 'last_node_level', _last_node_level)
                    if group_id in _num_pr_dict.keys():
                        if node_level == 0 and node_level not in _num_pr_dict[group_id].keys():
                            _num_pr_dict[group_id][node_level] = 1
                        if _last_node_level != 0 and node_level < _last_node_level:
                            # print('reset', 'group_id', group_id, 'last_node_level', last_node_level)
                            # Reset every level between node_level and last_node_level
                            for l in range(node_level + 1, _last_node_level + 1):
                                _num_pr_dict[group_id][l] = 0
                            if _num_pr_dict[group_id].get(node_level):
                                _num_pr_dict[group_id][node_level] += 1
                            else:
                                pass
                            # print('group_id, node_level', group_id, node_level)
                        elif node_level in _num_pr_dict[group_id].keys():
                            _num_pr_dict[group_id][node_level] += 1
                        else:
                            _num_pr_dict[group_id][node_level] = 1
                    else:
                        _num_pr_dict[group_id] = {node_level: 1}
                    # print(num_pr_dict[group_id])
                    for level in range(node_level + 1):
                        # How many nodes there are at the current level
                        if level not in _num_pr_dict[group_id]:
                            if level not in id_level_start_dict[group_id]:
                                continue
                            else:
                                level_node_cnt = id_level_start_dict[group_id][level]
                        else:
                            level_node_cnt = _num_pr_dict[group_id][level]
                        if id_level_start_dict.get(group_id) and id_level_start_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
                            start_no = id_level_start_dict.get(group_id).get(level)
                            level_node_cnt += start_no - 1
                        level_text = None
                        if id_level_text_dict.get(group_id) and id_level_text_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
                            level_text = id_level_text_dict.get(group_id).get(level)
                        # print('level_node_cnt', level_node_cnt)
                        if level_text:
                            if re.search('a', level_text):
                                level_node_cnt = chr(ord('a') + level_node_cnt - 1)
                                text_no += re.sub('%a', str(level_node_cnt), level_text)
                            elif re.search('A', level_text):
                                level_node_cnt = chr(ord('A') + level_node_cnt - 1)
                                text_no += re.sub('%A', str(level_node_cnt), level_text)
                            else:
                                text_no += re.sub(r'%\d', str(level_node_cnt), level_text)
                        else:
                            text_no += str(level_node_cnt) + '.'
                    # print('text_no', text_no)
                    _last_node_level = node_level

    # text = p_node.getElementsByTagName("w:t")
    # picture = p_node.getElementsByTagName("wp:docPr")
    # if text:
    #     _order_list.append("w:t")
    #     temp_text = ""
    #     if is_sdt and len(text) == 2:
    #         if len(text[0].childNodes) > 0 and len(text[1].childNodes) > 0:
    #             temp_text += text[0].childNodes[0].nodeValue + '.'*20 + text[1].childNodes[0].nodeValue
    #     else:
    #         for t in text:
    #             if len(t.childNodes) > 0:
    #                 temp_text += t.childNodes[0].nodeValue
    #             else:
    #                 continue
    #     if text_no:
    #         temp_text = text_no + ' ' + temp_text
    #     _text_list.append(temp_text)
    # # Numbering only
    # elif len(text_no) >= 2:
    #     _text_list.append(text_no[:-1])
    #
    # if picture:
    #     _order_list.append("wp:docPr")
    #
    # for line1 in p_node.childNodes:
    #     if "w:r" in str(line1):
    #         picture1 = line1.getElementsByTagName("w:pict")
    #         if picture1:
    #             _order_list.append("wp:docPr")

    p_node_text = ''
    has_html = False
    # Prepend the numbering first
    if text_no:
        p_node_text += text_no
    text = p_node.getElementsByTagName("w:t")
    # Table-of-contents pages are generated specially
    if is_sdt and len(text) == 2:
        p_node_text += text[0].childNodes[0].nodeValue + '.'*20 + text[1].childNodes[0].nodeValue
    # Normal pages
    else:
        image_rel_dict = read_rel_image(document_xml_rels)
        p_node_all = p_node.getElementsByTagName("*")
        for node in p_node_all:
            # Text
            if "w:t" in str(node).split(' '):
                if node.childNodes:
                    p_node_text += node.childNodes[0].nodeValue
            # Images are recognized here directly instead of being added to the Page as Image objects
            elif "a:blip" in str(node).split(' '):
                _id = node.getAttribute("r:embed")
                image_path = image_rel_dict.get(_id)
                if image_path:
                    image_path = unique_type_dir + 'word/' + image_path
                    image_convert = ImageConvert(image_path, '')
                    image_html = image_convert.get_html()[0]
                    if isinstance(image_html, int):
                        image_html = ''
                    p_node_text += image_html
                    has_html = True
    # Numbering only
    if len(p_node_text) > 0 and p_node_text == text_no:
        p_node_text = p_node_text[:-1]
    _text_list.append(p_node_text)
    if has_html:
        _order_list.append('w:t html')
    else:
        _order_list.append('w:t')
    return _text_list, _order_list, _num_pr_dict, _last_node_level
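

# read_xml_order() walks w:body in document order and returns [order_list,
# text_list]: order_list holds one tag per block ('w:t', 'w:t html' when the
# paragraph already contains rendered image html, or 'w:tbl'), while text_list
# holds the paragraph texts. convert() later pairs these tags with the texts,
# tables and images to rebuild the page in order.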
@timeout(50, timeout_exception=TimeoutError)
def read_xml_order(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
    log("into read_xml_order")
    try:
        body = document_xml.getElementsByTagName("w:body")[0]
        order_list = []
        text_list = []
        # Bookkeeping for numbering groups
        num_pr_dict = {}
        last_node_level = 0
        for line in body.childNodes:
            # Ordinary paragraph text
            if "w:p" in str(line):
                t_list, o_list, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
                                                                           line,
                                                                           last_node_level,
                                                                           num_pr_dict,
                                                                           numbering_xml,
                                                                           document_xml_rels)
                text_list += t_list
                order_list += o_list
            # Table of contents
            elif "w:sdt" in str(line):
                sdt = line
                for sdt_child in sdt.childNodes:
                    if "w:sdtContent" in str(sdt_child):
                        sdt_content = sdt_child
                        for sdt_content_child in sdt_content.childNodes:
                            if 'w:p' in str(sdt_content_child):
                                t_list, o_list, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
                                                                                           sdt_content_child,
                                                                                           last_node_level,
                                                                                           num_pr_dict,
                                                                                           numbering_xml,
                                                                                           document_xml_rels,
                                                                                           is_sdt=True)
                                text_list += t_list
                                order_list += o_list
            elif "w:tbl" in str(line):
                order_list.append("w:tbl")
                # read_xml_table(path, save_path)
        return [order_list, text_list]
    except Exception as e:
        log("read_xml_order error!")
        traceback.print_exc()
        return [-1]


@timeout(50, timeout_exception=TimeoutError)
def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
    def recursion_read_table(table):
        table_text = '<table border="1">'
        tr_index = 0
        tr_text_list = []
        last_node_level = 0
        num_pr_dict = {}
        # Direct children are referred to as child, all descendants as all
        for table_child in table.childNodes:
            if 'w:tr' in str(table_child):
                table_text += "<tr>"
                tr = table_child
                tr_child_nodes = tr.childNodes
                tc_index = 0
                tc_text_list = []
                for tr_child in tr_child_nodes:
                    if 'w:tc' in str(tr_child).split(' '):
                        tc_text = ""
                        tc = tr_child
                        # How many columns this cell occupies, i.e. colspan
                        col_span = tc.getElementsByTagName("w:gridSpan")
                        if col_span:
                            col_span = int(col_span[0].getAttribute("w:val"))
                        else:
                            col_span = 1
                        # Whether this is the following empty cell of a merged cell, i.e. rowspan
                        is_merge = tc.getElementsByTagName("w:vMerge")
                        if is_merge:
                            is_merge = is_merge[0].getAttribute("w:val")
                            if is_merge == "continue":
                                col_span_index = 0
                                real_tc_index = 0
                                if 0 <= tr_index - 1 < len(tr_text_list):
                                    for tc_colspan in tr_text_list[tr_index - 1]:
                                        if col_span_index < tc_index:
                                            col_span_index += tc_colspan[1]
                                            real_tc_index += 1
                                    if real_tc_index < len(tr_text_list[tr_index - 1]):
                                        tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
                        # Set the colspan
                        table_text = table_text + "<td colspan=" + str(col_span) + ">"
                        # Fill in the text
                        tc_child_nodes = tc.childNodes
                        for tc_child in tc_child_nodes:
                            if 'w:tbl' in str(tc_child).split(' '):
                                # A table nested inside this tc
                                tc_text += recursion_read_table(tc_child)
                            if 'w:p' in str(tc_child).split(' '):
                                tc_p_all_nodes = tc_child.getElementsByTagName("*")
                                _t_list, _, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
                                                                                       tc_child,
                                                                                       last_node_level,
                                                                                       num_pr_dict,
                                                                                       numbering_xml,
                                                                                       document_xml_rels)
                                # print('_t_list', _t_list)
                                tc_text += ''.join(_t_list)
                                # for tc_p_all in tc_p_all_nodes:
                                #     if 'w:t' in str(tc_p_all).split(' '):
                                #         # w:t needs childNodes[0] to read its text
                                #         tc_text += tc_p_all.childNodes[0].nodeValue
                        # Close this tc
                        table_text = table_text + tc_text + "</td>"
                        tc_index += 1
                        tc_text_list.append([tc_text, col_span])
                # Close this tr
                table_text += "</tr>"
                tr_index += 1
                tr_text_list.append(tc_text_list)
        # Close this table
        table_text += "</table>"
        return table_text

    log("into read_xml_table")
    try:
        body = document_xml.getElementsByTagName("w:body")[0]
        table_text_list = []
        body_nodes = body.childNodes
        for node in body_nodes:
            if 'w:tbl' in str(node).split(' '):
                _table = node
                _table_text = recursion_read_table(_table)
                table_text_list.append(_table_text)
        return table_text_list
    except Exception as e:
        log("read_xml_table error")
        print("read_xml_table", traceback.print_exc())
        return [-1]
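

# Two different XML parsers are used: document.xml and numbering.xml are read
# with xml.dom.minidom, because the code above relies on getElementsByTagName
# with namespace-prefixed tag names (e.g. "w:p"), while document.xml.rels is
# read with xml.etree.ElementTree, whose Element.get() is convenient for the
# Relationship attributes (Id / Target / Type).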
@timeout(25, timeout_exception=TimeoutError)
def parse_xml(path):
    # Parse the xml with minidom
    DOMTree = xml.dom.minidom.parse(path)
    collection = DOMTree.documentElement
    return collection


@timeout(25, timeout_exception=TimeoutError)
def parse_xml2(path):
    # Parse the xml with ElementTree
    tree = xml.etree.ElementTree.parse(path)
    root = tree.getroot()
    return root
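

# DocxConvert ties everything together: it unzips the docx, parses the xml
# parts above, and convert() assembles a single _Page of _Sentence, _Image and
# _Table children on the _Document, which get_html() finally renders. Files
# that are really html saved with a .doc/.docx extension are detected first
# and read through BeautifulSoup instead.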
class DocxConvert:
    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # Unzip the docx
        try:
            f = zipfile.ZipFile(path)
            for file in f.namelist():
                if "word/" in str(file):
                    f.extract(file, self.unique_type_dir)
            f.close()
        except Exception as e:
            log("docx format error!")
            self._doc.error_code = [-3]
        # Read the contents
        try:
            self.document_xml = parse_xml(self.unique_type_dir + "word/document.xml")
            if os.path.exists(self.unique_type_dir + "word/numbering.xml"):
                self.numbering_xml = parse_xml(self.unique_type_dir + "word/numbering.xml")
            else:
                self.numbering_xml = []
            if os.path.exists(self.unique_type_dir + "word/_rels/document.xml.rels"):
                self.document_xml_rels = parse_xml2(self.unique_type_dir + "word/_rels/document.xml.rels")
            else:
                self.document_xml_rels = []
        except FileNotFoundError:
            # The unzipped file is missing, fall back to reading the file as html
            log('FileNotFoundError')
            self._doc.error_code = None
        except TimeoutError:
            log("parse_xml timeout")
            self._doc.error_code = [-4]

    @memory_decorator
    def init_package(self):
        # Initialize the third-party packages
        try:
            self.docx = docx.Document(self.path)
            self.zip = zipfile.ZipFile(self.path)
        except:
            log("cannot open docx!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        self._page = _Page(None, 0)
        # First check for special doc files that are actually html text
        is_html_doc = False
        try:
            with open(self.path, 'r') as f:
                html_str = f.read()
                if re.search('<div|<html|<body|<head|<tr|<br|<table|<td', html_str):
                    soup = BeautifulSoup(html_str, 'lxml')
                    text = soup.text
                    is_html_doc = True
        except:
            pass
        if is_html_doc:
            _sen = _Sentence(text, (0, 0, 0, 0))
            self._page.add_child(_sen)
            self._doc.add_child(self._page)
            return

        self.init_package()
        if self._doc.error_code is not None:
            return
        order_and_text_list = self.get_orders()
        if judge_error_code(order_and_text_list):
            self._doc.error_code = order_and_text_list
            return
        order_list, text_list = order_and_text_list

        # Garbled text is reported as a file format error
        match1 = re.findall(get_garble_code(), ''.join(text_list))
        if len(match1) > 10:
            log("doc/docx garbled code!")
            # self._doc.error_code = [-3]
            _sen = _Sentence('文件乱码!', (0, 0, 0, 0))
            self._page.add_child(_sen)
            self._doc.add_child(self._page)
            return

        # test
        # for i in range(len(text_list)):
        #     print(order_list[i], text_list[i])

        table_list = self.get_tables()
        if judge_error_code(table_list):
            self._doc.error_code = table_list
            return

        # paragraph_list = self.get_paragraphs()
        image_list = self.get_images()

        order_y = 0
        doc_pr_cnt = 0
        for tag in order_list:
            bbox = (0, order_y, 0, 0)
            if tag == "w:t html":
                if len(text_list) > 0:
                    _para = text_list.pop(0)
                    _sen = _Sentence(_para, bbox)
                    _sen.combine = False
                    _sen.is_html = True
                    self._page.add_child(_sen)
            if tag == "w:t":
                if len(text_list) > 0:
                    _para = text_list.pop(0)
                    _sen = _Sentence(_para, bbox)
                    _sen.combine = False
                    self._page.add_child(_sen)
            if tag == "wp:docPr":
                if len(image_list) > 0:
                    temp_image_path = self.unique_type_dir + "docpr" + str(doc_pr_cnt) + ".png"
                    _image = image_list.pop(0)
                    with open(temp_image_path, "wb") as f:
                        f.write(_image)
                    _img = _Image(_image, temp_image_path, bbox)
                    _img.is_from_docx = True
                    self._page.add_child(_img)
                    doc_pr_cnt += 1
            if tag == "w:tbl":
                if len(table_list) > 0:
                    _table = table_list.pop(0)
                    _table = _Table(_table, bbox)
                    _table.is_html = True
                    self._page.add_child(_table)
            order_y += 1

        if self._doc.error_code is None and self._page.error_code is not None:
            self._doc.error_code = self._page.error_code
        self._doc.add_child(self._page)

    @memory_decorator
    def get_tables(self):
        # Walk the tables
        table_list = read_xml_table(self.unique_type_dir, self.document_xml, self.numbering_xml, self.document_xml_rels)
        return table_list
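
    # get_images() relies on python-docx: runs whose xml references an image
    # relationship id (r:embed "rIdN") are resolved through
    # self.docx.part.related_parts, and the binary payload comes from the
    # related image part's .blob.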
    def get_images(self):
        # Walk the images in document order
        image_list = []
        pattern = re.compile(r'rId\d+')
        for graph in self.docx.paragraphs:
            for run in graph.runs:
                if run.text == '':
                    try:
                        if not pattern.search(run.element.xml):
                            continue
                        content_id = pattern.search(run.element.xml).group(0)
                        content_type = self.docx.part.related_parts[content_id].content_type
                    except Exception as e:
                        print("docx no image!", e)
                        continue
                    if not content_type.startswith('image'):
                        continue
                    img_data = self.docx.part.related_parts[content_id].blob
                    if img_data is not None:
                        image_list.append(img_data)
        return image_list

    @memory_decorator
    def get_orders(self):
        # Parse document.xml to get the text order
        order_and_text_list = read_xml_order(self.unique_type_dir, self.document_xml, self.numbering_xml, self.document_xml_rels)
        return order_and_text_list

    def get_doc_object(self):
        return self._doc

    def get_html(self):
        if self._doc.error_code is not None:
            return self._doc.error_code
        try:
            self.convert()
        except:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()


if __name__ == '__main__':
    c = DocxConvert("C:/Users/Administrator/Downloads/1631944542835.docx", "C:/Users/Administrator/Downloads/1/")
    print(c.get_html())