convert_docx.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799
  1. import os
  2. import sys
  3. from collections import defaultdict
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
  6. import re
  7. import traceback
  8. import xml
  9. import zipfile
  10. import docx
  11. from bs4 import BeautifulSoup
  12. from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
  13. from format_convert.wrapt_timeout_decorator import timeout
  14. from format_convert.convert_image import ImageConvert
  15. from format_convert.convert_need_interface import from_tika_interface
  16. def docx2text():
  17. return
  18. def read_rel_image(document_xml_rels):
  19. if not document_xml_rels:
  20. return {}
  21. # 获取映射文件里的关系 Id-Target
  22. image_rel_dict = {}
  23. for rel in document_xml_rels:
  24. if 'Relationship' in str(rel):
  25. _id = rel.get("Id")
  26. _target = rel.get("Target")
  27. _type = rel.get("Type")
  28. if 'image' in _type:
  29. image_rel_dict[_id] = _target
  30. return image_rel_dict
  31. def read_no_start(numbering_xml):
  32. """
  33. 读取编号组的起始值
  34. :return:
  35. """
  36. if not numbering_xml:
  37. return {}, {}
  38. # 获取虚拟-真实id映射关系
  39. w_num_list = numbering_xml.getElementsByTagName("w:num")
  40. abstract_real_id_dict = {}
  41. for w_num in w_num_list:
  42. w_num_id = w_num.getAttribute("w:numId")
  43. w_abstract_num_id = w_num.getElementsByTagName('w:abstractNumId')[0].getAttribute("w:val")
  44. abstract_real_id_dict[w_abstract_num_id] = w_num_id
  45. # 获取虚拟id的开始编号
  46. w_abstract_num_list = numbering_xml.getElementsByTagName("w:abstractNum")
  47. abstract_id_level_dict = {}
  48. abstract_id_level_text_dict = {}
  49. for w_abstract_num in w_abstract_num_list:
  50. w_abstract_num_id = w_abstract_num.getAttribute("w:abstractNumId")
  51. w_lvl_list = w_abstract_num.getElementsByTagName("w:lvl")
  52. level_start_dict = {}
  53. level_text_dict = {}
  54. for w_lvl in w_lvl_list:
  55. w_ilvl_value = w_lvl.getAttribute('w:ilvl')
  56. if w_lvl.getElementsByTagName("w:start"):
  57. w_ilvl_start_num = w_lvl.getElementsByTagName("w:start")[0].getAttribute("w:val")
  58. level_start_dict[int(w_ilvl_value)] = int(w_ilvl_start_num)
  59. if w_lvl.getElementsByTagName("w:lvlText") and w_lvl.getElementsByTagName("w:numFmt"):
  60. w_lvl_text = w_lvl.getElementsByTagName("w:lvlText")[0].getAttribute("w:val")
  61. w_lvl_format = w_lvl.getElementsByTagName("w:numFmt")[0].getAttribute("w:val")
  62. if w_lvl_format == 'upperLetter':
  63. w_lvl_text = re.sub('%\d', '%A', w_lvl_text)
  64. elif w_lvl_format == 'lowerLetter':
  65. w_lvl_text = re.sub('%\d', '%a', w_lvl_text)
  66. level_text_dict[int(w_ilvl_value)] = w_lvl_text
  67. abstract_id_level_dict[w_abstract_num_id] = level_start_dict
  68. abstract_id_level_text_dict[w_abstract_num_id] = level_text_dict
  69. # 映射回真实id
  70. real_id_level_start_dict = {}
  71. for abstract_id in abstract_real_id_dict.keys():
  72. real_id = abstract_real_id_dict.get(abstract_id)
  73. level_start_dict = abstract_id_level_dict.get(abstract_id)
  74. if level_start_dict:
  75. real_id_level_start_dict[int(real_id)] = level_start_dict
  76. real_id_level_text_dict = {}
  77. for abstract_id in abstract_real_id_dict.keys():
  78. real_id = abstract_real_id_dict.get(abstract_id)
  79. level_text_dict = abstract_id_level_text_dict.get(abstract_id)
  80. if level_text_dict:
  81. real_id_level_text_dict[int(real_id)] = level_text_dict
  82. return real_id_level_start_dict, real_id_level_text_dict
def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numbering_xml, document_xml_rels,
                is_sdt=False):
    """
    Read the text under a w:p node, including its list number.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation had been flattened - confirm it against the original file.

    :param unique_type_dir: directory prefix; 'word/' plus the relationship
        target is appended to it to locate embedded images
    :param p_node: minidom w:p element to read
    :param _last_node_level: list level of the previously numbered paragraph
    :param _num_pr_dict: running counters, {group id: {level: count}};
        updated in place and also returned
    :param numbering_xml: parsed numbering part, passed to read_no_start
    :param document_xml_rels: parsed relationships, passed to read_rel_image
    :param is_sdt: True when the paragraph belongs to a table-of-contents
        (w:sdt) block
    :return: (_text_list, _order_list, _num_pr_dict, _last_node_level);
        _text_list holds the single paragraph string and _order_list the
        single tag 'w:t' or 'w:t html'
    """
    _text_list = []
    _order_list = []
    # Number prefix of this paragraph (stays empty when it is not numbered).
    text_no = ''
    # Start value and display template of every numbering group.
    id_level_start_dict, id_level_text_dict = read_no_start(numbering_xml)
    # Extract the numbering triple: group - level - ordinal.
    num_pr = p_node.getElementsByTagName("w:numPr")
    if num_pr:
        num_pr = num_pr[0]
        if num_pr.getElementsByTagName("w:numId"):
            group_id = int(num_pr.getElementsByTagName("w:numId")[0].getAttribute("w:val"))
            if group_id >= 1:
                node_level = num_pr.getElementsByTagName("w:ilvl")
                if node_level:
                    node_level = int(node_level[0].getAttribute("w:val"))
                    if group_id in _num_pr_dict.keys():
                        if node_level == 0 and _num_pr_dict.get(group_id) and node_level not in _num_pr_dict.get(group_id).keys():
                            _num_pr_dict[group_id][node_level] = 1
                        if _last_node_level != 0 and node_level < _last_node_level:
                            # Moving back to a shallower level: reset every
                            # level between node_level and the previous level.
                            for l in range(node_level+1, _last_node_level+1):
                                _num_pr_dict[group_id][l] = 0
                            if _num_pr_dict[group_id].get(node_level):
                                _num_pr_dict[group_id][node_level] += 1
                            else:
                                pass
                        elif node_level in _num_pr_dict.get(group_id).keys():
                            _num_pr_dict[group_id][node_level] += 1
                        else:
                            _num_pr_dict[group_id][node_level] = 1
                    else:
                        # First paragraph seen for this numbering group.
                        _num_pr_dict[group_id] = {node_level: 1}
                    # Build the prefix from level 0 down to the current level.
                    for level in range(node_level+1):
                        # Number of nodes counted so far at this level.
                        if level not in _num_pr_dict.get(group_id):
                            if not id_level_start_dict.get(group_id) or level not in id_level_start_dict.get(group_id):
                                continue
                            else:
                                # No counter yet: fall back to the declared start value.
                                level_node_cnt = id_level_start_dict[group_id][level]
                        else:
                            level_node_cnt = _num_pr_dict[group_id][level]
                            if id_level_start_dict.get(group_id) and _num_pr_dict.get(group_id) and id_level_start_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
                                # Shift the running counter by the declared start value.
                                start_no = id_level_start_dict.get(group_id).get(level)
                                level_node_cnt += start_no - 1
                        level_text = None
                        if id_level_text_dict.get(group_id) and id_level_text_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
                            level_text = id_level_text_dict.get(group_id).get(level)
                        if level_text:
                            # read_no_start rewrites letter formats to '%a' / '%A';
                            # note that the search matches an 'a' / 'A' anywhere
                            # in the template, not only the placeholder.
                            if re.search('a', level_text):
                                level_node_cnt = chr(ord('a') + level_node_cnt - 1)
                                text_no += re.sub('%a', str(level_node_cnt), level_text)
                            elif re.search('A', level_text):
                                level_node_cnt = chr(ord('A') + level_node_cnt - 1)
                                text_no += re.sub('%A', str(level_node_cnt), level_text)
                            else:
                                text_no += re.sub('%\d', str(level_node_cnt), level_text)
                        else:
                            # No template available: plain "<count>." prefix.
                            text_no += str(level_node_cnt) + '.'
                    _last_node_level = node_level
    p_node_text = ''
    has_html = False
    # The number prefix goes first.
    if text_no:
        p_node_text += text_no
    text = p_node.getElementsByTagName("w:t")
    # Table-of-contents entry: exactly two text runs joined by a dotted leader.
    if is_sdt and len(text) == 2:
        p_node_text += text[0].childNodes[0].nodeValue + '.'*20 + text[1].childNodes[0].nodeValue
    # Regular paragraph.
    else:
        image_rel_dict = read_rel_image(document_xml_rels)
        p_node_all = p_node.getElementsByTagName("*")
        for node in p_node_all:
            # Text run.
            if "w:t" in str(node).split(' '):
                if node.childNodes:
                    p_node_text += node.childNodes[0].nodeValue
            # Image: converted right here and inlined as HTML instead of being
            # added to the page as an Image object.
            elif "a:blip" in str(node).split(' '):
                _id = node.getAttribute("r:embed")
                image_path = image_rel_dict.get(_id)
                if image_path:
                    image_path = unique_type_dir + 'word/' + image_path
                    image_convert = ImageConvert(image_path, '')
                    image_html = image_convert.get_html()[0]
                    # An int result is replaced by an empty string
                    # (presumably an error code - confirm against ImageConvert).
                    if isinstance(image_html, int):
                        image_html = ''
                    p_node_text += image_html
                    has_html = True
    # The paragraph consists of the number only: drop its last character.
    if len(p_node_text) > 0 and p_node_text == text_no:
        p_node_text = p_node_text[:-1]
    _text_list.append(p_node_text)
    if has_html:
        _order_list.append('w:t html')
    else:
        _order_list.append('w:t')
    return _text_list, _order_list, _num_pr_dict, _last_node_level
  235. @timeout(50, timeout_exception=TimeoutError)
  236. def read_xml_order(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
  237. log("into read_xml_order")
  238. try:
  239. body = document_xml.getElementsByTagName("w:body")[0]
  240. order_list = []
  241. text_list = []
  242. # 编号组记录
  243. num_pr_dict = {}
  244. last_node_level = 0
  245. for line in body.childNodes:
  246. # 普通文本
  247. if "w:p" in str(line):
  248. t_list, o_list, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
  249. line,
  250. last_node_level,
  251. num_pr_dict,
  252. numbering_xml,
  253. document_xml_rels)
  254. text_list += t_list
  255. order_list += o_list
  256. # 目录索引
  257. elif "w:sdt" in str(line):
  258. sdt = line
  259. for sdt_child in sdt.childNodes:
  260. if "w:sdtContent" in str(sdt_child):
  261. sdt_content = sdt_child
  262. for sdt_content_child in sdt_content.childNodes:
  263. if 'w:p' in str(sdt_content_child):
  264. t_list, o_list, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
  265. sdt_content_child,
  266. last_node_level,
  267. num_pr_dict,
  268. numbering_xml,
  269. document_xml_rels,
  270. is_sdt=True)
  271. text_list += t_list
  272. order_list += o_list
  273. elif "w:tbl" in str(line):
  274. order_list.append("w:tbl")
  275. # read_xml_table(path, save_path)
  276. return [order_list, text_list]
  277. except Exception as e:
  278. log("read_xml_order error!")
  279. traceback.print_exc()
  280. return [-1]
  281. @timeout(50, timeout_exception=TimeoutError)
  282. def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
  283. def recursion_read_table(table):
  284. table_text = '<table border="1">'
  285. tr_index = 0
  286. tr_text_list = []
  287. last_node_level = 0
  288. num_pr_dict = {}
  289. # 直接子节点用child表示,所有子节点用all表示
  290. row_span_dict = {}
  291. for table_child in table.childNodes:
  292. if 'w:tr' in str(table_child):
  293. table_text += "<tr>"
  294. tr = table_child
  295. tr_child_nodes = tr.childNodes
  296. tc_index = 0
  297. tc_text_list = []
  298. for tr_child in tr_child_nodes:
  299. if 'w:tc' in str(tr_child).split(' '):
  300. tc_text = ""
  301. tc = tr_child
  302. # 获取一格占多少列,相当于colspan
  303. col_span = tc.getElementsByTagName("w:gridSpan")
  304. if col_span:
  305. col_span = int(col_span[0].getAttribute("w:val"))
  306. else:
  307. col_span = 1
  308. # 获取是否是合并单元格的下一个空单元格,相当于rowspan
  309. is_merge = tc.getElementsByTagName("w:vMerge")
  310. if is_merge:
  311. is_merge = is_merge[0].getAttribute("w:val")
  312. # print(tr_index, tc_index, is_merge)
  313. # print('row_span_dict', row_span_dict)
  314. if is_merge == "continue":
  315. row_span_dict[tc_index][0] += 1
  316. tc_index += col_span
  317. # 跳过,不增加td
  318. continue
  319. # col_span_index = 0
  320. # real_tc_index = 0
  321. # if 0 <= tr_index - 1 < len(tr_text_list):
  322. # for tc_colspan in tr_text_list[tr_index - 1]:
  323. # if col_span_index < tc_index:
  324. # col_span_index += tc_colspan[1]
  325. # real_tc_index += 1
  326. # if real_tc_index < len(tr_text_list[tr_index - 1]):
  327. # tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
  328. else:
  329. # 先结束上一次同列的合并单元格
  330. if tc_index in row_span_dict:
  331. row_span, finish_row_span_flag = row_span_dict.get(tc_index)
  332. table_text = re.sub(finish_row_span_flag, str(row_span), table_text)
  333. # 开启新的合并单元格
  334. row_span_flag = '#@#_{}_{}'.format(tr_index, tc_index)
  335. row_span_dict[tc_index] = [1, row_span_flag]
  336. else:
  337. row_span_flag = 1
  338. # 设置colspan
  339. table_text = table_text + "<td rowspan={} colspan={}>".format(row_span_flag, col_span)
  340. # 放入文本
  341. tc_child_nodes = tc.childNodes
  342. for tc_child in tc_child_nodes:
  343. if 'w:tbl' in str(tc_child).split(' '):
  344. # 嵌套在tc中的表格
  345. tc_text += recursion_read_table(tc_child)
  346. if 'w:p' in str(tc_child).split(' '):
  347. tc_p_all_nodes = tc_child.getElementsByTagName("*")
  348. _t_list, _, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
  349. tc_child,
  350. last_node_level,
  351. num_pr_dict,
  352. numbering_xml,
  353. document_xml_rels)
  354. # print('_t_list', _t_list)
  355. tc_text += ''.join(_t_list)
  356. # for tc_p_all in tc_p_all_nodes:
  357. # if 'w:t' in str(tc_p_all).split(' '):
  358. # # w:t必须加childNodes[0]才能读文本
  359. # tc_text += tc_p_all.childNodes[0].nodeValue
  360. # print('tc_text', tc_text)
  361. # 结束该tc
  362. table_text = table_text + tc_text + "</td>"
  363. tc_index += col_span
  364. tc_text_list.append([tc_text, col_span])
  365. # 结束该tr
  366. table_text += "</tr>"
  367. tr_index += 1
  368. tr_text_list.append(tc_text_list)
  369. # 替换所有row_span
  370. for key in row_span_dict.keys():
  371. row_span, finish_row_span_flag = row_span_dict.get(key)
  372. table_text = re.sub(finish_row_span_flag, str(row_span), table_text)
  373. # 结束该table
  374. table_text += "</table>"
  375. return table_text
  376. log("into read_xml_table")
  377. try:
  378. body = document_xml.getElementsByTagName("w:body")[0]
  379. table_text_list = []
  380. body_nodes = body.childNodes
  381. for node in body_nodes:
  382. if 'w:tbl' in str(node).split(' '):
  383. _table = node
  384. _table_text = recursion_read_table(_table)
  385. table_text_list.append(_table_text)
  386. return table_text_list
  387. except Exception as e:
  388. log("read_xml_table error")
  389. print("read_xml_table", traceback.print_exc())
  390. return [-1]
  391. @timeout(25, timeout_exception=TimeoutError)
  392. def parse_xml(path):
  393. # 解析xml
  394. DOMTree = xml.dom.minidom.parse(path)
  395. collection = DOMTree.documentElement
  396. return collection
  397. @timeout(25, timeout_exception=TimeoutError)
  398. def parse_xml2(path):
  399. # 解析xml
  400. tree = xml.etree.ElementTree.parse(path)
  401. root = tree.getroot()
  402. return root
  403. class DocxConvert:
  404. def __init__(self, path, unique_type_dir):
  405. self._doc = _Document(path)
  406. self.path = path
  407. self.unique_type_dir = unique_type_dir
  408. # 解压docx
  409. try:
  410. f = zipfile.ZipFile(path)
  411. for file in f.namelist():
  412. if "word/" in str(file):
  413. f.extract(file, self.unique_type_dir)
  414. f.close()
  415. except Exception as e:
  416. log("docx format error!")
  417. self._doc.error_code = [-3]
  418. # 读取内容
  419. try:
  420. self.document_xml = parse_xml(self.unique_type_dir + "word/document.xml")
  421. if os.path.exists(self.unique_type_dir + "word/numbering.xml"):
  422. self.numbering_xml = parse_xml(self.unique_type_dir + "word/numbering.xml")
  423. else:
  424. self.numbering_xml = []
  425. if os.path.exists(self.unique_type_dir + "word/_rels/document.xml.rels"):
  426. self.document_xml_rels = parse_xml2(self.unique_type_dir + "word/_rels/document.xml.rels")
  427. else:
  428. self.document_xml_rels = []
  429. except FileNotFoundError:
  430. # 找不到解压文件,就用html格式读
  431. log('FileNotFoundError')
  432. self._doc.error_code = None
  433. except TimeoutError:
  434. log("parse_xml timeout")
  435. self._doc.error_code = [-4]
  436. @memory_decorator
  437. def init_package(self):
  438. # 各个包初始化
  439. try:
  440. self.docx = docx.Document(self.path)
  441. self.zip = zipfile.ZipFile(self.path)
  442. except:
  443. log("cannot open docx!")
  444. traceback.print_exc()
  445. self._doc.error_code = [-3]
  446. def convert(self):
  447. self._page = _Page(None, 0)
  448. # 先判断特殊doc文件,可能是html文本
  449. is_html_doc = False
  450. try:
  451. with open(self.path, 'r') as f:
  452. html_str = f.read()
  453. if re.search('<div|<html|<body|<head|<tr|<br|<table|<td', html_str):
  454. soup = BeautifulSoup(html_str, 'lxml')
  455. text = soup.text
  456. is_html_doc = True
  457. except:
  458. pass
  459. if is_html_doc:
  460. _sen = _Sentence(text, (0, 0, 0, 0))
  461. self._page.add_child(_sen)
  462. self._doc.add_child(self._page)
  463. return
  464. self.init_package()
  465. if self._doc.error_code is not None:
  466. return
  467. order_and_text_list = self.get_orders()
  468. if judge_error_code(order_and_text_list):
  469. self._doc.error_code = order_and_text_list
  470. return
  471. order_list, text_list = order_and_text_list
  472. # 乱码返回文件格式错误
  473. match1 = re.findall(get_garble_code(), ''.join(text_list))
  474. if len(match1) > 10:
  475. log("doc/docx garbled code!")
  476. self._doc.error_code = [-3]
  477. # _sen = _Sentence('文件乱码!', (0, 0, 0, 0))
  478. # self._page.add_child(_sen)
  479. self._doc.add_child(self._page)
  480. return
  481. # test
  482. # for i in range(len(text_list)):
  483. # print(order_list[i], text_list[i])
  484. table_list = self.get_tables()
  485. if judge_error_code(table_list):
  486. self._doc.error_code = table_list
  487. return
  488. # paragraph_list = self.get_paragraphs()
  489. image_list = self.get_images()
  490. order_y = 0
  491. doc_pr_cnt = 0
  492. for tag in order_list:
  493. bbox = (0, order_y, 0, 0)
  494. if tag == "w:t html":
  495. if len(text_list) > 0:
  496. _para = text_list.pop(0)
  497. _sen = _Sentence(_para, bbox)
  498. _sen.combine = False
  499. _sen.is_html = True
  500. self._page.add_child(_sen)
  501. if tag == "w:t":
  502. if len(text_list) > 0:
  503. _para = text_list.pop(0)
  504. _sen = _Sentence(_para, bbox)
  505. _sen.combine = False
  506. self._page.add_child(_sen)
  507. if tag == "wp:docPr":
  508. if len(image_list) > 0:
  509. temp_image_path = self.unique_type_dir + "docpr" + str(doc_pr_cnt) + ".png"
  510. _image = image_list.pop(0)
  511. with open(temp_image_path, "wb") as f:
  512. f.write(_image)
  513. _img = _Image(_image, temp_image_path, bbox)
  514. _img.is_from_docx = True
  515. self._page.add_child(_img)
  516. doc_pr_cnt += 1
  517. if tag == "w:tbl":
  518. if len(table_list) > 0:
  519. _table = table_list.pop(0)
  520. _table = _Table(_table, bbox)
  521. _table.is_html = True
  522. self._page.add_child(_table)
  523. order_y += 1
  524. if self._doc.error_code is None and self._page.error_code is not None:
  525. self._doc.error_code = self._page.error_code
  526. self._doc.add_child(self._page)
  527. @memory_decorator
  528. def get_tables(self):
  529. # 遍历表
  530. table_list = read_xml_table(self.unique_type_dir, self.document_xml, self.numbering_xml, self.document_xml_rels)
  531. return table_list
  532. def get_images(self):
  533. # 顺序遍历图片
  534. image_list = []
  535. pattern = re.compile('rId\d+')
  536. for graph in self.docx.paragraphs:
  537. for run in graph.runs:
  538. if run.text == '':
  539. try:
  540. if not pattern.search(run.element.xml):
  541. continue
  542. content_id = pattern.search(run.element.xml).group(0)
  543. content_type = self.docx.part.related_parts[content_id].content_type
  544. except Exception as e:
  545. print("docx no image!", e)
  546. continue
  547. if not content_type.startswith('image'):
  548. continue
  549. img_data = self.docx.part.related_parts[content_id].blob
  550. if img_data is not None:
  551. image_list.append(img_data)
  552. return image_list
  553. @memory_decorator
  554. def get_orders(self):
  555. # 解析document.xml,获取文字顺序
  556. order_and_text_list = read_xml_order(self.unique_type_dir, self.document_xml, self.numbering_xml, self.document_xml_rels)
  557. return order_and_text_list
  558. def get_doc_object(self):
  559. return self._doc
  560. def get_html(self):
  561. if self._doc.error_code is not None:
  562. return self._doc.error_code
  563. try:
  564. self.convert()
  565. except:
  566. traceback.print_exc()
  567. self._doc.error_code = [-1]
  568. # log('docx error code ' + str(self._doc.error_code))
  569. if self._doc.error_code is not None:
  570. # 调用tika提取
  571. html = from_tika_interface(self.path)
  572. if judge_error_code(html):
  573. self._doc.error_code = html
  574. return self._doc.error_code
  575. else:
  576. return [html]
  577. return self._doc.get_html()
  578. class DocxConvertNew:
  579. # 解压 .docx 文件
  580. def unzip_docx(self, file_path, extract_to):
  581. with zipfile.ZipFile(file_path, 'r') as zip_ref:
  582. zip_ref.extractall(extract_to)
  583. # 解析 numbering.xml 文件,获取编号信息
  584. def parse_numbering(self, file_path):
  585. numbering = defaultdict(list)
  586. dom = xml.dom.minidom.parse(file_path)
  587. root = dom.documentElement
  588. for num in root.getElementsByTagName("w:num"):
  589. num_id = num.getAttribute("w:numId")
  590. for lvl in num.getElementsByTagName("w:lvl"):
  591. lvl_index = lvl.getAttribute("w:ilvl")
  592. num_fmt = lvl.getElementsByTagName("w:numFmt")[0].getAttribute("w:val")
  593. num_text = lvl.getElementsByTagName("w:numText")[0].getAttribute("w:val") if lvl.getElementsByTagName("w:numText") else None
  594. numbering[num_id].append((lvl_index, num_fmt, num_text))
  595. return numbering
  596. # 解析 document.xml.rels 文件,获取图片引用信息
  597. def parse_rels(self, file_path):
  598. rels = {}
  599. dom = xml.dom.minidom.parse(file_path)
  600. root = dom.documentElement
  601. for rel in root.getElementsByTagName("Relationship"):
  602. rel_id = rel.getAttribute("Id")
  603. rel_type = rel.getAttribute("Type")
  604. target = rel.getAttribute("Target")
  605. rels[rel_id] = {"type": rel_type, "target": target}
  606. return rels
  607. # 解析 document.xml 文件,获取文档内容
  608. def parse_document(self, file_path, numbering, rels):
  609. dom = xml.dom.minidom.parse(file_path)
  610. root = dom.documentElement
  611. paragraphs = root.getElementsByTagName("w:p")
  612. content = []
  613. for para in paragraphs:
  614. para_text = ""
  615. num_id = None
  616. ilvl = None
  617. for child in para.childNodes:
  618. if child.nodeName == "w:pPr":
  619. for num_id_node in child.getElementsByTagName("w:numId"):
  620. num_id = num_id_node.getAttribute("w:val")
  621. for ilvl_node in child.getElementsByTagName("w:ilvl"):
  622. ilvl = ilvl_node.getAttribute("w:ilvl")
  623. elif child.nodeName == "w:r":
  624. for t in child.getElementsByTagName("w:t"):
  625. para_text += t.firstChild.nodeValue if t.firstChild else ""
  626. if num_id and ilvl not in [None, '']:
  627. num_fmt, num_text = numbering[num_id][int(ilvl)][1:]
  628. if num_fmt == "decimal":
  629. para_text = f"{int(ilvl) + 1}. {para_text}"
  630. elif num_text:
  631. para_text = f"{num_text} {para_text}"
  632. content.append(para_text)
  633. # 解析表格
  634. tables = root.getElementsByTagName("w:tbl")
  635. for table in tables:
  636. table_content = []
  637. row_count = 0
  638. col_count = 0
  639. for row in table.getElementsByTagName("w:tr"):
  640. row_content = []
  641. cell_count = 0
  642. for cell in row.getElementsByTagName("w:tc"):
  643. cell_text = ""
  644. for para in cell.getElementsByTagName("w:p"):
  645. for run in para.getElementsByTagName("w:r"):
  646. for text in run.getElementsByTagName("w:t"):
  647. cell_text += text.firstChild.nodeValue if text.firstChild else ""
  648. # 检查合并单元格
  649. grid_span = 1
  650. v_merge = False
  651. for child in cell.childNodes:
  652. if child.nodeName == "w:tcPr":
  653. for grid_span_node in child.getElementsByTagName("w:gridSpan"):
  654. grid_span = int(grid_span_node.getAttribute("w:val"))
  655. for v_merge_node in child.getElementsByTagName("w:vMerge"):
  656. v_merge = True
  657. row_content.append({
  658. "text": cell_text,
  659. "colspan": grid_span,
  660. "rowspan": 1 if not v_merge else 2 # 简化处理,实际需要根据上下文确定
  661. })
  662. cell_count += grid_span
  663. table_content.append(row_content)
  664. row_count += 1
  665. col_count = max(col_count, cell_count)
  666. content.append(table_content)
  667. # 解析图片
  668. for rel in rels.values():
  669. if rel["type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image":
  670. content.append(f"图片: {rel['target']}")
  671. return content
  672. # 生成 HTML 输出
  673. def generate_html(self, content):
  674. html = []
  675. html.append('<!DOCTYPE HTML><head><meta charset="UTF-8"></head><html><body>')
  676. for item in content:
  677. if isinstance(item, list): # 表格内容
  678. html.append("<table border='1'>")
  679. for row in item:
  680. html.append("<tr>")
  681. for cell in row:
  682. colspan = cell.get("colspan", 1)
  683. rowspan = cell.get("rowspan", 1)
  684. html.append(f"<td colspan='{colspan}' rowspan='{rowspan}'>{cell['text']}</td>")
  685. html.append("</tr>")
  686. html.append("</table>")
  687. else: # 普通文本或图片
  688. html.append(f"<p>{item}</p>")
  689. html.append("</body></html>")
  690. return "\n".join(html)
  691. # 主函数
  692. def read_docx(self, file_path):
  693. extract_to = "extracted_docx"
  694. self.unzip_docx(file_path, extract_to)
  695. numbering = self.parse_numbering(os.path.join(extract_to, "word", "numbering.xml"))
  696. rels = self.parse_rels(os.path.join(extract_to, "word", "_rels", "document.xml.rels"))
  697. content = self.parse_document(os.path.join(extract_to, "word", "document.xml"), numbering, rels)
  698. html_output = self.generate_html(content)
  699. with open("../result.html", "w", encoding="utf-8") as f:
  700. f.write(html_output)
  701. if __name__ == '__main__':
  702. c = DocxConvert("C:/Users/Administrator/Downloads/dsdsd.docx", "C:/Users/Administrator/Downloads/1/")
  703. print(c.get_html())
  704. # c = DocxConvertNew()
  705. # # c.read_docx(r'C:\Users\Administrator\Desktop\test_doc\error14.docx')
  706. # c.read_docx(r'C:/Users/Administrator/Downloads/dsdsd.docx')