# convert_docx.py
  1. import os
  2. import sys
  3. from collections import defaultdict
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
  6. import re
  7. import traceback
  8. import xml
  9. import zipfile
  10. import docx
  11. from bs4 import BeautifulSoup
  12. from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code, \
  13. get_table_html
  14. from format_convert.wrapt_timeout_decorator import timeout
  15. from format_convert.convert_image import ImageConvert
  16. from format_convert.convert_need_interface import from_tika_interface
  17. def docx2text():
  18. return
  19. def read_rel_image(document_xml_rels):
  20. if not document_xml_rels:
  21. return {}
  22. # 获取映射文件里的关系 Id-Target
  23. image_rel_dict = {}
  24. for rel in document_xml_rels:
  25. if 'Relationship' in str(rel):
  26. _id = rel.get("Id")
  27. _target = rel.get("Target")
  28. _type = rel.get("Type")
  29. if 'image' in _type:
  30. image_rel_dict[_id] = _target
  31. return image_rel_dict
  32. def read_no_start(numbering_xml):
  33. """
  34. 读取编号组的起始值
  35. :return:
  36. """
  37. if not numbering_xml:
  38. return {}, {}
  39. # 获取虚拟-真实id映射关系
  40. w_num_list = numbering_xml.getElementsByTagName("w:num")
  41. abstract_real_id_dict = {}
  42. for w_num in w_num_list:
  43. w_num_id = w_num.getAttribute("w:numId")
  44. w_abstract_num_id = w_num.getElementsByTagName('w:abstractNumId')[0].getAttribute("w:val")
  45. abstract_real_id_dict[w_abstract_num_id] = w_num_id
  46. # 获取虚拟id的开始编号
  47. w_abstract_num_list = numbering_xml.getElementsByTagName("w:abstractNum")
  48. abstract_id_level_dict = {}
  49. abstract_id_level_text_dict = {}
  50. for w_abstract_num in w_abstract_num_list:
  51. w_abstract_num_id = w_abstract_num.getAttribute("w:abstractNumId")
  52. w_lvl_list = w_abstract_num.getElementsByTagName("w:lvl")
  53. level_start_dict = {}
  54. level_text_dict = {}
  55. for w_lvl in w_lvl_list:
  56. w_ilvl_value = w_lvl.getAttribute('w:ilvl')
  57. if w_lvl.getElementsByTagName("w:start"):
  58. w_ilvl_start_num = w_lvl.getElementsByTagName("w:start")[0].getAttribute("w:val")
  59. level_start_dict[int(w_ilvl_value)] = int(w_ilvl_start_num)
  60. if w_lvl.getElementsByTagName("w:lvlText") and w_lvl.getElementsByTagName("w:numFmt"):
  61. w_lvl_text = w_lvl.getElementsByTagName("w:lvlText")[0].getAttribute("w:val")
  62. w_lvl_format = w_lvl.getElementsByTagName("w:numFmt")[0].getAttribute("w:val")
  63. if w_lvl_format == 'upperLetter':
  64. w_lvl_text = re.sub('%\d', '%A', w_lvl_text)
  65. elif w_lvl_format == 'lowerLetter':
  66. w_lvl_text = re.sub('%\d', '%a', w_lvl_text)
  67. level_text_dict[int(w_ilvl_value)] = w_lvl_text
  68. abstract_id_level_dict[w_abstract_num_id] = level_start_dict
  69. abstract_id_level_text_dict[w_abstract_num_id] = level_text_dict
  70. # 映射回真实id
  71. real_id_level_start_dict = {}
  72. for abstract_id in abstract_real_id_dict.keys():
  73. real_id = abstract_real_id_dict.get(abstract_id)
  74. level_start_dict = abstract_id_level_dict.get(abstract_id)
  75. if level_start_dict:
  76. real_id_level_start_dict[int(real_id)] = level_start_dict
  77. real_id_level_text_dict = {}
  78. for abstract_id in abstract_real_id_dict.keys():
  79. real_id = abstract_real_id_dict.get(abstract_id)
  80. level_text_dict = abstract_id_level_text_dict.get(abstract_id)
  81. if level_text_dict:
  82. real_id_level_text_dict[int(real_id)] = level_text_dict
  83. return real_id_level_start_dict, real_id_level_text_dict
def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numbering_xml, document_xml_rels,
                is_sdt=False):
    """
    Read the text under a w:p (paragraph) node, including its list-numbering prefix.

    :param unique_type_dir: working directory the docx was unzipped into
    :param p_node: the w:p DOM node to read
    :param _last_node_level: list level of the previous numbered paragraph
    :param _num_pr_dict: running counters {group_id: {level: count}} carried across calls
    :param numbering_xml: parsed word/numbering.xml DOM (or falsy when absent)
    :param document_xml_rels: parsed word/_rels/document.xml.rels (or falsy when absent)
    :param is_sdt: True when the paragraph sits inside a w:sdt (table-of-contents) block
    :return: (text_list, order_list, updated _num_pr_dict, updated _last_node_level)
    """
    _text_list = []
    _order_list = []
    # numbering prefix of this paragraph's text (empty when not numbered)
    text_no = ''
    # starting values and display templates of every numbering group
    id_level_start_dict, id_level_text_dict = read_no_start(numbering_xml)
    # Extract numbering info: group -> level -> sequence counter
    num_pr = p_node.getElementsByTagName("w:numPr")
    if num_pr:
        num_pr = num_pr[0]
        if num_pr.getElementsByTagName("w:numId"):
            group_id = int(num_pr.getElementsByTagName("w:numId")[0].getAttribute("w:val"))
            if group_id >= 1:
                node_level = num_pr.getElementsByTagName("w:ilvl")
                if node_level:
                    node_level = int(node_level[0].getAttribute("w:val"))
                    if group_id in _num_pr_dict.keys():
                        # first time level 0 appears for an already-known group
                        if node_level == 0 and _num_pr_dict.get(group_id) and node_level not in _num_pr_dict.get(group_id).keys():
                            _num_pr_dict[group_id][node_level] = 1
                        if _last_node_level != 0 and node_level < _last_node_level:
                            # dropped back to a shallower level: reset every counter
                            # between node_level and the previous level
                            for l in range(node_level+1, _last_node_level+1):
                                _num_pr_dict[group_id][l] = 0
                            if _num_pr_dict[group_id].get(node_level):
                                _num_pr_dict[group_id][node_level] += 1
                            else:
                                pass
                        elif node_level in _num_pr_dict.get(group_id).keys():
                            _num_pr_dict[group_id][node_level] += 1
                        else:
                            _num_pr_dict[group_id][node_level] = 1
                    else:
                        # first paragraph of a new numbering group
                        _num_pr_dict[group_id] = {node_level: 1}
                    # Compose the visible number from the counter of every level
                    for level in range(node_level+1):
                        # how many nodes seen at this level so far
                        if level not in _num_pr_dict.get(group_id):
                            if not id_level_start_dict.get(group_id) or level not in id_level_start_dict.get(group_id):
                                continue
                            else:
                                level_node_cnt = id_level_start_dict[group_id][level]
                        else:
                            level_node_cnt = _num_pr_dict[group_id][level]
                        # shift by the configured start value (start counts from 1)
                        if id_level_start_dict.get(group_id) and _num_pr_dict.get(group_id) and id_level_start_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
                            start_no = id_level_start_dict.get(group_id).get(level)
                            level_node_cnt += start_no - 1
                        level_text = None
                        if id_level_text_dict.get(group_id) and id_level_text_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
                            level_text = id_level_text_dict.get(group_id).get(level)
                        if level_text:
                            # render using the template marked by read_no_start
                            # ('%a' = lower letters, '%A' = upper letters, '%<digit>' = decimal)
                            if re.search('a', level_text):
                                level_node_cnt = chr(ord('a') + level_node_cnt - 1)
                                text_no += re.sub('%a', str(level_node_cnt), level_text)
                            elif re.search('A', level_text):
                                level_node_cnt = chr(ord('A') + level_node_cnt - 1)
                                text_no += re.sub('%A', str(level_node_cnt), level_text)
                            else:
                                text_no += re.sub('%\d', str(level_node_cnt), level_text)
                        else:
                            text_no += str(level_node_cnt) + '.'
                    _last_node_level = node_level
    # (a large block of superseded, commented-out extraction code removed here)
    p_node_text = ''
    has_html = False
    # prepend the numbering prefix, if any
    if text_no:
        p_node_text += text_no
    text = p_node.getElementsByTagName("w:t")
    # table-of-contents entry: "title .................... page"
    if is_sdt and len(text) == 2:
        p_node_text += text[0].childNodes[0].nodeValue + '.'*20 + text[1].childNodes[0].nodeValue
    # normal paragraph
    else:
        image_rel_dict = read_rel_image(document_xml_rels)
        p_node_all = p_node.getElementsByTagName("*")
        for node in p_node_all:
            # plain text run
            if "w:t" in str(node).split(' '):
                if node.childNodes:
                    p_node_text += node.childNodes[0].nodeValue
            # inline image: convert to HTML now instead of adding an Image
            # object to the Page later
            elif "a:blip" in str(node).split(' '):
                _id = node.getAttribute("r:embed")
                image_path = image_rel_dict.get(_id)
                if image_path:
                    image_path = unique_type_dir + 'word/' + image_path
                    image_convert = ImageConvert(image_path, '')
                    image_html = image_convert.get_html()[0]
                    # int results are error codes: drop them
                    if isinstance(image_html, int):
                        image_html = ''
                    p_node_text += image_html
                    has_html = True
    # paragraph consisted of the numbering only: strip the trailing separator
    if len(p_node_text) > 0 and p_node_text == text_no:
        p_node_text = p_node_text[:-1]
    _text_list.append(p_node_text)
    if has_html:
        _order_list.append('w:t html')
    else:
        _order_list.append('w:t')
    return _text_list, _order_list, _num_pr_dict, _last_node_level
@timeout(50, timeout_exception=TimeoutError)
def read_xml_order(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
    """Walk w:body in document order and collect paragraph texts plus an
    order list of tags ('w:t', 'w:t html', 'w:tbl') used later to interleave
    sentences, images and tables.

    :return: [order_list, text_list] on success, [-1] on any error.
    """
    log("into read_xml_order")
    try:
        body = document_xml.getElementsByTagName("w:body")[0]
        order_list = []
        text_list = []
        # numbering-group counters shared across all paragraphs
        num_pr_dict = {}
        last_node_level = 0
        for line in body.childNodes:
            # ordinary paragraph
            if "w:p" in str(line):
                t_list, o_list, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
                                                                           line,
                                                                           last_node_level,
                                                                           num_pr_dict,
                                                                           numbering_xml,
                                                                           document_xml_rels)
                text_list += t_list
                order_list += o_list
            # table-of-contents container
            elif "w:sdt" in str(line):
                sdt = line
                for sdt_child in sdt.childNodes:
                    if "w:sdtContent" in str(sdt_child):
                        sdt_content = sdt_child
                        for sdt_content_child in sdt_content.childNodes:
                            if 'w:p' in str(sdt_content_child):
                                t_list, o_list, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
                                                                                           sdt_content_child,
                                                                                           last_node_level,
                                                                                           num_pr_dict,
                                                                                           numbering_xml,
                                                                                           document_xml_rels,
                                                                                           is_sdt=True)
                                text_list += t_list
                                order_list += o_list
            # table: only its position is recorded; content is read elsewhere
            elif "w:tbl" in str(line):
                order_list.append("w:tbl")
        return [order_list, text_list]
    except Exception as e:
        log("read_xml_order error!")
        traceback.print_exc()
        return [-1]
@timeout(50, timeout_exception=TimeoutError)
def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
    """Collect every top-level w:tbl under w:body and convert it to HTML.

    :return: list of HTML table strings on success, [-1] on any error.
    """
    def recursion_read_table(table, show=0):
        # NOTE(review): legacy implementation, superseded by xml_table_to_html
        # below and no longer called from the active path. It builds the HTML
        # directly, using '#@#_{tr}_{tc}' placeholders for rowspans that are
        # patched via re.sub once the span size is known.
        table_text = '<table border="1">'
        tr_index = 0
        tr_text_list = []
        last_node_level = 0
        num_pr_dict = {}
        # direct children are referred to as "child", all descendants as "all"
        row_span_dict = {}
        for table_child in table.childNodes:
            if 'w:tr' in str(table_child):
                table_text += "<tr>"
                tr = table_child
                tr_child_nodes = tr.childNodes
                tc_index = 0
                tc_text_list = []
                for tr_child in tr_child_nodes:
                    if 'w:tc' in str(tr_child).split(' '):
                        tc_text = ""
                        tc = tr_child
                        # how many columns this cell spans (colspan)
                        col_span = tc.getElementsByTagName("w:gridSpan")
                        if col_span:
                            col_span = int(col_span[0].getAttribute("w:val"))
                        else:
                            col_span = 1
                        # vertically merged continuation cell (rowspan)
                        is_merge = tc.getElementsByTagName("w:vMerge")
                        if is_merge:
                            is_merge = is_merge[0].getAttribute("w:val")
                            if is_merge == "continue":
                                # extend the open rowspan in this column
                                # (assumes a 'restart' cell opened it earlier)
                                row_span_dict[tc_index][0] += 1
                                tc_index += col_span
                                tc_text_list.append([tc_text, col_span])
                                # skip: no <td> emitted for continuation cells
                                continue
                            else:
                                # close any previous merge in the same column
                                if tc_index in row_span_dict:
                                    row_span, finish_row_span_flag = row_span_dict.get(tc_index)
                                    table_text = re.sub(finish_row_span_flag, str(row_span), table_text)
                                # open a new merged cell
                                row_span_flag = '#@#_{}_{}'.format(tr_index, tc_index)
                                row_span_dict[tc_index] = [1, row_span_flag]
                        else:
                            row_span_flag = 1
                        # emit the cell with its colspan (rowspan may be a placeholder)
                        table_text = table_text + "<td rowspan={} colspan={}>".format(row_span_flag, col_span)
                        # fill in the cell text
                        tc_child_nodes = tc.childNodes
                        for tc_child in tc_child_nodes:
                            if 'w:tbl' in str(tc_child).split(' '):
                                # table nested inside the cell
                                tc_text += recursion_read_table(tc_child)
                            if 'w:p' in str(tc_child).split(' '):
                                tc_p_all_nodes = tc_child.getElementsByTagName("*")
                                _t_list, _, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
                                                                                       tc_child,
                                                                                       last_node_level,
                                                                                       num_pr_dict,
                                                                                       numbering_xml,
                                                                                       document_xml_rels)
                                tc_text += ''.join(_t_list)
                        # close this cell
                        table_text = table_text + tc_text + "</td>"
                        tc_index += col_span
                        tc_text_list.append([tc_text, col_span])
                # close this row
                table_text += "</tr>"
                tr_index += 1
                tr_text_list.append(tc_text_list)
        if show:
            for row in tr_text_list:
                print('row', row)
                print('len(row)', len(row))
        # resolve every remaining rowspan placeholder
        for key in row_span_dict.keys():
            row_span, finish_row_span_flag = row_span_dict.get(key)
            table_text = re.sub(finish_row_span_flag, str(row_span), table_text)
        # close the table
        table_text += "</table>"
        return table_text
    log("into read_xml_table")
    try:
        body = document_xml.getElementsByTagName("w:body")[0]
        table_text_list = []
        body_nodes = body.childNodes
        for node in body_nodes:
            if 'w:tbl' in str(node).split(' '):
                _table = node
                # active path: xml_table_to_html (recursion_read_table is legacy)
                _table_text = xml_table_to_html(_table, unique_type_dir, numbering_xml, document_xml_rels)
                table_text_list.append(_table_text)
        return table_text_list
    except Exception as e:
        log("read_xml_table error")
        print("read_xml_table", traceback.print_exc())
        return [-1]
def xml_table_to_html(table, unique_type_dir, numbering_xml, document_xml_rels, show=0):
    """Convert a w:tbl DOM node into an HTML table string.

    Cells are collected row by row as [text, colspan] pairs; vertically merged
    continuation cells are recorded as the sentinel '@continue@' so that
    row_list_to_table can compute rowspans afterwards. Tables nested inside a
    cell are converted recursively and inlined into the cell text.

    :param show: when truthy, print the intermediate row lists for debugging.
    :return: the HTML string produced by row_list_to_table.
    """
    tr_index = 0
    tr_text_list = []
    last_node_level = 0
    num_pr_dict = {}
    # direct children are referred to as "child", all descendants as "all"
    for table_child in table.childNodes:
        if 'w:tr' in str(table_child):
            tr = table_child
            tr_child_nodes = tr.childNodes
            tc_index = 0
            tc_text_list = []
            for tr_child in tr_child_nodes:
                if 'w:tc' in str(tr_child).split(' '):
                    tc_text = ""
                    tc = tr_child
                    # how many columns this cell spans (colspan)
                    col_span = tc.getElementsByTagName("w:gridSpan")
                    if col_span:
                        col_span = int(col_span[0].getAttribute("w:val"))
                    else:
                        col_span = 1
                    # vertically merged continuation cell (rowspan)
                    is_merge = tc.getElementsByTagName("w:vMerge")
                    if is_merge:
                        is_merge = is_merge[0].getAttribute("w:val")
                        if is_merge == "continue":
                            tc_index += col_span
                            # sentinel consumed later by row_list_to_table
                            tc_text = '@continue@'
                            tc_text_list.append([tc_text, col_span])
                            # skip: no cell emitted for continuation cells
                            continue
                    # fill in the cell text
                    tc_child_nodes = tc.childNodes
                    for tc_child in tc_child_nodes:
                        # table nested inside the cell
                        if 'w:tbl' in str(tc_child).split(' '):
                            tc_text += xml_table_to_html(tc_child, unique_type_dir, numbering_xml, document_xml_rels)
                        # paragraph (handles list numbering too)
                        if 'w:p' in str(tc_child).split(' '):
                            _t_list, _, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
                                                                                   tc_child,
                                                                                   last_node_level,
                                                                                   num_pr_dict,
                                                                                   numbering_xml,
                                                                                   document_xml_rels)
                            tc_text += ''.join(_t_list)
                    # close this cell
                    tc_index += col_span
                    tc_text_list.append([tc_text, col_span])
            # close this row
            tr_index += 1
            tr_text_list.append(tc_text_list)
    if show:
        for row in tr_text_list:
            print('row', row)
            print('len(row)', len(row))
    table_html = row_list_to_table(tr_text_list)
    return table_html
  457. def row_list_to_table(row_list, show=0):
  458. if show:
  459. print('='*50)
  460. # 复制合并列
  461. new_row_list = []
  462. for row in row_list:
  463. new_row = []
  464. for col, col_span in row:
  465. new_row += [[col, col_span]]
  466. if col_span > 1:
  467. new_row += [[col, 0]] * (col_span - 1)
  468. new_row_list.append(new_row)
  469. row_list = new_row_list
  470. if show:
  471. for row in row_list:
  472. print('copy row', row)
  473. # 计算是不是每行都有相等列数
  474. row_cnt_list = []
  475. for row in row_list:
  476. row_cnt_list.append(len(row))
  477. if len(set(row_cnt_list)) != 1:
  478. log('表格有列数不同,直接返回text' + str(row_cnt_list))
  479. # 直接返回所有col的text
  480. text = ''
  481. for row in row_list:
  482. for col, col_span in row:
  483. text += col
  484. return text
  485. new_row_list = []
  486. for ri, row in enumerate(row_list):
  487. new_row = []
  488. for ci, col in enumerate(row):
  489. col, col_span = col
  490. row_span = 1
  491. # 判断下面行同列有没有需合并的
  492. for ri2 in range(ri+1, len(row_list)):
  493. col2, col_span2 = row_list[ri2][ci]
  494. if col2 == '@continue@':
  495. row_span += 1
  496. else:
  497. break
  498. # 需跳过的列
  499. if col == '@continue@' or col_span == 0:
  500. delete = 1
  501. else:
  502. delete = 0
  503. col_dict = {
  504. 'text': col,
  505. 'rowspan': row_span,
  506. 'columnspan': col_span,
  507. 'delete': delete,
  508. }
  509. new_row.append(col_dict)
  510. new_row_list.append(new_row)
  511. if show:
  512. for new_row in new_row_list:
  513. print('new_row', new_row)
  514. table_html = get_table_html(new_row_list)
  515. # soup = BeautifulSoup(table_html, 'lxml')
  516. # print(soup.prettify())
  517. if show:
  518. print('-' * 50)
  519. return table_html
  520. @timeout(25, timeout_exception=TimeoutError)
  521. def parse_xml(path):
  522. # 解析xml
  523. DOMTree = xml.dom.minidom.parse(path)
  524. collection = DOMTree.documentElement
  525. return collection
  526. @timeout(25, timeout_exception=TimeoutError)
  527. def parse_xml2(path):
  528. # 解析xml
  529. tree = xml.etree.ElementTree.parse(path)
  530. root = tree.getroot()
  531. return root
class DocxConvert:
    """Convert a .docx file into the project's _Document/_Page tree.

    The docx archive is unzipped into ``unique_type_dir``; word/document.xml
    (content), word/numbering.xml (list numbering) and
    word/_rels/document.xml.rels (image relationships) are parsed, and the
    document is replayed in order as sentences, images and tables. If direct
    conversion fails, the tika service is used as a fallback.
    """
    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self._page = _Page(None, 0)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # unzip the docx archive (only the word/ part is needed)
        try:
            f = zipfile.ZipFile(path)
            for file in f.namelist():
                if "word/" in str(file):
                    f.extract(file, self.unique_type_dir)
            f.close()
        except Exception as e:
            log("docx format error!")
            self._doc.error_code = [-3]
        # parse the extracted XML parts
        try:
            self.document_xml = parse_xml(self.unique_type_dir + "word/document.xml")
            if os.path.exists(self.unique_type_dir + "word/numbering.xml"):
                self.numbering_xml = parse_xml(self.unique_type_dir + "word/numbering.xml")
            else:
                self.numbering_xml = []
            if os.path.exists(self.unique_type_dir + "word/_rels/document.xml.rels"):
                self.document_xml_rels = parse_xml2(self.unique_type_dir + "word/_rels/document.xml.rels")
            else:
                self.document_xml_rels = []
        except FileNotFoundError:
            # no extracted files: the "docx" is probably an HTML-style doc;
            # leave error_code unset so convert() can try the HTML path
            log('FileNotFoundError')
            self._doc.error_code = None
        except TimeoutError:
            log("parse_xml timeout")
            self._doc.error_code = [-4]

    @memory_decorator
    def init_package(self):
        # initialize the third-party readers (python-docx, zipfile)
        try:
            self.docx = docx.Document(self.path)
            self.zip = zipfile.ZipFile(self.path)
        except:
            log("cannot open docx!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Build the page tree from the parsed XML, in document order."""
        # Special case first: some .doc files are actually HTML text
        is_html_doc = False
        try:
            with open(self.path, 'r') as f:
                html_str = f.read()
                if re.search('<div|<html|<body|<head|<tr|<br|<table|<td', html_str):
                    soup = BeautifulSoup(html_str, 'lxml')
                    text = soup.text
                    is_html_doc = True
        except:
            pass
        if is_html_doc:
            _sen = _Sentence(text, (0, 0, 0, 0))
            self._page.add_child(_sen)
            self._doc.add_child(self._page)
            return
        self.init_package()
        if self._doc.error_code is not None:
            return
        order_and_text_list = self.get_orders()
        if judge_error_code(order_and_text_list):
            self._doc.error_code = order_and_text_list
            return
        order_list, text_list = order_and_text_list
        # garbled text is reported as a file-format error
        match1 = re.findall(get_garble_code(), ''.join(text_list))
        if len(match1) > 10:
            log("doc/docx garbled code!")
            self._doc.error_code = [-3]
            self._doc.add_child(self._page)
            return
        table_list = self.get_tables()
        if judge_error_code(table_list):
            self._doc.error_code = table_list
            return
        image_list = self.get_images()
        # replay the order list: each tag consumes the next element of its kind
        order_y = 0
        doc_pr_cnt = 0
        for tag in order_list:
            bbox = (0, order_y, 0, 0)
            # paragraph whose inline images were already converted to HTML
            if tag == "w:t html":
                if len(text_list) > 0:
                    _para = text_list.pop(0)
                    _sen = _Sentence(_para, bbox)
                    _sen.combine = False
                    _sen.is_html = True
                    self._page.add_child(_sen)
            # plain paragraph
            if tag == "w:t":
                if len(text_list) > 0:
                    _para = text_list.pop(0)
                    _sen = _Sentence(_para, bbox)
                    _sen.combine = False
                    self._page.add_child(_sen)
            # stand-alone image
            if tag == "wp:docPr":
                if len(image_list) > 0:
                    temp_image_path = self.unique_type_dir + "docpr" + str(doc_pr_cnt) + ".png"
                    _image = image_list.pop(0)
                    with open(temp_image_path, "wb") as f:
                        f.write(_image)
                    _img = _Image(_image, temp_image_path, bbox)
                    _img.is_from_docx = True
                    self._page.add_child(_img)
                    doc_pr_cnt += 1
            # table (already HTML)
            if tag == "w:tbl":
                if len(table_list) > 0:
                    _table = table_list.pop(0)
                    _table = _Table(_table, bbox)
                    _table.is_html = True
                    self._page.add_child(_table)
            order_y += 1
        # propagate a page-level error to the document
        if self._doc.error_code is None and self._page.error_code is not None:
            self._doc.error_code = self._page.error_code
        self._doc.add_child(self._page)

    @memory_decorator
    def get_tables(self):
        # convert every table in document order
        table_list = read_xml_table(self.unique_type_dir, self.document_xml, self.numbering_xml, self.document_xml_rels)
        return table_list

    def get_images(self):
        # collect image blobs in document order via python-docx runs
        image_list = []
        pattern = re.compile('rId\d+')
        for graph in self.docx.paragraphs:
            for run in graph.runs:
                # image runs have empty text and an rId reference in their XML
                if run.text == '':
                    try:
                        if not pattern.search(run.element.xml):
                            continue
                        content_id = pattern.search(run.element.xml).group(0)
                        content_type = self.docx.part.related_parts[content_id].content_type
                    except Exception as e:
                        print("docx no image!", e)
                        continue
                    if not content_type.startswith('image'):
                        continue
                    img_data = self.docx.part.related_parts[content_id].blob
                    if img_data is not None:
                        image_list.append(img_data)
        return image_list

    @memory_decorator
    def get_orders(self):
        # parse document.xml to get the text/tag order
        order_and_text_list = read_xml_order(self.unique_type_dir, self.document_xml, self.numbering_xml, self.document_xml_rels)
        return order_and_text_list

    def get_doc_object(self):
        return self._doc

    def use_tika(self, _path):
        """Fallback: let the tika service extract text/images/tables."""
        data = from_tika_interface(_path)
        if judge_error_code(data):
            self._doc.error_code = data
            return
        # synthetic y coordinates keep the extracted pieces in order
        current_y = 5
        for di, d in enumerate(data):
            data_type, value = d
            bbox = [0, current_y, 20, current_y+10]
            current_y += 20
            if data_type == 'text':
                _sen = _Sentence(value, bbox)
                _sen.combine = False
                self._page.add_child(_sen)
            elif data_type == 'img':
                with open(value, "rb") as f:
                    img = f.read()
                _img = _Image(img, value, bbox)
                _img.is_from_docx = True
                self._page.add_child(_img)
            elif data_type == 'table':
                _table = _Table(value, bbox)
                _table.is_html = True
                self._page.add_child(_table)
        self._doc.add_child(self._page)

    def get_html(self):
        """Run the conversion and return the HTML (or an error code list)."""
        if self._doc.error_code is not None:
            return self._doc.error_code
        try:
            self.convert()
        except:
            traceback.print_exc()
            self._doc.error_code = [-1]
        # direct conversion failed: try the tika fallback once
        if self._doc.error_code is not None:
            try:
                self.use_tika(self.path)
                self._doc.error_code = None
            except:
                traceback.print_exc()
                log('docx tika failed too')
                self._doc.error_code = [-17]
        return self._doc.get_html()
class DocxConvertNew:
    """Stand-alone docx-to-HTML converter (minidom based).

    NOTE(review): appears to be an experimental alternative to DocxConvert;
    it is not referenced by the __main__ block below.
    """
    # unzip the .docx archive
    def unzip_docx(self, file_path, extract_to):
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)

    # parse numbering.xml and collect list-numbering info per numbering id
    def parse_numbering(self, file_path):
        numbering = defaultdict(list)
        dom = xml.dom.minidom.parse(file_path)
        root = dom.documentElement
        for num in root.getElementsByTagName("w:num"):
            num_id = num.getAttribute("w:numId")
            # NOTE(review): w:lvl nodes normally live under w:abstractNum, not
            # w:num, so this list may stay empty for most documents — verify.
            for lvl in num.getElementsByTagName("w:lvl"):
                lvl_index = lvl.getAttribute("w:ilvl")
                num_fmt = lvl.getElementsByTagName("w:numFmt")[0].getAttribute("w:val")
                num_text = lvl.getElementsByTagName("w:numText")[0].getAttribute("w:val") if lvl.getElementsByTagName("w:numText") else None
                numbering[num_id].append((lvl_index, num_fmt, num_text))
        return numbering

    # parse document.xml.rels and collect relationship (image) references
    def parse_rels(self, file_path):
        rels = {}
        dom = xml.dom.minidom.parse(file_path)
        root = dom.documentElement
        for rel in root.getElementsByTagName("Relationship"):
            rel_id = rel.getAttribute("Id")
            rel_type = rel.getAttribute("Type")
            target = rel.getAttribute("Target")
            rels[rel_id] = {"type": rel_type, "target": target}
        return rels

    # parse document.xml and build the content list (paragraphs, tables, images)
    def parse_document(self, file_path, numbering, rels):
        dom = xml.dom.minidom.parse(file_path)
        root = dom.documentElement
        paragraphs = root.getElementsByTagName("w:p")
        content = []
        for para in paragraphs:
            para_text = ""
            num_id = None
            ilvl = None
            for child in para.childNodes:
                # paragraph properties: remember numbering group and level
                if child.nodeName == "w:pPr":
                    for num_id_node in child.getElementsByTagName("w:numId"):
                        num_id = num_id_node.getAttribute("w:val")
                    for ilvl_node in child.getElementsByTagName("w:ilvl"):
                        ilvl = ilvl_node.getAttribute("w:ilvl")
                # text runs
                elif child.nodeName == "w:r":
                    for t in child.getElementsByTagName("w:t"):
                        para_text += t.firstChild.nodeValue if t.firstChild else ""
            if num_id and ilvl not in [None, '']:
                # NOTE(review): indexes numbering[num_id] by level position —
                # assumes levels were appended in ilvl order; verify.
                num_fmt, num_text = numbering[num_id][int(ilvl)][1:]
                if num_fmt == "decimal":
                    para_text = f"{int(ilvl) + 1}. {para_text}"
                elif num_text:
                    para_text = f"{num_text} {para_text}"
            content.append(para_text)
        # parse tables
        tables = root.getElementsByTagName("w:tbl")
        for table in tables:
            table_content = []
            row_count = 0
            col_count = 0
            for row in table.getElementsByTagName("w:tr"):
                row_content = []
                cell_count = 0
                for cell in row.getElementsByTagName("w:tc"):
                    cell_text = ""
                    for para in cell.getElementsByTagName("w:p"):
                        for run in para.getElementsByTagName("w:r"):
                            for text in run.getElementsByTagName("w:t"):
                                cell_text += text.firstChild.nodeValue if text.firstChild else ""
                    # check for merged cells
                    grid_span = 1
                    v_merge = False
                    for child in cell.childNodes:
                        if child.nodeName == "w:tcPr":
                            for grid_span_node in child.getElementsByTagName("w:gridSpan"):
                                grid_span = int(grid_span_node.getAttribute("w:val"))
                            for v_merge_node in child.getElementsByTagName("w:vMerge"):
                                v_merge = True
                    row_content.append({
                        "text": cell_text,
                        "colspan": grid_span,
                        # simplification: real rowspan depends on surrounding rows
                        "rowspan": 1 if not v_merge else 2
                    })
                    cell_count += grid_span
                table_content.append(row_content)
                row_count += 1
                col_count = max(col_count, cell_count)
            content.append(table_content)
        # record image references
        for rel in rels.values():
            if rel["type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image":
                content.append(f"图片: {rel['target']}")
        return content

    # render the collected content as an HTML page
    def generate_html(self, content):
        html = []
        html.append('<!DOCTYPE HTML><head><meta charset="UTF-8"></head><html><body>')
        for item in content:
            if isinstance(item, list):  # table content
                html.append("<table border='1'>")
                for row in item:
                    html.append("<tr>")
                    for cell in row:
                        colspan = cell.get("colspan", 1)
                        rowspan = cell.get("rowspan", 1)
                        html.append(f"<td colspan='{colspan}' rowspan='{rowspan}'>{cell['text']}</td>")
                    html.append("</tr>")
                html.append("</table>")
            else:  # plain text or an image reference
                html.append(f"<p>{item}</p>")
        html.append("</body></html>")
        return "\n".join(html)

    # entry point: unzip, parse all parts and write the HTML result
    def read_docx(self, file_path):
        extract_to = "extracted_docx"
        self.unzip_docx(file_path, extract_to)
        numbering = self.parse_numbering(os.path.join(extract_to, "word", "numbering.xml"))
        rels = self.parse_rels(os.path.join(extract_to, "word", "_rels", "document.xml.rels"))
        content = self.parse_document(os.path.join(extract_to, "word", "document.xml"), numbering, rels)
        html_output = self.generate_html(content)
        with open("../result.html", "w", encoding="utf-8") as f:
            f.write(html_output)
if __name__ == '__main__':
    # manual smoke test: convert one local file and dump the HTML result
    _p = r'C:/Users/Administrator/Downloads/1723004790329.docx'
    # _p = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
    c = DocxConvert(_p, save_dir)
    _html = c.get_html()
    with open('../result.html', 'w', encoding='utf-8') as f:
        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + str(_html[0]))