123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415 |
- import os
- import sys
- sys.path.append(os.path.dirname(__file__) + "/../")
- from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
- import logging
- import re
- import traceback
- import xml
- import zipfile
- import docx
- import timeout_decorator
- from format_convert import get_memory_info
- from format_convert.convert_image import picture2text
- from format_convert.utils import judge_error_code, add_div
@get_memory_info.memory_decorator
def docx2text(path, unique_type_dir):
    """Convert a .docx file into one HTML-like text string.

    Paragraph text, table HTML and the recognised text of embedded images are
    collected separately and then interleaved following the element order
    that ``read_xml_order`` reports for ``word/document.xml``.

    Args:
        path: Path of the .docx file.
        unique_type_dir: Working-directory prefix. It is concatenated
            directly with file names, so it presumably ends with a path
            separator (verify against callers).

    Returns:
        ``[text]`` on success, otherwise a one-element error-code list:
        ``[-3]`` when the file cannot be opened, ``[-4]`` on timeout,
        ``[-1]`` on an unexpected failure, or the error list returned by
        ``read_xml_table`` / ``read_xml_order`` / ``picture2text``.
    """
    logging.info("into docx2text")
    try:
        try:
            doc = docx.Document(path)
        except Exception as e:
            print("docx format error!", e)
            traceback.print_exc()
            logging.info("docx format error!")
            return [-3]

        # Collect the non-empty paragraphs, each wrapped in a <div>.
        paragraph_text_list = [
            "<div>" + paragraph.text + "</div>" + "\n"
            for paragraph in doc.paragraphs
            if paragraph.text != ""
        ]

        # Collect the tables as HTML.
        try:
            table_text_list = read_xml_table(path, unique_type_dir)
        except TimeoutError:
            return [-4]
        if judge_error_code(table_text_list):
            return table_text_list

        # Walk the runs in document order and recognise every embedded image.
        image_text_list = []
        temp_image_path = unique_type_dir + "temp_image.png"
        pattern = re.compile(r'rId\d+')
        for graph in doc.paragraphs:
            for run in graph.runs:
                if run.text != '':
                    continue
                try:
                    match = pattern.search(run.element.xml)
                    if not match:
                        continue
                    content_id = match.group(0)
                    related_part = doc.part.related_parts[content_id]
                    content_type = related_part.content_type
                except Exception as e:
                    print("docx no image!", e)
                    continue
                if not content_type.startswith('image'):
                    continue

                img_data = related_part.blob
                # Check before writing: writing None would raise.
                if img_data is None:
                    continue

                # Dump the image into a temporary file for recognition.
                with open(temp_image_path, 'wb') as f:
                    f.write(img_data)

                image_text = picture2text(temp_image_path)
                if image_text == [-3]:
                    # This image could not be handled; skip it.
                    continue
                if image_text == [-2] or image_text == [-1]:
                    return image_text
                image_text_list.append(add_div(image_text[0]))

        # Parse document.xml to learn the order of the body elements.
        # read_xml_order returns [order_list, text_list] on success, so the
        # pair has to be unpacked before the tags can be iterated.
        order_result = read_xml_order(path, unique_type_dir)
        if judge_error_code(order_result):
            return order_result
        order_list, _ = order_result

        # Each tag consumes the next pending item of its kind; once a queue
        # is exhausted the tag contributes nothing.
        pending = {
            "w:t": iter(paragraph_text_list),
            "wp:docPr": iter(image_text_list),
            "w:tbl": iter(table_text_list),
        }
        parts = []
        for tag in order_list:
            queue = pending.get(tag)
            if queue is not None:
                parts.append(next(queue, ""))
        return ["".join(parts)]
    except Exception:
        logging.info("docx2text error!")
        traceback.print_exc()
        return [-1]
@get_memory_info.memory_decorator
def read_xml_order(path, save_path):
    """Read ``word/document.xml`` and report the order of the body elements.

    The document part is extracted from the .docx archive into ``save_path``
    and parsed. Every direct child of ``w:body`` is inspected:

    * a ``w:p`` paragraph that contains ``w:t`` nodes adds ``"w:t"`` to the
      order list and its concatenated text to the text list;
    * a paragraph that contains ``wp:docPr`` adds ``"wp:docPr"``, and every
      direct ``w:r`` child holding a ``w:pict`` adds another ``"wp:docPr"``;
    * a ``w:tbl`` table adds ``"w:tbl"``.

    Args:
        path: Path of the .docx file.
        save_path: Directory prefix the XML is extracted into. It is
            concatenated directly with ``"word/document.xml"``.

    Returns:
        ``[order_list, text_list]`` on success, ``[-3]`` when the archive
        cannot be read, ``[-4]`` when parsing times out, ``[-1]`` on any
        other failure.
    """
    logging.info("into read_xml_order")
    try:
        try:
            # The context manager closes the archive even if extract() fails.
            with zipfile.ZipFile(path) as archive:
                for member in archive.namelist():
                    if member == "word/document.xml":
                        archive.extract(member, save_path)
        except Exception:
            logging.info("docx format error!")
            return [-3]

        try:
            collection = xml_analyze(save_path + "word/document.xml")
        except TimeoutError:
            logging.info("read_xml_order timeout")
            return [-4]

        body = collection.getElementsByTagName("w:body")[0]
        order_list = []
        text_list = []
        for node in body.childNodes:
            # Compare the node name exactly instead of searching str(node),
            # so text nodes and similarly named elements are never matched.
            if node.nodeName == "w:p":
                text_nodes = node.getElementsByTagName("w:t")
                if text_nodes:
                    order_list.append("w:t")
                    text_list.append("".join(
                        t.childNodes[0].nodeValue
                        for t in text_nodes
                        if len(t.childNodes) > 0
                    ))

                if node.getElementsByTagName("wp:docPr"):
                    order_list.append("wp:docPr")

                # w:pict pictures are counted once per run that holds one.
                for child in node.childNodes:
                    if child.nodeName == "w:r" and child.getElementsByTagName("w:pict"):
                        order_list.append("wp:docPr")
            elif node.nodeName == "w:tbl":
                # Only the position is recorded here. The former call to
                # read_xml_table() re-parsed the whole document and threw
                # the result away, so it has been dropped.
                order_list.append("w:tbl")
        return [order_list, text_list]
    except Exception:
        logging.info("read_xml_order error!")
        traceback.print_exc()
        return [-1]
@get_memory_info.memory_decorator
def read_xml_table(path, save_path):
    """Render every top-level table of a .docx file as an HTML string.

    ``word/document.xml`` is extracted from the archive into ``save_path``
    and parsed. Each ``w:tbl`` that is a direct child of ``w:body`` becomes
    one ``<table border="1">`` string; each ``w:tc`` becomes a ``<td>`` whose
    ``colspan`` is taken from ``w:gridSpan`` (default 1). A cell that
    continues a vertical merge starts with the text of the cell above it.

    Args:
        path: Path of the .docx file.
        save_path: Directory prefix the XML is extracted into. It is
            concatenated directly with ``"word/document.xml"``.

    Returns:
        A list with one HTML string per table, ``[-3]`` when the archive
        cannot be read, ``[-4]`` when parsing times out, ``[-1]`` on any
        other failure.
    """
    logging.info("into read_xml_table")
    try:
        try:
            # The context manager closes the archive even if extract() fails.
            with zipfile.ZipFile(path) as archive:
                for member in archive.namelist():
                    if member == "word/document.xml":
                        archive.extract(member, save_path)
        except Exception:
            logging.info("docx format error!")
            return [-3]

        try:
            collection = xml_analyze(save_path + "word/document.xml")
        except TimeoutError:
            logging.info("read_xml_table timeout")
            return [-4]

        body = collection.getElementsByTagName("w:body")[0]
        table_text_list = []
        for node in body.childNodes:
            # Exact name comparison; text nodes have nodeName "#text".
            if node.nodeName != "w:tbl":
                continue

            html = ['<table border="1">' + "\n"]
            # Rows already rendered; each row is a list of [text, col_span].
            finished_rows = []
            for tr in node.getElementsByTagName("w:tr"):
                html.append("<tr rowspan=1>" + "\n")
                row_cells = []
                for tc_index, tc in enumerate(tr.getElementsByTagName("w:tc")):
                    tc_text = ""

                    # How many grid columns the cell spans.
                    grid_span = tc.getElementsByTagName("w:gridSpan")
                    col_span = int(grid_span[0].getAttribute("w:val")) if grid_span else 1

                    # A continuation cell of a vertical merge inherits the
                    # text of the cell above. In OOXML a w:vMerge without
                    # w:val means "continue", and getAttribute() returns ""
                    # for a missing attribute, so both spellings count.
                    v_merge = tc.getElementsByTagName("w:vMerge")
                    if v_merge and finished_rows:
                        if v_merge[0].getAttribute("w:val") in ("", "continue"):
                            tc_text = _merged_cell_text(finished_rows[-1], tc_index)

                    html.append("<td colspan=" + str(col_span) + ">" + "\n")

                    # Append the cell's own text, one line per paragraph
                    # that has at least one w:t node.
                    for p in tc.getElementsByTagName("w:p"):
                        text_nodes = p.getElementsByTagName("w:t")
                        if text_nodes:
                            for t in text_nodes:
                                if len(t.childNodes) > 0:
                                    tc_text += t.childNodes[0].nodeValue
                            tc_text += "\n"

                    html.append(tc_text + "</td>" + "\n")
                    row_cells.append([tc_text, col_span])
                html.append("</tr>" + "\n")
                finished_rows.append(row_cells)
            html.append("</table>" + "\n")
            table_text_list.append("".join(html))
        return table_text_list
    except Exception:
        logging.info("read_xml_table error")
        traceback.print_exc()
        return [-1]


def _merged_cell_text(previous_row, tc_index):
    """Return the text of the cell in ``previous_row`` above cell ``tc_index``.

    ``previous_row`` is a list of ``[text, col_span]`` pairs. Cells are
    skipped while the columns they cover stay below ``tc_index``; an empty
    string is returned when no cell of the previous row lines up.
    """
    covered_columns = 0
    cell_position = 0
    for _, span in previous_row:
        if covered_columns < tc_index:
            covered_columns += span
            cell_position += 1
    if cell_position < len(previous_row):
        return previous_row[cell_position][0]
    return ""
@get_memory_info.memory_decorator
@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
def xml_analyze(path):
    """Parse the XML file at ``path`` and return its root element.

    The timeout decorator is configured with 300 and raises ``TimeoutError``
    when parsing takes longer.

    Args:
        path: Path of the XML file to parse.

    Returns:
        The ``documentElement`` of the parsed minidom document.
    """
    # A plain ``import xml`` does not load the ``xml.dom.minidom`` submodule,
    # so import it explicitly here rather than relying on another module
    # having loaded it first.
    import xml.dom.minidom

    dom_tree = xml.dom.minidom.parse(path)
    return dom_tree.documentElement
def read_docx_table(document):
    """Render every table of ``document`` as a plain HTML ``<table>`` string.

    Args:
        document: Object exposing ``tables``; each table exposes ``rows``,
            each row ``cells`` and each cell ``text`` (for example a document
            opened with ``docx.Document``).

    Returns:
        A list with one HTML string per table, in document order.
    """
    rendered_tables = []
    for table in document.tables:
        pieces = ["<table>\n"]
        for row in table.rows:
            pieces.append("<tr>\n")
            pieces.extend("<td>" + cell.text + "</td>\n" for cell in row.cells)
            pieces.append("</tr>\n")
        pieces.append("</table>\n")
        rendered_tables.append("".join(pieces))
    return rendered_tables
class DocxConvert:
    """Convert one .docx file into the project's document tree / HTML.

    Failures are reported through ``self._doc.error_code`` (a one-element
    list such as ``[-3]`` or ``[-1]``) rather than through exceptions.
    """

    def __init__(self, path, unique_type_dir):
        """Remember the input path and working directory.

        Args:
            path: Path of the .docx file.
            unique_type_dir: Working-directory prefix. It is concatenated
                directly with file names.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    def init_package(self):
        """Open the file with python-docx and as a zip archive.

        Sets ``self._doc.error_code`` to ``[-3]`` when either open fails.
        """
        try:
            self.docx = docx.Document(self.path)
            self.zip = zipfile.ZipFile(self.path)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must pass.
            logging.info("cannot open docx!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Build a single ``_Page`` holding sentences, images and tables.

        Items are added in the order reported by ``get_orders``; the running
        position in that order is stored as the ``y`` value of each bbox.
        On error ``self._doc.error_code`` is set and nothing is added.
        """
        self.init_package()
        if self._doc.error_code is not None:
            return

        order_and_text_list = self.get_orders()
        if judge_error_code(order_and_text_list):
            self._doc.error_code = order_and_text_list
            return
        order_list, text_list = order_and_text_list
        print("doc ", text_list[:10])

        table_list = self.get_tables()
        if judge_error_code(table_list):
            self._doc.error_code = table_list
            return

        image_list = self.get_images()
        temp_image_path = self.unique_type_dir + "temp_image.png"

        self._page = _Page(None, 0)

        # Each tag consumes the next pending item of its kind. The sentinel
        # tells an exhausted queue apart from a legitimately empty item.
        exhausted = object()
        texts = iter(text_list)
        images = iter(image_list)
        tables = iter(table_list)
        for order_y, tag in enumerate(order_list):
            bbox = (0, order_y, 0, 0)
            if tag == "w:t":
                _para = next(texts, exhausted)
                if _para is not exhausted:
                    self._page.add_child(_Sentence(_para, bbox))
            elif tag == "wp:docPr":
                _image = next(images, exhausted)
                if _image is not exhausted:
                    self._page.add_child(_Image(_image, temp_image_path, bbox))
            elif tag == "w:tbl":
                _table = next(tables, exhausted)
                if _table is not exhausted:
                    _table = _Table(_table, bbox)
                    _table.is_html = True
                    self._page.add_child(_table)

        if self._doc.error_code is None and self._page.error_code is not None:
            self._doc.error_code = self._page.error_code

        self._doc.add_child(self._page)

    def get_paragraphs(self):
        """Return the text of every non-empty paragraph, in document order."""
        return [
            paragraph.text
            for paragraph in self.docx.paragraphs
            if paragraph.text != ""
        ]

    def get_tables(self):
        """Return the table HTML list (or error list) from ``read_xml_table``."""
        return read_xml_table(self.path, self.unique_type_dir)

    def get_images(self):
        """Return the raw bytes of every embedded image, in run order.

        Only runs with empty text are inspected. The first ``rId<digits>``
        found in the run's XML is looked up in the document's related parts,
        and the part is kept when its content type starts with ``image``.
        """
        image_list = []
        pattern = re.compile(r'rId\d+')
        for graph in self.docx.paragraphs:
            for run in graph.runs:
                if run.text != '':
                    continue
                try:
                    match = pattern.search(run.element.xml)
                    if not match:
                        continue
                    related_part = self.docx.part.related_parts[match.group(0)]
                    content_type = related_part.content_type
                except Exception as e:
                    # The id does not resolve to a related part.
                    print("docx no image!", e)
                    continue
                if not content_type.startswith('image'):
                    continue

                img_data = related_part.blob
                if img_data is not None:
                    image_list.append(img_data)
        return image_list

    def get_orders(self):
        """Return ``[order_list, text_list]`` (or an error list) from ``read_xml_order``."""
        return read_xml_order(self.path, self.unique_type_dir)

    def get_doc_object(self):
        """Return the underlying ``_Document`` instance."""
        return self._doc

    def get_html(self):
        """Run the conversion and return the HTML, or the error-code list."""
        try:
            self.convert()
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must pass.
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()
|