# fangjiasheng / FORMAT_CONVERSION_MAXCOMPUTE
							import os
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
import logging
import re
import traceback
import xml
import zipfile
import docx
import timeout_decorator
from format_convert import get_memory_info
from format_convert.convert_image import picture2text
from format_convert.utils import judge_error_code, add_div


@get_memory_info.memory_decorator
def docx2text(path, unique_type_dir):
    """Convert a .docx file into a single HTML-like text string.

    Paragraph text, table markup and text recognised from embedded images are
    collected separately and then interleaved following the element order
    reported by ``read_xml_order``.

    Args:
        path: Path of the .docx file to convert.
        unique_type_dir: Directory prefix for temporary files. It is joined to
            file names by plain string concatenation, so it must already end
            with a path separator.

    Returns:
        ``[text]`` on success. On failure an error list is returned instead:
        ``[-3]`` when the file cannot be opened as a docx document, ``[-4]``
        when table extraction times out, ``[-1]`` on any unexpected error,
        and any error list produced by ``read_xml_table``, ``read_xml_order``
        or ``picture2text`` (``[-1]`` / ``[-2]``) is passed through.
    """
    logging.info("into docx2text")
    try:
        try:
            doc = docx.Document(path)
        except Exception as e:
            print("docx format error!", e)
            traceback.print_exc()
            logging.info("docx format error!")
            return [-3]

        # Walk the paragraphs; empty ones are skipped.
        paragraph_text_list = [
            "<div>" + paragraph.text + "</div>" + "\n"
            for paragraph in doc.paragraphs
            if paragraph.text != ""
        ]

        # Walk the tables.
        try:
            table_text_list = read_xml_table(path, unique_type_dir)
        except TimeoutError:
            return [-4]

        if judge_error_code(table_text_list):
            return table_text_list

        # Walk the images in document order.
        image_text_list = []
        temp_image_path = unique_type_dir + "temp_image.png"
        pattern = re.compile(r'rId\d+')
        for graph in doc.paragraphs:
            for run in graph.runs:
                # Only runs without text are inspected for an image reference.
                if run.text != '':
                    continue
                try:
                    match = pattern.search(run.element.xml)
                    if not match:
                        continue
                    image_part = doc.part.related_parts[match.group(0)]
                    content_type = image_part.content_type
                except Exception as e:
                    print("docx no image!", e)
                    continue
                if not content_type.startswith('image'):
                    continue

                # Check for missing data *before* writing: writing None would
                # raise and abort the whole conversion with [-1].
                img_data = image_part.blob
                if img_data is None:
                    continue

                # Write the image to a temporary file.
                with open(temp_image_path, 'wb') as f:
                    f.write(img_data)

                # Recognise the text inside the image.
                image_text = picture2text(temp_image_path)
                if image_text == [-2]:
                    return [-2]
                if image_text == [-1]:
                    return [-1]
                if image_text == [-3]:
                    continue

                image_text_list.append(add_div(image_text[0]))

        # Parse document.xml to learn the order of the content.
        order_list = read_xml_order(path, unique_type_dir)
        # Same error check as for the table result above, so that [-3] / [-4]
        # are propagated instead of silently producing an empty text.
        if judge_error_code(order_list):
            return order_list

        # Consume each source front-to-back as its tag shows up in the order
        # list; a tag whose source is already exhausted contributes nothing.
        sources = {
            "w:t": iter(paragraph_text_list),
            "wp:docPr": iter(image_text_list),
            "w:tbl": iter(table_text_list),
        }
        parts = []
        for tag in order_list:
            source = sources.get(tag)
            if source is not None:
                parts.append(next(source, ""))
        return ["".join(parts)]
    except Exception as e:
        logging.info("docx2text error!")
        print("docx2text", e)
        traceback.print_exc()
        return [-1]


@get_memory_info.memory_decorator
def read_xml_order(path, save_path):
    """Return the order of text, image and table content in a .docx body.

    ``word/document.xml`` is extracted from the archive into *save_path*,
    parsed, and the direct children of ``w:body`` are scanned in order.

    Args:
        path: Path of the .docx (zip) file.
        save_path: Directory prefix the XML is extracted to. It is joined to
            ``"word/document.xml"`` by plain string concatenation, so it must
            already end with a path separator.

    Returns:
        A list of tags, one entry per piece of content found:
        ``"w:t"`` for a paragraph containing text, ``"wp:docPr"`` for a
        paragraph containing a drawing and once more for every direct run
        containing a ``w:pict``, and ``"w:tbl"`` for a table.
        On failure an error list is returned instead: ``[-3]`` when the
        archive cannot be read, ``[-4]`` when XML parsing times out and
        ``[-1]`` on any other error.
    """
    logging.info("into read_xml_order")
    try:
        try:
            # The context manager closes the archive even if extraction fails.
            with zipfile.ZipFile(path) as archive:
                if "word/document.xml" in archive.namelist():
                    archive.extract("word/document.xml", save_path)
        except Exception:
            logging.info("docx format error!")
            return [-3]

        try:
            collection = xml_analyze(save_path + "word/document.xml")
        except TimeoutError:
            logging.info("read_xml_order timeout")
            return [-4]

        body = collection.getElementsByTagName("w:body")[0]
        order_list = []
        for node in body.childNodes:
            # Compare the tag name exactly; a substring test on the node's
            # repr would also accept tags that merely start with "w:p".
            if node.nodeName == "w:p":
                if node.getElementsByTagName("w:t"):
                    order_list.append("w:t")
                if node.getElementsByTagName("wp:docPr"):
                    order_list.append("wp:docPr")

                # Runs holding a w:pict picture count as images as well.
                for child in node.childNodes:
                    if child.nodeName == "w:r" and child.getElementsByTagName("w:pict"):
                        order_list.append("wp:docPr")
            elif node.nodeName == "w:tbl":
                order_list.append("w:tbl")
        return order_list
    except Exception as e:
        logging.info("read_xml_order error!")
        print("read_xml_order", e)
        traceback.print_exc()
        return [-1]


@get_memory_info.memory_decorator
def read_xml_table(path, save_path):
    """Render every top-level table of a .docx file as an HTML table string.

    ``word/document.xml`` is extracted from the archive into *save_path*,
    parsed, and each ``w:tbl`` that is a direct child of ``w:body`` is turned
    into ``<table border="1">`` markup with one ``<tr rowspan=1>`` per row and
    one ``<td colspan=N>`` per cell.

    Args:
        path: Path of the .docx (zip) file.
        save_path: Directory prefix the XML is extracted to. It is joined to
            ``"word/document.xml"`` by plain string concatenation, so it must
            already end with a path separator.

    Returns:
        A list with one markup string per table, in document order.
        On failure an error list is returned instead: ``[-3]`` when the
        archive cannot be read, ``[-4]`` when XML parsing times out and
        ``[-1]`` on any other error.
    """
    logging.info("into read_xml_table")
    try:
        try:
            # The context manager closes the archive even if extraction fails.
            with zipfile.ZipFile(path) as archive:
                if "word/document.xml" in archive.namelist():
                    archive.extract("word/document.xml", save_path)
        except Exception:
            logging.info("docx format error!")
            return [-3]

        try:
            collection = xml_analyze(save_path + "word/document.xml")
        except TimeoutError:
            logging.info("read_xml_table timeout")
            return [-4]

        body = collection.getElementsByTagName("w:body")[0]
        table_text_list = []
        for line in body.childNodes:
            # Compare the tag name exactly rather than searching the repr.
            if line.nodeName != "w:tbl":
                continue

            table_text = '<table border="1">' + "\n"
            # Rows rendered so far for this table; each row is a list of
            # [cell_text, col_span] pairs.
            tr_text_list = []
            # NOTE: getElementsByTagName searches all descendants, so rows and
            # cells of nested tables are picked up here too.
            for tr in line.getElementsByTagName("w:tr"):
                table_text = table_text + "<tr rowspan=1>" + "\n"
                tc_text_list = []
                for tc_index, tc in enumerate(tr.getElementsByTagName("w:tc")):
                    tc_text = ""

                    # How many columns this cell spans.
                    col_span = tc.getElementsByTagName("w:gridSpan")
                    if col_span:
                        col_span = int(col_span[0].getAttribute("w:val"))
                    else:
                        col_span = 1

                    # Is this the empty continuation cell of a vertical merge?
                    # If so, start from the text of the matching cell in the
                    # previous row.
                    # NOTE(review): only an explicit w:val="continue" is
                    # recognised; a w:vMerge element without w:val is ignored
                    # here - confirm whether that case should count as well.
                    is_merge = tc.getElementsByTagName("w:vMerge")
                    if (is_merge
                            and is_merge[0].getAttribute("w:val") == "continue"
                            and tr_text_list):
                        previous_row = tr_text_list[-1]
                        col_span_index = 0
                        real_tc_index = 0
                        # Advance through the previous row, accumulating its
                        # column spans, until the running total reaches the
                        # current cell index.
                        for tc_colspan in previous_row:
                            if col_span_index < tc_index:
                                col_span_index += tc_colspan[1]
                                real_tc_index += 1

                        if real_tc_index < len(previous_row):
                            tc_text = previous_row[real_tc_index][0]

                    table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"

                    # Append the text of every paragraph in the cell, one line
                    # per paragraph that has text elements.
                    for p in tc.getElementsByTagName("w:p"):
                        t = p.getElementsByTagName("w:t")
                        if t:
                            for tt in t:
                                if len(tt.childNodes) > 0:
                                    tc_text += tt.childNodes[0].nodeValue
                            tc_text += "\n"

                    table_text = table_text + tc_text + "</td>" + "\n"
                    tc_text_list.append([tc_text, col_span])
                table_text += "</tr>" + "\n"
                tr_text_list.append(tc_text_list)
            table_text += "</table>" + "\n"
            table_text_list.append(table_text)
        return table_text_list

    except Exception as e:
        logging.info("read_xml_table error")
        print("read_xml_table", e)
        traceback.print_exc()
        return [-1]


@get_memory_info.memory_decorator
@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
def xml_analyze(path):
    """Parse the XML file at *path* and return its root element.

    The call is wrapped by ``timeout_decorator.timeout(300, ...)``, which is
    configured to raise ``TimeoutError`` when the limit is exceeded.

    Args:
        path: Path of the XML file to parse.

    Returns:
        The ``documentElement`` of the parsed DOM tree.
    """
    # A plain "import xml" does not load the xml.dom.minidom submodule, so
    # import it explicitly here instead of relying on some other module
    # having imported it first.
    import xml.dom.minidom

    # Parse the XML.
    dom_tree = xml.dom.minidom.parse(path)
    return dom_tree.documentElement


def read_docx_table(document):
    """Render each table of *document* as a plain HTML ``<table>`` string.

    Args:
        document: Object exposing ``tables``; every table exposes ``rows``,
            every row exposes ``cells`` and every cell exposes ``text``.

    Returns:
        A list with one markup string per table, in iteration order. Each
        row becomes a ``<tr>`` and each cell a ``<td>``, every tag followed
        by a newline.
    """
    rendered_tables = []
    for tbl in document.tables:
        pieces = ["<table>\n"]
        for row in tbl.rows:
            pieces.append("<tr>\n")
            pieces.extend("<td>" + cell.text + "</td>\n" for cell in row.cells)
            pieces.append("</tr>\n")
        pieces.append("</table>\n")
        rendered_tables.append("".join(pieces))
    return rendered_tables