# fangjiasheng / FORMAT_CONVERSION_MAXCOMPUTE
import os
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
import difflib
import logging
import mimetypes
import platform
import re
import traceback
import filetype
from bs4 import BeautifulSoup


def judge_error_code(_list, code=(-1, -2, -3, -4, -5, -7)):
    """Tell whether ``_list`` is a single-element error marker such as ``[-1]``.

    Args:
        _list: Value to inspect; it is compared by equality against ``[c]``.
        code: Iterable of error codes to recognise.

    Returns:
        True if ``_list == [c]`` for some ``c`` in ``code``, otherwise False.
    """
    # The default is an immutable tuple so it can never be mutated between calls.
    return any(_list == [c] for c in code)


def add_div(text):
    """Wrap every line of ``text`` in ``<div>...</div>``.

    Empty strings, ``None`` and text that already contains a ``<div>`` tag
    are returned unchanged. On Windows the input is echoed to stdout.
    """
    if text is None or text == "":
        return text

    if get_platform() == "Windows":
        print("add_div", text)
    if re.search("<div>", text):
        return text

    wrapped = ("<div>" + text + "\n").replace("\n", "</div>\n<div>")
    # The newline appended above leaves a dangling opening tag at the end;
    # report it and cut it off.
    if wrapped.endswith("<div>"):
        print("add_div has cut", wrapped[-30:])
        wrapped = wrapped[:-5]
    return wrapped


def get_platform():
    """Return the operating-system name reported by ``platform.system()``."""
    return platform.system()


def get_html_p(html_path):
    """Concatenate the text of every ``<p>`` element of an HTML file.

    Args:
        html_path: Path of the HTML file to read.

    Returns:
        The stripped text of all ``<p>`` tags joined without a separator and
        followed by one newline, or ``[-1]`` if reading or parsing fails.
    """
    logging.info("into get_html_p")
    try:
        # NOTE(review): the file is read with the platform default encoding;
        # confirm that this matches the encoding of the HTML being parsed.
        with open(html_path, "r") as ff:
            html_str = ff.read()

        soup = BeautifulSoup(html_str, 'lxml')
        # Paragraphs without text contribute an empty string, so no explicit
        # emptiness check is needed.
        paragraphs = (p.text.strip() for p in soup.find_all("p"))
        return "".join(paragraphs) + "\n"
    except Exception:
        logging.info("get_html_p error!")
        # format_exc() returns the traceback text. The previous
        # print(..., traceback.print_exc()) printed "None" because
        # print_exc() has no return value.
        print("get_html_p", traceback.format_exc())
        return [-1]


def string_similarity(str1, str2):
    """Return the difflib similarity ratio of two texts.

    ``<div>`` tags, ``</div>`` tags and newlines are removed from both inputs
    before they are compared. The score is also printed to stdout.
    """
    def _strip_markup(value):
        # Removal order matters for odd inputs, so keep it: opening tags,
        # closing tags, then newlines.
        for token in ("<div>", "</div>", "\n"):
            value = re.sub(token, "", value)
        return value

    matcher = difflib.SequenceMatcher(None, _strip_markup(str1), _strip_markup(str2))
    ratio = matcher.ratio()
    print("string_similarity", ratio)
    return ratio


def get_sequential_data(text_list, bbox_list, html=False):
    """Join recognised text fragments in reading order using their boxes.

    For fragment ``i`` the x range is taken from ``bbox_list[i][0][0]`` and
    ``bbox_list[i][1][0]`` and the y range from ``bbox_list[i][0][1]`` and
    ``bbox_list[i][-1][1]``. Fragments whose vertical centres lie within 5
    units of a row's first fragment are placed on the same row; rows are
    emitted top to bottom and fragments within a row left to right.

    Args:
        text_list: Text fragments.
        bbox_list: Boxes, one per entry of ``text_list`` (same index).
        html: When True, wrap the result in ``<div>`` elements.

    Returns:
        The assembled text. A single-fragment row becomes ``text + "\\n"``; a
        multi-fragment row has a space after every fragment and then a
        newline. Returns ``""`` when there are no fragments and ``[-1]`` if
        any exception occurs.
    """
    logging.info("into get_sequential_data")
    try:
        # Each entry: [text, x_start, x_end, y_start, y_end]
        order_list = []
        for i, fragment in enumerate(text_list):
            bbox = bbox_list[i]
            order_list.append([fragment, bbox[0][0], bbox[1][0], bbox[0][1], bbox[-1][1]])

        if get_platform() == "Windows":
            print("get_sequential_data", order_list)
        if not order_list:
            if get_platform() == "Windows":
                print("get_sequential_data", "no order list")
            return ""

        # Order the fragments by top edge, then by left edge.
        order_list.sort(key=lambda x: (x[3], x[1]))

        # Group the fragments into rows by vertical centre.
        # NOTE: membership is tested by equality, so an entry equal to one
        # that was already consumed is skipped.
        threshold = 5
        row_list = []
        used_box = []
        for box in order_list:
            if box in used_box:
                continue

            height_center = (box[4] + box[3]) / 2
            row = []
            for box2 in order_list:
                if box2 in used_box:
                    continue
                height_center2 = (box2[4] + box2[3]) / 2
                if height_center - threshold <= height_center2 <= height_center + threshold:
                    # Everything in `row` is also in `used_box`, so no extra
                    # duplicate check is required here.
                    row.append(box2)
                    used_box.append(box2)
            # Sorting by text first makes it the tie-breaker for the stable
            # sort by x_start that is applied to multi-fragment rows below.
            row.sort(key=lambda x: x[0])
            row_list.append(row)

        lines = []
        for row in row_list:
            if not row:
                continue
            if len(row) == 1:
                lines.append(row[0][0] + "\n")
            else:
                row.sort(key=lambda x: x[1])
                lines.append("".join(col[0] + " " for col in row) + "\n")
        text = "".join(lines)

        if html:
            # Every line ends with "\n", so this also leaves one empty
            # trailing "<div></div>" pair, as before.
            text = ("<div>" + text).replace("\n", "</div>\n<div>") + "</div>"
        return text

    except Exception:
        logging.info("get_sequential_data error!")
        # format_exc() returns the traceback text. The previous
        # print(..., traceback.print_exc()) printed "None" because
        # print_exc() has no return value.
        print("get_sequential_data", traceback.format_exc())
        return [-1]


def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
    """Build HTML for a page from text boxes, table-cell boxes and split lines.

    The page is cut into areas using the y coordinate of the first point of
    each pair of consecutive entries in ``split_line``. An area without table
    cells is rendered through ``get_sequential_data(..., True)``; an area with
    table cells is rendered as a ``<table border="1">`` whose cells carry
    ``colspan``/``rowspan`` attributes and the text whose box centre falls
    inside the cell.

    Args:
        text_list: Texts; ``text_list[i]`` belongs to ``text_bbox_list[i]``.
        text_bbox_list: Text boxes. Points 0 and 2 of each box are used
            (presumably opposite corners of a 4-point box; verify against
            the caller).
        table_bbox_list: Table-cell boxes; points 0 and 1 of each box are
            compared as min/max corners. This list is sorted in place.
        split_line: Lines whose first point's y coordinate delimits areas.

    Returns:
        ``(text, area_column_list)`` where ``text`` is the concatenated HTML
        of all areas and ``area_column_list`` holds, per area, 0 for a
        text-only area or the largest number of cells in one table row.
        Returns ``([-1], [-1])`` if an exception occurs or if
        ``get_sequential_data`` reports ``[-1]``.
    """
    logging.info("into get_formatted_table")
    try:
        # Rebuild text_bbox_list as [point, point, text]
        text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
                          range(len(text_bbox_list))]
        # Sort by y coordinate (then x)
        text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
        table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))

        # print("text_bbox_list", text_bbox_list)
        # print("table_bbox_list", table_bbox_list)

        # Tolerance used when comparing bbox positions
        threshold = 5

        # Partition into areas by split_line; one area may hold several tables [(), ()]
        area_text_bbox_list = []
        area_table_bbox_list = []
        # print("get_formatted_table, split_line", split_line)
        for j in range(1, len(split_line)):
            last_y = split_line[j - 1][0][1]
            current_y = split_line[j][0][1]
            temp_text_bbox_list = []
            temp_table_bbox_list = []

            # Collect the text bboxes that fall in this area
            for text_bbox in text_bbox_list:
                # Centre point of the text bbox
                text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
                                    (text_bbox[1][1] + text_bbox[0][1]) / 2)
                # The tolerance widens the area, so a text box near a split
                # line can be assigned to both neighbouring areas.
                if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
                    temp_text_bbox_list.append(text_bbox)
            area_text_bbox_list.append(temp_text_bbox_list)

            # Collect the table bboxes that fall in this area
            for table_bbox in table_bbox_list:
                # Centre point of the table bbox
                table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
                                     (table_bbox[1][1] + table_bbox[0][1]) / 2)
                if last_y < table_bbox_center[1] < current_y:
                    temp_table_bbox_list.append(table_bbox)
            area_table_bbox_list.append(temp_table_bbox_list)

        # for j in range(len(area_text_bbox_list)):
        #     print("area_text_bbox_list", j, area_text_bbox_list[j])

        # For every area, match text bboxes against table bboxes and build the output
        area_text_list = []
        area_column_list = []
        for j in range(len(area_text_bbox_list)):
            # Table bboxes and text bboxes of this area
            temp_table_bbox_list = area_table_bbox_list[j]
            temp_text_bbox_list = area_text_bbox_list[j]

            # Check whether the area contains any table bbox.
            # Without a table, simply join the text of the area.
            if not temp_table_bbox_list:
                # Gather all text bboxes of the area
                only_text_list = []
                only_bbox_list = []
                for text_bbox in temp_text_bbox_list:
                    only_text_list.append(text_bbox[2])
                    only_bbox_list.append([text_bbox[0], text_bbox[1]])
                only_text = get_sequential_data(only_text_list, only_bbox_list, True)
                if only_text == [-1]:
                    return [-1], [-1]
                area_text_list.append(only_text)
                area_column_list.append(0)
                continue

            # The area has a table.
            # Map each table cell (keyed by str(table_bbox)) to its text.
            text_in_table = {}
            for i in range(len(temp_text_bbox_list)):
                text_bbox = temp_text_bbox_list[i]

                # Centre point of the text bbox
                text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
                                    (text_bbox[1][1] + text_bbox[0][1]) / 2)

                # Find the table bbox that contains the centre point
                for table_bbox in temp_table_bbox_list:
                    # Centre is inside this table bbox: store the text in the dict
                    if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
                            table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
                        if str(table_bbox) in text_in_table.keys():
                            text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
                        else:
                            text_in_table[str(table_bbox)] = text_bbox[2]
                        break

                    # (disabled) If no table bbox matched the text bbox, retry with a larger threshold
                    # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
                    #         table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
                    #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
                    #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
                    #         (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
                    #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
                    #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
                    #          table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
                    #     if str(table_bbox) in text_in_table.keys():
                    #         text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
                    #     else:
                    #         text_in_table[str(table_bbox)] = text_bbox[2]
                    #     break

            # Split the table cells into rows and columns and count the grid lines.
            # Collect the coordinates.
            all_col_list = []
            all_row_list = []
            for i in range(len(temp_table_bbox_list)):
                table_bbox = temp_table_bbox_list[i]

                # Collect every distinct x coordinate
                if table_bbox[0][0] not in all_col_list:
                    all_col_list.append(table_bbox[0][0])
                if table_bbox[1][0] not in all_col_list:
                    all_col_list.append(table_bbox[1][0])

                # Collect every distinct y coordinate
                if table_bbox[0][1] not in all_row_list:
                    all_row_list.append(table_bbox[0][1])
                if table_bbox[1][1] not in all_row_list:
                    all_row_list.append(table_bbox[1][1])
            all_col_list.sort(key=lambda x: x)
            all_row_list.sort(key=lambda x: x)

            # Split into rows: consecutive cells whose top y stays within
            # `threshold` of the row's first cell belong to the same row.
            row_list = []
            rows = []
            temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
            y_row = temp_table_bbox_list[0][0][1]
            for i in range(len(temp_table_bbox_list)):
                table_bbox = temp_table_bbox_list[i]

                if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
                    rows.append(table_bbox)
                else:
                    y_row = table_bbox[0][1]
                    if rows:
                        rows.sort(key=lambda x: x[0][0])
                        row_list.append(rows)
                    rows = []
                    rows.append(table_bbox)
                # print("*" * 30)
                # print(row_list)

                # Flush the row that is still open after the last cell
                if i == len(temp_table_bbox_list) - 1:
                    if rows:
                        rows.sort(key=lambda x: x[0][0])
                        row_list.append(rows)

            # Generate the table markup, including text and cell spans
            area_column = []
            text = '<table border="1">' + "\n"
            for row in row_list:
                text += "<tr>" + "\n"
                for col in row:
                    # Count the other y coordinates strictly inside the cell
                    # (at least 2 units from both edges); +1 gives the row span.
                    row_span = 1
                    for y in all_row_list:
                        if col[0][1] < y < col[1][1]:
                            if y - col[0][1] >= 2 and col[1][1] - y >= 2:
                                row_span += 1

                    # Count the other x coordinates strictly inside the cell
                    # (at least 2 units from both edges); +1 gives the column span.
                    col_span = 1
                    for x in all_col_list:
                        if col[0][0] < x < col[1][0]:
                            if x - col[0][0] >= 2 and col[1][0] - x >= 2:
                                col_span += 1

                    text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"

                    if str(col) in text_in_table.keys():
                        text += text_in_table.get(str(col))
                    else:
                        text += ""
                    text += "</td>" + "\n"
                text += "</tr>" + "\n"
            text += "</table>" + "\n"

            # Largest number of cells found in any single row
            max_col_num = 0
            for row in row_list:
                col_num = 0
                for col in row:
                    col_num += 1
                if max_col_num < col_num:
                    max_col_num = col_num

            area_text_list.append(text)
            area_column_list.append(max_col_num)

        text = ""
        if get_platform() == "Windows":
            print("get_formatted_table area_text_list", area_text_list)
        for area_text in area_text_list:
            text += area_text
        return text, area_column_list
    except Exception as e:
        logging.info("get_formatted_table error!")
        # NOTE(review): traceback.print_exc() writes the traceback itself and
        # returns None, so this print only emits "get_formatted_table None".
        print("get_formatted_table", traceback.print_exc())
        return [-1], [-1]


def rename_inner_files(root_path):
    """Rename every file and directory below ``root_path`` to a hash-based name.

    The last component of each path is replaced by ``str(hash(name))``; files
    keep the text after the last dot of their own name as extension.
    Deeper paths are renamed before their parent directories so pending
    paths stay valid.

    NOTE: ``hash()`` of a string is randomised per interpreter process unless
    ``PYTHONHASHSEED`` is set, so the generated names differ between runs.

    Args:
        root_path: Directory whose contents are renamed. It is used as a
            plain string prefix and may or may not end with a separator.

    Returns:
        The paths found under ``root_path`` after renaming (directories carry
        a trailing ``os.sep``), or ``[-1]`` if any step fails.
    """
    try:
        logging.info("into rename_inner_files")
        # Where an alternative separator exists (Windows), normalise it so the
        # prefix and the returned paths use os.sep throughout.
        if os.altsep:
            root_path = root_path.replace(os.altsep, os.sep)

        # Collect all files and directories below the root, without the root
        # prefix. os.walk() builds every path by joining onto root_path, so
        # the prefix can be sliced off. Treating the path as a regular
        # expression (as before) breaks on special characters and would also
        # remove later occurrences of the same text.
        prefix_len = len(root_path)
        path_list = []
        for root, dirs, files in os.walk(root_path, topdown=False):
            for name in dirs:
                path_list.append((os.path.join(root, name) + os.sep)[prefix_len:])
            for name in files:
                path_list.append(os.path.join(root, name)[prefix_len:])

        # Longest paths first: children are always renamed before parents.
        path_list.sort(key=len, reverse=True)

        for old_path in path_list:
            parts = old_path.split(os.sep)
            file_type = ""
            if os.path.isdir(root_path + old_path):
                # Directory paths end with os.sep; drop the empty last part.
                parts = parts[:-1]
            elif "." in parts[-1]:
                # Take the extension from the file name itself. Looking at
                # the whole path picked up dots in parent directory names.
                file_type = "." + parts[-1].split(".")[-1]

            # Only the last level is renamed with its hash.
            new_path = os.sep.join(parts[:-1] + [str(hash(parts[-1])) + file_type])
            os.rename(root_path + old_path, root_path + new_path)

        # Walk the tree again to report the new names.
        new_path_list = []
        for root, dirs, files in os.walk(root_path, topdown=False):
            for name in dirs:
                new_path_list.append(os.path.join(root, name) + os.sep)
            for name in files:
                new_path_list.append(os.path.join(root, name))
        return new_path_list
    except Exception:
        traceback.print_exc()
        return [-1]


def judge_format(path):
    """Guess the document format of ``path``.

    The MIME type is taken from ``mimetypes.guess_type``; when that gives
    nothing, ``filetype.guess`` is consulted instead.

    Returns:
        One of ``"pdf"``, ``"docx"``, ``"zip"``, ``"rar"``, ``"xlsx"``,
        ``"doc"``, ``"png"``, ``"jpg"``, or ``None`` when the type is unknown
        or not one of these.
    """
    mime = mimetypes.guess_type(path)[0]
    if not mime:
        sniffed = filetype.guess(path)
        mime = sniffed.mime if sniffed else None

    mime_to_format = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/x-zip-compressed": "zip",
        "application/zip": "zip",
        "application/x-rar-compressed": "rar",
        "application/rar": "rar",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
        "application/msword": "doc",
        "image/png": "png",
        "image/jpeg": "jpg",
    }
    # Unrecognised (or missing) MIME types fall through to None.
    return mime_to_format.get(mime)


def slash_replace(_str, reverse=False):
    """Convert the path separators in ``_str``.

    Args:
        _str: String to convert.
        reverse: When False (default) every backslash becomes a forward
            slash; when True every forward slash becomes a backslash.

    Returns:
        The converted string.
    """
    # str.replace yields what the former eval(repr(...)) round-trip produced
    # for strings, without evaluating text as Python code.
    if reverse:
        return _str.replace("/", "\\")
    return _str.replace("\\", "/")


if __name__ == "__main__":
    # Manual smoke test: print a Windows path with its backslashes converted.
    sample_path = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
    print(slash_replace(sample_path))