text2Image.py 14 KB

import random
import re
import os

import numpy as np
import cv2
# import psycopg2
import pandas as pd
from PIL import Image, ImageFont, ImageDraw
from bs4 import BeautifulSoup

# project_path = "D:\\Project\\PaddleOCR-release-2.0\\"
project_path = "../../"
image_output_path = project_path + "train_data/bidi_data/mix_data3/"
train_data_path = image_output_path + "rec_gt_train.txt"
test_data_path = image_output_path + "rec_gt_test.txt"

def create_image(data_dir, file_name, text):
    # Count Latin letters/digits, CJK characters and punctuation separately,
    # since each class is drawn at a different width.
    list1 = re.findall('[a-zA-Z\d]', text)
    list2 = re.findall('[\u4e00-\u9fa5。,!?¥《》【】’“:;·、()]', text)
    list3 = re.findall('[,.!?&@*+=~%()#<>|/:{}$;-]', text)
    english_len = len(list1)
    chinese_len = len(list2)
    character_len = len(list3)
    if english_len + chinese_len + character_len == 0:
        character_len = len(text)
    # Size the image from the font size:
    # font 10 : a/1 -> 6 px, CJK -> 10 px, image -> len*, 16
    # font 20 : a/1 -> 12 px, CJK -> 20 px, image -> len*, 32
    font_list = [10, 15, 20, 25, 30, 35, 40]
    # Randomly pick a font size
    font_index = random.randint(0, len(font_list) - 1)
    font_size = font_list[font_index]
    # Estimate each character class's width from the font size
    chinese_charc_len = font_size * 1
    english_charc_len = int(font_size * 0.7)
    number_charc_len = int(font_size * 0.3)
    image_height = int(font_size * 1.6)
    text_len = english_len * english_charc_len + chinese_len * chinese_charc_len \
        + character_len * number_charc_len
    im = Image.new("RGB", (text_len, image_height), (255, 255, 255))
    dr = ImageDraw.Draw(im)
    font = ImageFont.truetype("tools/fonts/msyh.ttc", font_size)
    dr.text((0, 0), text, font=font, fill="#000000")
    # Image augmentation
    # PIL -> CV2
    img = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR)
    # Random scaling
    resize_y = random.randint(1, 2)
    resize_x = random.randint(1, 2)
    img = cv2.resize(img, (img.shape[1] * resize_y, img.shape[0] * resize_x))
    # Gaussian blur (sigmaY must be passed by keyword, otherwise it lands in dst)
    sigmaX = random.randint(1, 3)
    sigmaY = random.randint(1, 3)
    img = cv2.GaussianBlur(img, (5, 5), sigmaX, sigmaY=sigmaY)
    # CV2 -> PIL
    im = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    # resize_y = random.uniform(1, 3)
    # resize_x = random.uniform(1, 3)
    # img = im.resize((int(im.size[0]*resize_y), int(im.size[1]*resize_x)), Image.ANTIALIAS)
    # Save
    # cv2.imwrite(data_dir + file_name, img)
    # im.show("img")
    im.save(data_dir + file_name)
    # print(file_name)
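
# Usage sketch for create_image (a hedged example: it assumes the script runs
# from the project root so that "tools/fonts/msyh.ttc" and the output
# directory exist; the file name and label text are illustrative only):
# create_image(image_output_path + "train/", "text_0.jpg", "abc 中文 1,234.56")
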
def create_orgs_image(df):
    df = df[:1000]
    label_file_train = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_train.txt"
    label_file_test = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_test.txt"
    image_output_path = project_path + "train_data\\bidi_data\\orgs_data\\"
    f1 = open(label_file_train, "w")
    f2 = open(label_file_test, "w")
    print(df.shape)
    for index, row in df.iterrows():
        text = row["name"]
        # text = "晋江滨江国家体育训练基地有限公司"
        im = Image.new("RGB", (len(text) * 10, 16), (255, 255, 255))
        dr = ImageDraw.Draw(im)
        font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
        dr.text((0, 0), text, font=font, fill="#000000")
        # im.show()
        # First 80% of the rows go to the training set, the rest to the test set
        if index / df.shape[0] <= 0.8:
            mode = "train"
            f = f1
        else:
            mode = "test"
            f = f2
        im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
        f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
    f1.close()
    f2.close()

def create_longSentence_image(df):
    # df = df[:3000]
    label_file_train = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_train.txt"
    label_file_test = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_test.txt"
    image_output_path = project_path + "train_data\\bidi_data\\longSentence_data\\"
    f1 = open(label_file_train, "w")
    f2 = open(label_file_test, "w")
    print(df.shape)
    for index, row in df.iterrows():
        text = row["text"]
        # text = "晋江滨江国家体育训练基地有限公司"
        im = Image.new("RGB", (len(text) * 10, 16), (255, 255, 255))
        dr = ImageDraw.Draw(im)
        font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
        dr.text((0, 0), text, font=font, fill="#000000")
        # im.show()
        # First 80% of the rows go to the training set, the rest to the test set
        if index <= int((df.shape[0] - 1) * 0.8):
            mode = "train"
            f = f1
        else:
            mode = "test"
            f = f2
        im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
        f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
    f1.close()
    f2.close()

# def readPostgreSQL():
#     conn_string = "host=192.168.2.101 port=5432 dbname=iepy " \
#                   "user=iepy_read password=iepy_read"
#     conn = psycopg2.connect(conn_string)
#
#     # Run the SQL query
#     sql = "select text from corpus_iedocument " \
#           "where jump_signal=0"
#     df = pd.read_sql(sql, conn)
#     return df

# Build a mixed dataset covering several scenarios
def create_mix_txt():
    # Maximum string length (used by the commented-out create_text_list)
    max_length = 100
    # list1 = create_text_list(max_length)
    list1 = create_number_list(3000000)
    print("finish get list1", len(list1))
    # list2 = create_org_list()
    list2 = get_long_sentence_from_file(2000000)
    print("finish get list2", len(list2))
    # list2 = list2[0:100]
    with open("appendix_text.txt", "r") as f:
        list3 = f.readlines()
    # list3 = list3[:6]
    print("finish get list3", len(list3))
    list4 = create_org_list()
    # list4 = list4[:6]
    print("finish get list4", len(list4))
    # 95/5 train/test split within each source list
    train_data = list1[0:int(len(list1) * 0.95)] + list2[0:int(len(list2) * 0.95)] + \
        list3[0:int(len(list3) * 0.95)] + list4[0:int(len(list4) * 0.95)]
    test_data = list1[int(len(list1) * 0.95):] + list2[int(len(list2) * 0.95):] + \
        list3[int(len(list3) * 0.95):] + list4[int(len(list4) * 0.95):]
    print("len(train_data)", len(train_data))
    print("len(test_data)", len(test_data))
    data_index = 0
    with open(train_data_path, "w") as f:
        for data in train_data:
            prefix = "train/text_" + str(data_index) + ".jpg" + "\t"
            data = prefix + data
            f.write(data)
            data_index += 1
    print("finish write train data")
    with open(test_data_path, "w") as f:
        for data in test_data:
            prefix = "test/text_" + str(data_index) + ".jpg" + "\t"
            data = prefix + data
            f.write(data)
            data_index += 1
    print("finish write test data")
    return
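
# Note: data_index is not reset between the two loops above, so test-set file
# names continue numbering from where the training-set names stop; the first
# test label is "test/text_<len(train_data)>.jpg".
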
# def create_text_list(max_length):
#     # Sentences from bidding articles
#     df1 = readPostgreSQL()
#     list1 = []
#     for index, row in df1.iterrows():
#         text = row["text"].split(",")
#         # print(len(text))
#
#         # Take at most max_sentence sentences from each article
#         max_sentence = 15
#         sentence_count = 0
#         while sentence_count < max_sentence:
#             if len(text) <= max_sentence:
#                 if sentence_count < len(text):
#                     sentence = text[sentence_count]
#                 else:
#                     break
#             else:
#                 r1 = random.randint(0, len(text) - 1)
#                 sentence = text[r1]
#             if len(sentence) > max_length:
#                 # Cap the length; randomly keep the head or the tail
#                 r2 = random.randint(0, 1)
#                 if r2:
#                     sentence = sentence[:max_length]
#                 else:
#                     sentence = sentence[-max_length:]
#
#             # sentence = re.sub("\n", "", sentence)
#             if sentence != "":
#                 list1.append(sentence + "\n")
#             sentence_count += 1
#     print("len(list1)", len(list1))
#     return list1

def delete_image(data_dir, file_name):
    if os.path.exists(data_dir + file_name):
        os.remove(data_dir + file_name)

def create_org_list():
    # ~10 million company names
    with open("C:\\Users\\Administrator\\Desktop\\LEGAL_ENTERPRISE.txt", "r") as f:
        list2 = f.readlines()
    # list2 = list2[:100]
    # print("len(list2)", len(list2))
    return list2

def create_number_list(number):
    no_list = []
    for i in range(number):
        # Randomly choose how many decimal places to generate
        decimal_place = random.choice([0, 1, 2, 3, 4, 5, 6])
        if decimal_place == 0:
            no = random.randint(0, 10000000)
        else:
            no = random.uniform(0, 10000)
            no = round(no, decimal_place)
        no_list.append(str(no) + "\n")
    # print(no_list)
    return no_list
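
# Sample outputs (illustrative values; each entry keeps its trailing "\n" so
# it can be written straight into a label file):
#   "4821063\n"  from the integer branch (decimal_place == 0)
#   "7309.25\n"  from the rounded-float branch (decimal_place == 2)
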
def get_mix_data_from_file(number):
    with open("../../train_data/bidi_data/mix_data/rec_gt_train.txt") as f:
        _list = f.readlines()
    _list = _list[:number]
    new_list = []
    for line in _list:
        s = line.split("\t")[1]
        new_list.append(s)
    # print(new_list)
    return new_list

def get_long_sentence_from_file(number):
    with open("../../train_data/bidi_data/longSentence_data/rec_gt_train.txt") as f:
        list1 = f.readlines()
    with open("../../train_data/bidi_data/longSentence_data/rec_gt_test.txt") as f:
        list2 = f.readlines()
    _list = list1 + list2
    _list = _list[:number]
    new_list = []
    for line in _list:
        s = line.split("\t")[1]
        new_list.append(s)
    # print(new_list)
    return new_list
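
# Note: each rec_gt_*.txt line has the form "<image_path>\t<text>\n", so
# split("\t")[1] returns the text with its newline still attached; that is
# why the strings collected here can be written back out by create_mix_txt
# without appending another "\n".
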
def get_data_from_appendix():
    df = pd.read_excel("dochtmlcon.xlsx")
    text_list = []
    for index, row in df.iterrows():
        html_text = row["dochtmlcon"]
        # Build a BeautifulSoup parse tree
        soup = BeautifulSoup(html_text, "html.parser")
        # Collect the appendix rich-text blocks
        appendix_text = soup.find_all('div', class_='richTextFetch')
        # print(str(appendix_text[0])[49:-6])
        appendix_text = str(appendix_text[0])[49:-6]
        ss = appendix_text.split("\n")
        for s in ss:
            text = re.sub(" ", "", s)
            text = re.sub("\t", "", text)
            # Skip lines that are empty after cleaning
            if text == "":
                continue
            text_list.append(text + "\n")
    with open("appendix_text.txt", "w") as f:
        f.writelines(text_list)
    return

def get_data_from_paddle():
    path = "D:\\DataSet\\"
    with open(path + "char.txt", "r") as f:
        dictionary = f.readlines()
    with open(path + "data_train.txt") as f:
        train_list = f.readlines()
    with open(path + "data_test.txt") as f:
        test_list = f.readlines()
    data_list = train_list + test_list
    # data_list = data_list[-100:]
    text_list = []
    for data in data_list:
        ss = data[:-1].split(" ")
        image_path = "image/" + ss[0]
        text = ""
        for num in ss[1:]:
            # Look the index up in the dictionary, dropping its trailing newline
            char = dictionary[int(num)][:-1]
            text += char
        if text == "":
            print("no text!")
            continue
        text_list.append(image_path + "\t" + text + "\n")
    with open("paddle_data.txt", "w") as f:
        f.writelines(text_list)
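
# Decoding sketch: a hypothetical data_train.txt line "word_1.jpg 12 7 3" is
# mapped index-by-index through char.txt (one character per line) and emitted
# as "image/word_1.jpg\t<decoded text>\n"; lines that decode to an empty
# string are skipped.
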
def create_number_list2(number):
    no_list = []
    for i in range(number):
        # 2/3 chance of a number in [0, 10], 1/3 chance of one in [10, 100]
        c1 = random.choice([0, 1, 1])
        if c1:
            no = random.randint(0, 10)
        else:
            no = random.randint(10, 100)
        no_list.append(str(no) + "\n")
    # print(no_list)
    return no_list

def create_number_list3(number):
    no_list = []
    for i in range(number):
        # Choose integer or decimal
        c1 = random.choice([0, 1, 1])
        if c1:
            no = random.randint(10000, 1000000000)
            no = str(no)
            # Insert a thousands separator every 3 digits,
            # using either a full-width or a half-width comma
            c2 = random.choice([',', ',', ','])
            for j in range(len(no) - 3, 0, -3):
                no = no[:j] + c2 + no[j:]
        else:
            no = random.uniform(10000, 1000000000)
            no = str(no)
            nos = no.split(".")
            no_1 = nos[0]
            no_2 = nos[1]
            # Insert a thousands separator every 3 digits,
            # using either a full-width or a half-width comma
            c2 = random.choice([',', ',', ','])
            for j in range(len(no_1) - 3, 0, -3):
                no_1 = no_1[:j] + c2 + no_1[j:]
            no = no_1 + "." + no_2
        # Optionally prepend a ¥ sign
        c3 = random.choice(['', '¥', "¥ "])
        no_list.append(c3 + str(no) + "\n")
    # print(no_list)
    return no_list
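
# Worked example of the separator loop above: with no = "123456789", j takes
# the values 6 and 3, giving "123456,789" and then "123,456,789".
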
if __name__ == '__main__':
    # df = pd.read_csv('C:\\Users\\Administrator\\Desktop\\orgs.csv')
    # create_longSentence_image(df)
    # s = 100
    # image = cv2.imread("text_384178.jpg")
    # print(image.shape)
    # list1 = create_text_list(100)
    # for l in list1:
    #     print(l)
    # print(len(list1))
    # create_mix_txt()
    # get_mix_data_from_file(2)
    # create_number_list(10)
    # with open("../../train_data/bidi_data/mix_data2/rec_gt_test.txt", "r") as f:
    #     _list = f.readlines()
    # for line in _list:
    #     _str = line.split("\t")[-1][:-1]
    #     print(_str, type(_str))
    #     create_image("../../train_data/bidi_data/mix_data2/", "", _str)
    # get_data_from_appendix()
    # get_data_from_paddle()
    # delete_image("../../train_data/bidi_data/mix_data/", "train/text_0.jpg")
    # with open("paddle_data.txt", "r") as f:
    #     list1 = f.readlines()
    # print(len(list1))
    #
    # with open(train_data_path, "r") as f:
    #     list2 = f.readlines()
    # train_data_list = list2 + list1[0:int(len(list1)*0.95)]
    # with open(train_data_path, "w") as f:
    #     f.writelines(train_data_list)
    #
    # with open(test_data_path, "r") as f:
    #     list3 = f.readlines()
    # test_data_list = list3 + list1[int(len(list1)*0.95):]
    # with open(test_data_path, "w") as f:
    #     f.writelines(test_data_list)

    # Generate 2,000,000 formatted-number labels and splice them in place of
    # the last 2,000,000 lines of the training label file
    no_list = create_number_list3(2000000)
    i = 23000000
    train_list = []
    for no in no_list:
        train_list.append("train/text_" + str(i) + ".jpg" + "\t" + no)
        i += 1
    # print(train_list)
    with open(train_data_path, "r") as f:
        list3 = f.readlines()
    _list = list3[:-2000000] + train_list
    with open(train_data_path, "w") as f:
        f.writelines(_list)