123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516 |
- import random
- import re
- import numpy as np
- import cv2
- # import psycopg2
- from PIL import Image, ImageFont, ImageDraw
- import os
- from PIL import Image, ImageFont, ImageDraw
- import pandas as pd
- # project_path = "D:\\Project\\PaddleOCR-release-2.0\\"
- from bs4 import BeautifulSoup
# Project root, relative to this script's working directory.
project_path = "../../"
# Output directory for the mixed recognition dataset and its label files.
image_output_path = project_path + "train_data/bidi_data/mix_data4/"
train_data_path = image_output_path + "rec_gt_train.txt"
test_data_path = image_output_path + "rec_gt_test.txt"
def create_image(data_dir, file_name, text):
    """Render `text` on a white canvas with a random font size and face,
    optionally pad it, augment it, and save it to data_dir + file_name.

    NOTE(review): relies on font files under "tools/fonts/" relative to the
    working directory — confirm before running elsewhere.
    """
    # Candidate font sizes in pixels.
    size_choices = [7, 8, 9, 10, 11, 12, 15, 20, 25, 30, 35, 40]
    font_size = size_choices[random.randint(0, len(size_choices) - 1)]
    # One of the Microsoft YaHei variants (regular / bold / light).
    face = random.choice(['msyh.ttc', 'msyhbd.ttc', 'msyhl.ttc'])
    font = ImageFont.truetype("tools/fonts/" + face, font_size)
    # Actual pixel extent of the rendered string.
    text_w = font.getsize(text)[0]
    text_h = font.getsize(text)[1]
    # Randomly add margins on every side, but only for legible sizes.
    pad_h = 0
    pad_w = 0
    if text_h > 9 and random.choice([0, 1]):
        pad_h = random.randint(3, 6)
        pad_w = random.randint(3, 6)
    canvas = Image.new("RGB", (text_w + pad_w * 2, text_h + pad_h * 2), (255, 255, 255))
    drawer = ImageDraw.Draw(canvas)
    drawer.text((pad_w, pad_h), text, font=font, fill="#000000")
    # Image augmentation (random resize / dilation / blur).
    canvas = my_image_aug(canvas)
    canvas.save(data_dir + file_name)
def my_image_aug(image_pil):
    """Randomly augment a PIL image (resize, dilation or Gaussian blur).

    Returns a PIL image; low-resolution inputs (height <= 20 px) are
    returned unmodified apart from the RGB round-trip through OpenCV.
    """
    # PIL -> OpenCV (BGR)
    img = cv2.cvtColor(np.asarray(image_pil), cv2.COLOR_RGB2BGR)
    if img.shape[0] > 20:
        # Random integer up-scaling, half of the time.
        if random.choice([0, 1]):
            scale_y = random.randint(1, 3)
            scale_x = random.randint(1, 3)
            img = cv2.resize(img, (img.shape[1] * scale_y, img.shape[0] * scale_x))
        # 0 -> no-op, 1 -> blur, 2 -> dilate (weights 1:2:3).
        mode = random.choice([0, 1, 1, 2, 2, 2])
        if mode == 2:
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
            iterations = 2 if img.shape[0] > 40 else 1
            dilated = cv2.dilate(gray, kernel, iterations=iterations)
            img = cv2.cvtColor(dilated, cv2.COLOR_GRAY2BGR)
        elif mode == 1:
            # Gaussian blur with random sigmas.
            sigma_x = random.randint(1, 2)
            sigma_y = random.randint(1, 2)
            img = cv2.GaussianBlur(img, (5, 5), sigma_x, sigma_y)
    # OpenCV (BGR) -> PIL
    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
def create_orgs_image(df):
    """Render the first 1000 names in df["name"] as JPEGs and write label files.

    Rows in the first 80% (by index / row count) go to the train split,
    the rest to test. Label lines are "<split>/text_<i>.jpg\t<text>".
    """
    df = df[:1000]
    label_file_train = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_train.txt"
    label_file_test = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_test.txt"
    image_output_path = project_path + "train_data\\bidi_data\\orgs_data\\"
    f1 = open(label_file_train, "w")
    f2 = open(label_file_test, "w")
    print(df.shape)
    for index, row in df.iterrows():
        text = row["name"]
        # 10 px per character, fixed 16 px height, white background.
        im = Image.new("RGB", (len(text) * 10, 16), (255, 255, 255))
        drawer = ImageDraw.Draw(im)
        font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
        drawer.text((0, 0), text, font=font, fill="#000000")
        if index / df.shape[0] <= 0.8:
            mode, f = "train", f1
        else:
            mode, f = "test", f2
        im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
        f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
    f1.close()
    f2.close()
def create_longSentence_image(df):
    """Render each sentence in df["text"] as a JPEG and write label files.

    The first 80% of rows (by positional index) go to the train split,
    the remainder to test.
    """
    label_file_train = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_train.txt"
    label_file_test = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_test.txt"
    image_output_path = project_path + "train_data\\bidi_data\\longSentence_data\\"
    f1 = open(label_file_train, "w")
    f2 = open(label_file_test, "w")
    print(df.shape)
    # Loop-invariant: last row index that still belongs to the train split.
    last_train_index = int((df.shape[0] - 1) * 0.8)
    for index, row in df.iterrows():
        text = row["text"]
        # 10 px per character, fixed 16 px height, white background.
        im = Image.new("RGB", (len(text) * 10, 16), (255, 255, 255))
        drawer = ImageDraw.Draw(im)
        font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
        drawer.text((0, 0), text, font=font, fill="#000000")
        if index <= last_train_index:
            mode, f = "train", f1
        else:
            mode, f = "test", f2
        im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
        f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
    f1.close()
    f2.close()
- # def readPostgreSQL():
- # conn_string = "host=192.168.2.101 port=5432 dbname=iepy " \
- # "user=iepy_read password=iepy_read"
- # conn = psycopg2.connect(conn_string)
- #
- # # 执行SQL语句
- # sql = "select text from corpus_iedocument " \
- # "where jump_signal=0"
- # df = pd.read_sql(sql, conn)
- # return df
# Build mixed-scenario training data from several text sources.
def create_mix_txt():
    """Write mixed train/test label files from four text sources.

    Sources: generated prices, long sentences, appendix text, and company
    names. Each source contributes its first 95% to train and the last 5%
    to test. Image indices run continuously across both splits.
    """
    # Longest kept string (only used by the disabled create_text_list path).
    max_length = 100
    list1 = create_price(3000000)
    print("finish get list1", len(list1))
    list2 = get_long_sentence_from_file(1000000)
    print("finish get list2", len(list2))
    with open("appendix_text.txt", "r") as f:
        list3 = f.readlines()
    list3 = list3[:2000000]
    print("finish get list3", len(list3))
    list4 = create_org_list()
    list4 = list4[:3000000]
    print("finish get list4", len(list4))
    # 95/5 split per source, concatenated in source order.
    train_data = []
    test_data = []
    for source in (list1, list2, list3, list4):
        cut = int(len(source) * 0.95)
        train_data += source[:cut]
        test_data += source[cut:]
    print("len(train_data)", len(train_data))
    print("len(test_data)", len(test_data))
    data_index = 0
    with open(train_data_path, "w") as f:
        for line in train_data:
            f.write("train/text_" + str(data_index) + ".jpg" + "\t" + line)
            data_index += 1
    print("finish write train data")
    # NOTE: data_index intentionally keeps counting into the test split.
    with open(test_data_path, "w") as f:
        for line in test_data:
            f.write("test/text_" + str(data_index) + ".jpg" + "\t" + line)
            data_index += 1
    print("finish write test data")
    return
- # def create_text_list(max_length):
- # # 招投标文章语句
- # df1 = readPostgreSQL()
- # list1 = []
- # for index, row in df1.iterrows():
- # text = row["text"].split(",")
- # # print(len(text))
- #
- # # 每篇文章最多取10个句子
- # max_sentence = 15
- # sentence_count = 0
- # while sentence_count < max_sentence:
- # if len(text) <= max_sentence:
- # if sentence_count < len(text):
- # sentence = text[sentence_count]
- # else:
- # break
- # else:
- # r1 = random.randint(0, len(text) - 1)
- # sentence = text[r1]
- # if len(sentence) > max_length:
- # # 限制字数,随机截取前或后
- # r2 = random.randint(0, 1)
- # if r2:
- # sentence = sentence[:max_length]
- # else:
- # sentence = sentence[-max_length:]
- #
- # # sentence = re.sub("\n", "", sentence)
- # if sentence != "":
- # list1.append(sentence+"\n")
- # sentence_count += 1
- # print("len(list1)", len(list1))
- # return list1
def delete_image(data_dir, file_name):
    """Delete data_dir + file_name if it exists; do nothing otherwise."""
    target = data_dir + file_name
    if os.path.exists(target):
        os.remove(target)
def create_org_list():
    """Load ~10M company names (one per line, trailing newline kept) from a
    local dump on the Administrator desktop."""
    with open("C:\\Users\\Administrator\\Desktop\\LEGAL_ENTERPRISE.txt", "r") as f:
        return f.readlines()
def create_number_list(number):
    """Generate `number` random numeric strings, each newline-terminated.

    A random decimal-place count in [0, 6] is drawn per item: 0 yields an
    integer in [0, 10000000], otherwise a float in [0, 10000] rounded to
    that many places.
    """
    results = []
    for _ in range(number):
        decimal_places = random.choices([0, 1, 2, 3, 4, 5, 6])[0]
        if decimal_places == 0:
            value = random.randint(0, 10000000)
        else:
            value = round(random.uniform(0, 10000), decimal_places)
        results.append(str(value) + "\n")
    return results
def get_mix_data_from_file(number):
    """Return the text column (second tab field) of the first `number`
    lines of the mix_data train label file."""
    with open("../../train_data/bidi_data/mix_data/rec_gt_train.txt") as f:
        lines = f.readlines()
    return [line.split("\t")[1] for line in lines[:number]]
def get_long_sentence_from_file(number):
    """Return the text column (second tab field) of the first `number`
    lines of the combined longSentence train + test label files."""
    with open("../../train_data/bidi_data/longSentence_data/rec_gt_train.txt") as f:
        train_lines = f.readlines()
    with open("../../train_data/bidi_data/longSentence_data/rec_gt_test.txt") as f:
        test_lines = f.readlines()
    combined = (train_lines + test_lines)[:number]
    return [line.split("\t")[1] for line in combined]
def get_data_from_appendix():
    """Extract appendix text from the HTML column of dochtmlcon.xlsx and
    write one cleaned line per entry to appendix_text.txt.

    For each row, the first <div class="richTextFetch"> is located, its
    wrapping markup stripped, and each inner line has spaces and tabs
    removed before being written out.
    """
    df = pd.read_excel("dochtmlcon.xlsx")
    text_list = []
    for index, row in df.iterrows():
        html_text = row["dochtmlcon"]
        # Parse the stored HTML document.
        soup = BeautifulSoup(html_text, "html.parser", from_encoding="utf-8")
        appendix_text = soup.find_all('div', class_='richTextFetch')
        # Strip the wrapping <div ...> ... </div> markup by fixed offsets
        # (assumes the opening tag is exactly 49 characters — TODO confirm).
        appendix_text = str(appendix_text[0])[49:-6]
        for s in appendix_text.split("\n"):
            text = re.sub(" ", "", s)
            text = re.sub("\t", "", text)
            # BUG FIX: the original tested the raw line `s`, so lines made
            # of only spaces/tabs were still appended as empty "\n" entries.
            if text == "":
                continue
            text_list.append(text + "\n")
    with open("appendix_text.txt", "w") as f:
        f.writelines(text_list)
    return
def get_data_from_paddle():
    """Convert PaddleOCR-style index files into "image_path\ttext" lines.

    Each input line is "<image> <char_id> <char_id> ..."; ids are resolved
    through char.txt. Entries that decode to an empty string are skipped.
    Output is written to paddle_data.txt.
    """
    path = "D:\\DataSet\\"
    with open(path + "char.txt", "r") as f:
        dictionary = f.readlines()
    with open(path + "data_train.txt") as f:
        train_list = f.readlines()
    with open(path + "data_test.txt") as f:
        test_list = f.readlines()
    text_list = []
    for data in train_list + test_list:
        parts = data[:-1].split(" ")
        image_path = "image/" + parts[0]
        # Map each character id to its dictionary entry (newline stripped).
        text = "".join(dictionary[int(num)][:-1] for num in parts[1:])
        if text == "":
            print("no text!")
            continue
        text_list.append(image_path + "\t" + text + "\n")
    with open("paddle_data.txt", "w") as f:
        f.writelines(text_list)
def create_number_list2(number):
    """Generate `number` small random integers as newline-terminated strings.

    About 2/3 of values are drawn from [0, 10], the rest from [10, 100].
    """
    results = []
    for _ in range(number):
        if random.choice([0, 1, 1]):
            value = random.randint(0, 10)
        else:
            value = random.randint(10, 100)
        results.append(str(value) + "\n")
    return results
def create_price(number):
    """Generate `number` random price strings, each newline-terminated.

    ~2/3 are integers in [10, 1e9], the rest floats in the same range.
    Thousands separators use a randomly chosen ASCII or full-width comma,
    and a full-width yen sign is prepended half of the time.
    """
    results = []
    for _ in range(number):
        # Integer (2/3 chance) vs decimal amount.
        if random.choice([0, 1, 1]):
            digits = str(random.randint(10, 1000000000))
            # Insert a thousands separator every 3 digits from the right;
            # separator is randomly ASCII or full-width.
            sep = random.choice([',', ',', ','])
            for pos in range(len(digits) - 3, 0, -3):
                digits = digits[:pos] + sep + digits[pos:]
            amount = digits
        else:
            raw = str(random.uniform(10, 1000000000))
            int_part, dec_part = raw.split(".")
            # Separators go into the integer part only.
            sep = random.choice([',', ',', ','])
            for pos in range(len(int_part) - 3, 0, -3):
                int_part = int_part[:pos] + sep + int_part[pos:]
            amount = int_part + "." + dec_part
        # Optionally prefix a currency symbol.
        prefix = random.choice(['', "¥"])
        results.append(prefix + amount + "\n")
    return results
if __name__ == '__main__':
    # Generate the mixed label files from the configured sources.
    create_mix_txt()

    # Append the PaddleOCR public dataset to the generated label files ####
    with open("paddle_data.txt", "r") as f:
        paddle_lines = f.readlines()
    print(len(paddle_lines))
    paddle_lines = paddle_lines[:1000000]
    # First 95% of the Paddle data joins train, the rest joins test.
    split = int(len(paddle_lines) * 0.95)

    with open(train_data_path, "r") as f:
        existing_train = f.readlines()
    with open(train_data_path, "w") as f:
        f.writelines(existing_train + paddle_lines[:split])

    with open(test_data_path, "r") as f:
        existing_test = f.readlines()
    with open(test_data_path, "w") as f:
        f.writelines(existing_test + paddle_lines[split:])
    #######################################################
|