# text2Image.py — generates synthetic text images and label files for OCR training.
  1. import random
  2. import re
  3. import numpy as np
  4. import cv2
  5. # import psycopg2
  6. from PIL import Image, ImageFont, ImageDraw
  7. import os
  8. from PIL import Image, ImageFont, ImageDraw
  9. Image.MAX_IMAGE_PIXELS = None
  10. import pandas as pd
  11. # project_path = "D:\\Project\\PaddleOCR-release-2.0\\"
  12. from bs4 import BeautifulSoup
  13. current_path = os.path.dirname(__file__)
  14. project_path = current_path + "/../../"
  15. image_output_path = project_path + "train_data/bidi_data/mix_data5/"
  16. train_data_path = image_output_path + "rec_gt_train.txt"
  17. test_data_path = image_output_path + "rec_gt_test.txt"
  18. def create_image(data_dir, file_name, text):
  19. # list1 = re.findall('[a-zA-Z\d]', text)
  20. # list2 = re.findall('[\u4e00-\u9fa5。,!?¥《》【】’“:;·、()]', text)
  21. # list3 = re.findall('[,.!?&@*+=~%()#<>-''|/:{}$;]', text)
  22. # english_len = len(list1)
  23. # chinese_len = len(list2)
  24. # character_len = len(list3)
  25. #
  26. # if english_len + chinese_len + character_len == 0:
  27. # character_len = len(text)
  28. # 根据各字体大小生成图片
  29. # font 10 : a1-6 字-10 image-len*, 16
  30. # font 20 : a1-12 字-20 image-len*, 32
  31. font_list = [7, 8, 9, 10, 11, 12, 15, 20, 25, 30, 35, 40]
  32. # 随机选字体大小
  33. font_index = random.randint(0, len(font_list)-1)
  34. font = font_list[font_index]
  35. # 根据字体大小计算各字符长度
  36. # chinese_charc_len = font * 1
  37. # english_charc_len = int(font * 0.7)
  38. # number_charc_len = int(font * 0.3)
  39. # image_width = int(font * 1.6)
  40. # text_len = english_len * english_charc_len + chinese_len * chinese_charc_len \
  41. # + character_len * number_charc_len
  42. # 获取字体及其实际像素大小
  43. font_list = []
  44. for root, dirs, files in os.walk(project_path + "tools/fonts"):
  45. for file in files:
  46. font_list.append(os.path.join(root, file))
  47. font_type = random.choice(font_list)
  48. font = ImageFont.truetype(font_type, font)
  49. font_width = font.getsize(text)[0]
  50. font_height = font.getsize(text)[1]
  51. # 增加上下左右边距
  52. margin_h = 0
  53. margin_w = 0
  54. if font_height > 9:
  55. if random.choice([0, 1]):
  56. margin_h = random.randint(3, 6)
  57. margin_w = random.randint(3, 6)
  58. im = Image.new("RGB", (font_width+margin_w*2, font_height+margin_h*2), (255, 255, 255))
  59. dr = ImageDraw.Draw(im)
  60. dr.text((0+margin_w, 0+margin_h), text, font=font, fill="#000000")
  61. # if random.choice([0, 1, 1]):
  62. # # PIL -> CV2
  63. # img = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR)
  64. # # 文字加下划线
  65. # img = cv2.line(img, (0+margin_w, font_height+margin_h),
  66. # (font_width+margin_w, font_height+margin_h),
  67. # (0, 0, 0), 1)
  68. # # CV2 -> PIL
  69. # im = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
  70. #
  71. # # 随机缩放
  72. # resize_y = random.randint(1, 2)
  73. # resize_x = random.randint(1, 2)
  74. # img = cv2.resize(img, (img.shape[1]*resize_y, img.shape[0]*resize_x))
  75. #
  76. # # 模糊
  77. # # 高斯模糊
  78. # sigmaX = random.randint(1, 3)
  79. # sigmaY = random.randint(1, 3)
  80. # img = cv2.GaussianBlur(img, (5, 5), sigmaX, sigmaY)
  81. #
  82. # # resize_y = random.uniform(1, 3)
  83. # # resize_x = random.uniform(1, 3)
  84. # # img = im.resize((int(im.size[0]*resize_y), int(im.size[1]*resize_x)), Image.ANTIALIAS)
  85. #
  86. # # 保存
  87. # # cv2.imwrite(data_dir + file_name, img)
  88. # 图像增强
  89. im = my_image_aug(im)
  90. # print(im.size)
  91. # im.show("img")
  92. im.save(data_dir + file_name)
  93. def my_image_aug(image_pil):
  94. # 图像增强
  95. # PIL -> CV2
  96. img = cv2.cvtColor(np.asarray(image_pil), cv2.COLOR_RGB2BGR)
  97. # 分辨率低的跳过
  98. if img.shape[0] > 20:
  99. # 随机缩放
  100. if random.choice([0, 1, 1]):
  101. resize_y = random.randint(1, 3)
  102. resize_x = random.randint(1, 3)
  103. img = cv2.resize(img, (img.shape[1]*resize_y, img.shape[0]*resize_x))
  104. if_dilate = random.choice([0, 1, 1, 2, 2])
  105. # if_dilate = 1
  106. if if_dilate == 2:
  107. gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  108. kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
  109. iters = 1
  110. # if img.shape[0] > 40:
  111. # iters = 2
  112. dilate = cv2.dilate(gray, kernel, iterations=iters)
  113. img = cv2.cvtColor(dilate, cv2.COLOR_GRAY2BGR)
  114. elif if_dilate == 1:
  115. # 高斯模糊
  116. sigmaX = random.randint(1, 2)
  117. sigmaY = random.randint(1, 2)
  118. img = cv2.GaussianBlur(img, (9, 9), sigmaX, sigmaY)
  119. # CV2 -> PIL
  120. im = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
  121. return im
  122. def create_orgs_image(df):
  123. df = df[:1000]
  124. label_file_train = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_train.txt"
  125. label_file_test = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_test.txt"
  126. image_output_path = project_path + "train_data\\bidi_data\\orgs_data\\"
  127. f1 = open(label_file_train, "w")
  128. f2 = open(label_file_test, "w")
  129. print(df.shape)
  130. for index, row in df.iterrows():
  131. text = row["name"]
  132. # text = "晋江滨江国家体育训练基地有限公司"
  133. im = Image.new("RGB", (len(text)*10, 16), (255, 255, 255))
  134. dr = ImageDraw.Draw(im)
  135. font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
  136. dr.text((0, 0), text, font=font, fill="#000000")
  137. # im.show()
  138. if index / df.shape[0] <= 0.8:
  139. mode = "train"
  140. f = f1
  141. else:
  142. mode = "test"
  143. f = f2
  144. im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
  145. f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
  146. f1.close()
  147. f2.close()
  148. def create_longSentence_image(df):
  149. # df = df[:3000]
  150. label_file_train = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_train.txt"
  151. label_file_test = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_test.txt"
  152. image_output_path = project_path + "train_data\\bidi_data\\longSentence_data\\"
  153. f1 = open(label_file_train, "w")
  154. f2 = open(label_file_test, "w")
  155. print(df.shape)
  156. for index, row in df.iterrows():
  157. text = row["text"]
  158. # text = "晋江滨江国家体育训练基地有限公司"
  159. im = Image.new("RGB", (len(text)*10, 16), (255, 255, 255))
  160. dr = ImageDraw.Draw(im)
  161. font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
  162. dr.text((0, 0), text, font=font, fill="#000000")
  163. # im.show()
  164. if index <= int((df.shape[0]-1)*0.8):
  165. mode = "train"
  166. f = f1
  167. else:
  168. mode = "test"
  169. f = f2
  170. im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
  171. f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
  172. f1.close()
  173. f2.close()
  174. # def readPostgreSQL():
  175. # conn_string = "host=192.168.2.101 port=5432 dbname=iepy " \
  176. # "user=iepy_read password=iepy_read"
  177. # conn = psycopg2.connect(conn_string)
  178. #
  179. # # 执行SQL语句
  180. # sql = "select text from corpus_iedocument " \
  181. # "where jump_signal=0"
  182. # df = pd.read_sql(sql, conn)
  183. # return df
  184. # 生成多个场景混合数据
  185. def create_mix_txt():
  186. # 最长字符串长度
  187. max_length = 100
  188. list1 = create_price(60000, True)
  189. list1 += create_price(40000)
  190. print("finish get list1", len(list1))
  191. list2 = get_long_sentence_from_file(100000)
  192. print("finish get list2", len(list2))
  193. with open("appendix_text.txt", "r") as f:
  194. list3 = f.readlines()
  195. list3 = list3[2000000:2300000]
  196. print("finish get list3", len(list3))
  197. list4 = create_org_list()
  198. list4 = list4[:100000]
  199. print("finish get list4", len(list4))
  200. list5 = create_no(800000)
  201. print("finish get list5", len(list5))
  202. list6 = get_data_from_paddle()
  203. list6 = list6[:200000]
  204. print("finish get list6", len(list6))
  205. train_data = list1[0:int(len(list1)*0.95)] + list2[0:int(len(list2)*0.95)] + \
  206. list3[0:int(len(list3)*0.95)] + list4[0:int(len(list4)*0.95)] + \
  207. list5[0:int(len(list5)*0.95)]
  208. test_data = list1[int(len(list1)*0.95):] + list2[int(len(list2)*0.95):] + \
  209. list3[int(len(list3)*0.95):] + list4[int(len(list4)*0.95):] + \
  210. list5[int(len(list5)*0.95):]
  211. print("len(train_data)", len(train_data))
  212. print("len(test_data)", len(test_data))
  213. data_index = 0
  214. for i in range(len(train_data)):
  215. prefix = "train/text_" + str(data_index) + ".png" + "\t"
  216. train_data[i] = prefix + train_data[i]
  217. data_index += 1
  218. for i in range(len(test_data)):
  219. prefix = "test/text_" + str(data_index) + ".png" + "\t"
  220. test_data[i] = prefix + test_data[i]
  221. data_index += 1
  222. train_data += list6[0:int(len(list6)*0.95)]
  223. test_data += list6[int(len(list6)*0.95):]
  224. with open(train_data_path, "w") as f:
  225. f.writelines(train_data)
  226. with open(test_data_path, "w") as f:
  227. f.writelines(test_data)
  228. print("finish write train data and test data!")
  229. # with open(train_data_path, "w") as f:
  230. # for data in train_data:
  231. # prefix = "train/text_" + str(data_index) + ".jpg" + "\t"
  232. # data = prefix + data
  233. # f.write(data)
  234. # data_index += 1
  235. # print("finish write train data")
  236. # with open(test_data_path, "w") as f:
  237. # for data in test_data:
  238. # prefix = "test/text_" + str(data_index) + ".jpg" + "\t"
  239. # data = prefix + data
  240. # f.write(data)
  241. # data_index += 1
  242. # print("finish write test data")
  243. return
  244. def create_no_price():
  245. # 最长字符串长度
  246. max_length = 100
  247. # list1 = create_text_list(max_length)
  248. list1 = create_price(50000, True)
  249. print("finish get list1", len(list1))
  250. # list2 = create_org_list()
  251. list2 = get_long_sentence_from_file(10000)
  252. print("finish get list2", len(list2))
  253. # list2 = list2[0:100]
  254. with open("appendix_text.txt", "r") as f:
  255. list3 = f.readlines()
  256. list3 = list3[:10000]
  257. print("finish get list3", len(list3))
  258. list4 = create_org_list()
  259. list4 = list4[:10000]
  260. print("finish get list4", len(list4))
  261. list5 = create_no(50000)
  262. print("finish get list5", len(list5))
  263. train_data = list1[0:int(len(list1)*0.95)] + list2[0:int(len(list2)*0.95)] + \
  264. list3[0:int(len(list3)*0.95)] + list4[0:int(len(list4)*0.95)] + \
  265. list5[0:int(len(list5)*0.95)]
  266. test_data = list1[int(len(list1)*0.95):] + list2[int(len(list2)*0.95):] + \
  267. list3[int(len(list3)*0.95):] + list4[int(len(list4)*0.95):] + \
  268. list5[int(len(list5)*0.95):]
  269. print("len(train_data)", len(train_data))
  270. print("len(test_data)", len(test_data))
  271. data_index = 0
  272. with open(train_data_path, "w") as f:
  273. for data in train_data:
  274. prefix = "train/text_" + str(data_index) + ".jpg" + "\t"
  275. data = prefix + data
  276. f.write(data)
  277. data_index += 1
  278. print("finish write train data")
  279. with open(test_data_path, "w") as f:
  280. for data in test_data:
  281. prefix = "test/text_" + str(data_index) + ".jpg" + "\t"
  282. data = prefix + data
  283. f.write(data)
  284. data_index += 1
  285. print("finish write test data")
  286. return
  287. # def create_text_list(max_length):
  288. # # 招投标文章语句
  289. # df1 = readPostgreSQL()
  290. # list1 = []
  291. # for index, row in df1.iterrows():
  292. # text = row["text"].split(",")
  293. # # print(len(text))
  294. #
  295. # # 每篇文章最多取10个句子
  296. # max_sentence = 15
  297. # sentence_count = 0
  298. # while sentence_count < max_sentence:
  299. # if len(text) <= max_sentence:
  300. # if sentence_count < len(text):
  301. # sentence = text[sentence_count]
  302. # else:
  303. # break
  304. # else:
  305. # r1 = random.randint(0, len(text) - 1)
  306. # sentence = text[r1]
  307. # if len(sentence) > max_length:
  308. # # 限制字数,随机截取前或后
  309. # r2 = random.randint(0, 1)
  310. # if r2:
  311. # sentence = sentence[:max_length]
  312. # else:
  313. # sentence = sentence[-max_length:]
  314. #
  315. # # sentence = re.sub("\n", "", sentence)
  316. # if sentence != "":
  317. # list1.append(sentence+"\n")
  318. # sentence_count += 1
  319. # print("len(list1)", len(list1))
  320. # return list1
  321. def delete_image(data_dir, file_name):
  322. if os.path.exists(data_dir + file_name):
  323. os.remove(data_dir + file_name)
  324. def create_org_list():
  325. # 1kw公司名
  326. with open("C:\\Users\\Administrator\\Desktop\\LEGAL_ENTERPRISE.txt", "r") as f:
  327. list2 = f.readlines()
  328. # list2 = list2[:100]
  329. # print("len(list2)", len(list2))
  330. return list2
  331. def create_number_list(number):
  332. no_list = []
  333. for i in range(number):
  334. # 随机选择生成几位小数
  335. decimal_place = random.choices([0, 1, 2, 3, 4, 5, 6])[0]
  336. if decimal_place == 0:
  337. no = random.randint(0, 10000000)
  338. else:
  339. no = random.uniform(0, 10000)
  340. no = round(no, decimal_place)
  341. no_list.append(str(no)+"\n")
  342. # print(no_list)
  343. return no_list
  344. def get_mix_data_from_file(number):
  345. with open("../../train_data/bidi_data/mix_data/rec_gt_train.txt") as f:
  346. _list = f.readlines()
  347. _list = _list[:number]
  348. new_list = []
  349. for line in _list:
  350. s = line.split("\t")[1]
  351. new_list.append(s)
  352. # print(new_list)
  353. return new_list
  354. def get_long_sentence_from_file(number):
  355. with open("../../train_data/bidi_data/longSentence_data/rec_gt_train.txt") as f:
  356. list1 = f.readlines()
  357. with open("../../train_data/bidi_data/longSentence_data/rec_gt_test.txt") as f:
  358. list2 = f.readlines()
  359. _list = list1 + list2
  360. _list = _list[:number]
  361. new_list = []
  362. for line in _list:
  363. s = line.split("\t")[1]
  364. new_list.append(s)
  365. # print(new_list)
  366. return new_list
  367. def get_data_from_appendix():
  368. df = pd.read_excel("dochtmlcon.xlsx")
  369. text_list = []
  370. for index, row in df.iterrows():
  371. html_text = row["dochtmlcon"]
  372. # 创建一个BeautifulSoup解析对象
  373. soup = BeautifulSoup(html_text, "html.parser", from_encoding="utf-8")
  374. # 获取所有的链接
  375. appendix_text = soup.find_all('div', class_='richTextFetch')
  376. # print(str(appendix_text[0])[49:-6])
  377. appendix_text = str(appendix_text[0])[49:-6]
  378. ss = appendix_text.split("\n")
  379. for s in ss:
  380. text = re.sub(" ", "", s)
  381. text = re.sub("\t", "", text)
  382. if s == "":
  383. continue
  384. text_list.append(text + "\n")
  385. with open("appendix_text.txt", "w") as f:
  386. f.writelines(text_list)
  387. return
  388. def get_data_from_paddle():
  389. path = "D:\\DataSet\\"
  390. with open(path + "char.txt", "r") as f:
  391. dictionary = f.readlines()
  392. with open(path + "data_train.txt") as f:
  393. train_list = f.readlines()
  394. with open(path + "data_test.txt") as f:
  395. test_list = f.readlines()
  396. data_list = train_list + test_list
  397. # data_list = data_list[-100:]
  398. text_list = []
  399. for data in data_list:
  400. ss = data[:-1].split(" ")
  401. image_path = "image/" + ss[0]
  402. text = ""
  403. for num in ss[1:]:
  404. char = dictionary[int(num)][:-1]
  405. text += char
  406. if text == "":
  407. print("no text!")
  408. continue
  409. text_list.append(image_path + "\t" + text + "\n")
  410. # with open("paddle_data.txt", "w") as f:
  411. # f.writelines(text_list)
  412. return text_list
  413. def create_number_list2(number):
  414. no_list = []
  415. for i in range(number):
  416. c1 = random.choice([0, 1, 1])
  417. if c1:
  418. no = random.randint(0, 10)
  419. else:
  420. no = random.randint(10, 100)
  421. no_list.append(str(no) + "\n")
  422. # print(no_list)
  423. return no_list
  424. def create_price(number, add_rmb=False):
  425. no_list = []
  426. if not add_rmb:
  427. num_range = (10, 1000000000)
  428. else:
  429. num_range = (10, 100000)
  430. for i in range(number):
  431. # 选择小数整数
  432. c1 = random.choice([0, 1, 1])
  433. if c1:
  434. no = random.randint(num_range[0], num_range[1])
  435. no = str(no)
  436. # 加3位分割逗号
  437. # 选择中英文逗号
  438. c2 = random.choice([',', ',', ','])
  439. for i in range(len(no)-3, 0, -3):
  440. no = no[:i] + c2 + no[i:]
  441. else:
  442. no = random.uniform(num_range[0], num_range[1])
  443. n_digits = random.choice([1, 2, 3, 4])
  444. no = round(no, n_digits)
  445. no = str(no)
  446. nos = no.split(".")
  447. no_1 = nos[0]
  448. no_2 = nos[1]
  449. # 加3位分割逗号
  450. # 选择中英文逗号
  451. c2 = random.choice([',', ',', ','])
  452. for i in range(len(no_1)-3, 0, -3):
  453. no_1 = no_1[:i] + c2 + no_1[i:]
  454. no = no_1 + "." + no_2
  455. # 选择是否加¥符号
  456. if add_rmb:
  457. c3 = random.choice(["¥", "¥"])
  458. else:
  459. c3 = random.choice(['', "¥", "¥"])
  460. no_list.append(c3 + str(no) + "\n")
  461. # print(no_list[:100])
  462. return no_list
  463. def create_no(number):
  464. no_list = []
  465. chn_no = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
  466. '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾']
  467. for j in range(number):
  468. if random.choice([0, 1, 1, 1, 1]):
  469. # 数字序号
  470. no = random.randint(0, 4)
  471. else:
  472. # 汉字序号
  473. no = random.choice(chn_no)
  474. no_list.append(str(no)+"\n")
  475. return no_list
  476. if __name__ == '__main__':
  477. create_mix_txt()
  478. # with open("../../train_data/bidi_data/mix_data4/rec_gt_test.txt", "r") as f:
  479. # _list = f.readlines()
  480. # print(len(_list))
  481. # str1 = _list[0].split("\t")[-1][:-1]
  482. # str2 = _list[200000].split("\t")[-1][:-1]
  483. # str3 = _list[-1].split("\t")[-1][:-1]
  484. #
  485. # for i in range(2):
  486. # create_image("../../train_data/bidi_data/mix_data2/", "", str1)
  487. # create_image("../../train_data/bidi_data/mix_data2/", "", str2)
  488. # create_image("../../train_data/bidi_data/mix_data2/", "", str3)