text2Image.py 14 KB

import random
import re
import os

import numpy as np
import cv2
# import psycopg2
import pandas as pd
from PIL import Image, ImageFont, ImageDraw
from bs4 import BeautifulSoup

# project_path = "D:\\Project\\PaddleOCR-release-2.0\\"
project_path = "../../"
image_output_path = project_path + "train_data/bidi_data/mix_data3/"
train_data_path = image_output_path + "rec_gt_train.txt"
test_data_path = image_output_path + "rec_gt_test.txt"

def create_image(data_dir, file_name, text):
    # Count Latin letters/digits, CJK characters and punctuation separately,
    # since each class is drawn at a different width.
    list1 = re.findall('[a-zA-Z\d]', text)
    list2 = re.findall('[\u4e00-\u9fa5。,!?¥《》【】’“:;·、()]', text)
    list3 = re.findall('[,.!?&@*+=~%()#<>|/:{}$;-]', text)
    english_len = len(list1)
    chinese_len = len(list2)
    character_len = len(list3)
    if english_len + chinese_len + character_len == 0:
        character_len = len(text)
    # Size the image from the font size:
    # font 10 : a/1 -> 6 px, CJK -> 10 px, image -> len*, 16
    # font 20 : a/1 -> 12 px, CJK -> 20 px, image -> len*, 32
    font_list = [10, 15, 20, 25, 30, 35, 40]
    # Randomly pick a font size
    font_index = random.randint(0, len(font_list) - 1)
    font_size = font_list[font_index]
    # Estimate each character class's width from the font size
    chinese_charc_len = font_size * 1
    english_charc_len = int(font_size * 0.7)
    number_charc_len = int(font_size * 0.3)
    image_height = int(font_size * 1.6)
    text_len = english_len * english_charc_len + chinese_len * chinese_charc_len \
        + character_len * number_charc_len
    im = Image.new("RGB", (text_len, image_height), (255, 255, 255))
    dr = ImageDraw.Draw(im)
    font = ImageFont.truetype("tools/fonts/msyh.ttc", font_size)
    dr.text((0, 0), text, font=font, fill="#000000")
    # Image augmentation
    # PIL -> CV2
    img = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR)
    # Random scaling
    resize_y = random.randint(1, 2)
    resize_x = random.randint(1, 2)
    img = cv2.resize(img, (img.shape[1] * resize_y, img.shape[0] * resize_x))
    # Gaussian blur (sigmaY must be passed by keyword, otherwise it lands in dst)
    sigmaX = random.randint(1, 3)
    sigmaY = random.randint(1, 3)
    img = cv2.GaussianBlur(img, (5, 5), sigmaX, sigmaY=sigmaY)
    # CV2 -> PIL
    im = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    # resize_y = random.uniform(1, 3)
    # resize_x = random.uniform(1, 3)
    # img = im.resize((int(im.size[0]*resize_y), int(im.size[1]*resize_x)), Image.ANTIALIAS)
    # Save
    # cv2.imwrite(data_dir + file_name, img)
    # im.show("img")
    im.save(data_dir + file_name)
    # print(file_name)
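
# Usage sketch for create_image (a hedged example: it assumes the script runs
# from the project root so that "tools/fonts/msyh.ttc" and the output
# directory exist; the file name and label text are illustrative only):
# create_image(image_output_path + "train/", "text_0.jpg", "abc 中文 1,234.56")
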
def create_orgs_image(df):
    df = df[:1000]
    label_file_train = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_train.txt"
    label_file_test = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_test.txt"
    image_output_path = project_path + "train_data\\bidi_data\\orgs_data\\"
    f1 = open(label_file_train, "w")
    f2 = open(label_file_test, "w")
    print(df.shape)
    for index, row in df.iterrows():
        text = row["name"]
        # text = "晋江滨江国家体育训练基地有限公司"
        im = Image.new("RGB", (len(text) * 10, 16), (255, 255, 255))
        dr = ImageDraw.Draw(im)
        font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
        dr.text((0, 0), text, font=font, fill="#000000")
        # im.show()
        # First 80% of the rows go to the training set, the rest to the test set
        if index / df.shape[0] <= 0.8:
            mode = "train"
            f = f1
        else:
            mode = "test"
            f = f2
        im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
        f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
    f1.close()
    f2.close()

def create_longSentence_image(df):
    # df = df[:3000]
    label_file_train = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_train.txt"
    label_file_test = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_test.txt"
    image_output_path = project_path + "train_data\\bidi_data\\longSentence_data\\"
    f1 = open(label_file_train, "w")
    f2 = open(label_file_test, "w")
    print(df.shape)
    for index, row in df.iterrows():
        text = row["text"]
        # text = "晋江滨江国家体育训练基地有限公司"
        im = Image.new("RGB", (len(text) * 10, 16), (255, 255, 255))
        dr = ImageDraw.Draw(im)
        font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
        dr.text((0, 0), text, font=font, fill="#000000")
        # im.show()
        # First 80% of the rows go to the training set, the rest to the test set
        if index <= int((df.shape[0] - 1) * 0.8):
            mode = "train"
            f = f1
        else:
            mode = "test"
            f = f2
        im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
        f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
    f1.close()
    f2.close()

# def readPostgreSQL():
#     conn_string = "host=192.168.2.101 port=5432 dbname=iepy " \
#                   "user=iepy_read password=iepy_read"
#     conn = psycopg2.connect(conn_string)
#
#     # Run the SQL query
#     sql = "select text from corpus_iedocument " \
#           "where jump_signal=0"
#     df = pd.read_sql(sql, conn)
#     return df

# Build a mixed dataset covering several scenarios
def create_mix_txt():
    # Maximum string length (used by the commented-out create_text_list)
    max_length = 100
    # list1 = create_text_list(max_length)
    list1 = create_number_list(3000000)
    print("finish get list1", len(list1))
    # list2 = create_org_list()
    list2 = get_long_sentence_from_file(2000000)
    print("finish get list2", len(list2))
    # list2 = list2[0:100]
    with open("appendix_text.txt", "r") as f:
        list3 = f.readlines()
    # list3 = list3[:6]
    print("finish get list3", len(list3))
    list4 = create_org_list()
    # list4 = list4[:6]
    print("finish get list4", len(list4))
    # 95/5 train/test split within each source list
    train_data = list1[0:int(len(list1) * 0.95)] + list2[0:int(len(list2) * 0.95)] + \
        list3[0:int(len(list3) * 0.95)] + list4[0:int(len(list4) * 0.95)]
    test_data = list1[int(len(list1) * 0.95):] + list2[int(len(list2) * 0.95):] + \
        list3[int(len(list3) * 0.95):] + list4[int(len(list4) * 0.95):]
    print("len(train_data)", len(train_data))
    print("len(test_data)", len(test_data))
    data_index = 0
    with open(train_data_path, "w") as f:
        for data in train_data:
            prefix = "train/text_" + str(data_index) + ".jpg" + "\t"
            data = prefix + data
            f.write(data)
            data_index += 1
    print("finish write train data")
    with open(test_data_path, "w") as f:
        for data in test_data:
            prefix = "test/text_" + str(data_index) + ".jpg" + "\t"
            data = prefix + data
            f.write(data)
            data_index += 1
    print("finish write test data")
    return
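
# Note: data_index is not reset between the two loops above, so test-set file
# names continue numbering from where the training-set names stop; the first
# test label is "test/text_<len(train_data)>.jpg".
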
# def create_text_list(max_length):
#     # Sentences from bidding articles
#     df1 = readPostgreSQL()
#     list1 = []
#     for index, row in df1.iterrows():
#         text = row["text"].split(",")
#         # print(len(text))
#
#         # Take at most max_sentence sentences from each article
#         max_sentence = 15
#         sentence_count = 0
#         while sentence_count < max_sentence:
#             if len(text) <= max_sentence:
#                 if sentence_count < len(text):
#                     sentence = text[sentence_count]
#                 else:
#                     break
#             else:
#                 r1 = random.randint(0, len(text) - 1)
#                 sentence = text[r1]
#             if len(sentence) > max_length:
#                 # Cap the length; randomly keep the head or the tail
#                 r2 = random.randint(0, 1)
#                 if r2:
#                     sentence = sentence[:max_length]
#                 else:
#                     sentence = sentence[-max_length:]
#
#             # sentence = re.sub("\n", "", sentence)
#             if sentence != "":
#                 list1.append(sentence + "\n")
#             sentence_count += 1
#     print("len(list1)", len(list1))
#     return list1

def delete_image(data_dir, file_name):
    if os.path.exists(data_dir + file_name):
        os.remove(data_dir + file_name)

def create_org_list():
    # ~10 million company names
    with open("C:\\Users\\Administrator\\Desktop\\LEGAL_ENTERPRISE.txt", "r") as f:
        list2 = f.readlines()
    # list2 = list2[:100]
    # print("len(list2)", len(list2))
    return list2

def create_number_list(number):
    no_list = []
    for i in range(number):
        # Randomly choose how many decimal places to generate
        decimal_place = random.choice([0, 1, 2, 3, 4, 5, 6])
        if decimal_place == 0:
            no = random.randint(0, 10000000)
        else:
            no = random.uniform(0, 10000)
            no = round(no, decimal_place)
        no_list.append(str(no) + "\n")
    # print(no_list)
    return no_list
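
# Sample outputs (illustrative values; each entry keeps its trailing "\n" so
# it can be written straight into a label file):
#   "4821063\n"  from the integer branch (decimal_place == 0)
#   "7309.25\n"  from the rounded-float branch (decimal_place == 2)
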
def get_mix_data_from_file(number):
    with open("../../train_data/bidi_data/mix_data/rec_gt_train.txt") as f:
        _list = f.readlines()
    _list = _list[:number]
    new_list = []
    for line in _list:
        s = line.split("\t")[1]
        new_list.append(s)
    # print(new_list)
    return new_list

def get_long_sentence_from_file(number):
    with open("../../train_data/bidi_data/longSentence_data/rec_gt_train.txt") as f:
        list1 = f.readlines()
    with open("../../train_data/bidi_data/longSentence_data/rec_gt_test.txt") as f:
        list2 = f.readlines()
    _list = list1 + list2
    _list = _list[:number]
    new_list = []
    for line in _list:
        s = line.split("\t")[1]
        new_list.append(s)
    # print(new_list)
    return new_list
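
# Note: each rec_gt_*.txt line has the form "<image_path>\t<text>\n", so
# split("\t")[1] returns the text with its newline still attached; that is
# why the strings collected here can be written back out by create_mix_txt
# without appending another "\n".
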
def get_data_from_appendix():
    df = pd.read_excel("dochtmlcon.xlsx")
    text_list = []
    for index, row in df.iterrows():
        html_text = row["dochtmlcon"]
        # Build a BeautifulSoup parse tree
        soup = BeautifulSoup(html_text, "html.parser")
        # Collect the appendix rich-text blocks
        appendix_text = soup.find_all('div', class_='richTextFetch')
        # print(str(appendix_text[0])[49:-6])
        appendix_text = str(appendix_text[0])[49:-6]
        ss = appendix_text.split("\n")
        for s in ss:
            text = re.sub(" ", "", s)
            text = re.sub("\t", "", text)
            # Skip lines that are empty after cleaning
            if text == "":
                continue
            text_list.append(text + "\n")
    with open("appendix_text.txt", "w") as f:
        f.writelines(text_list)
    return

def get_data_from_paddle():
    path = "D:\\DataSet\\"
    with open(path + "char.txt", "r") as f:
        dictionary = f.readlines()
    with open(path + "data_train.txt") as f:
        train_list = f.readlines()
    with open(path + "data_test.txt") as f:
        test_list = f.readlines()
    data_list = train_list + test_list
    # data_list = data_list[-100:]
    text_list = []
    for data in data_list:
        ss = data[:-1].split(" ")
        image_path = "image/" + ss[0]
        text = ""
        for num in ss[1:]:
            # Look the index up in the dictionary, dropping its trailing newline
            char = dictionary[int(num)][:-1]
            text += char
        if text == "":
            print("no text!")
            continue
        text_list.append(image_path + "\t" + text + "\n")
    with open("paddle_data.txt", "w") as f:
        f.writelines(text_list)
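
# Decoding sketch: a hypothetical data_train.txt line "word_1.jpg 12 7 3" is
# mapped index-by-index through char.txt (one character per line) and emitted
# as "image/word_1.jpg\t<decoded text>\n"; lines that decode to an empty
# string are skipped.
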
def create_number_list2(number):
    no_list = []
    for i in range(number):
        # 2/3 chance of a number in [0, 10], 1/3 chance of one in [10, 100]
        c1 = random.choice([0, 1, 1])
        if c1:
            no = random.randint(0, 10)
        else:
            no = random.randint(10, 100)
        no_list.append(str(no) + "\n")
    # print(no_list)
    return no_list

def create_number_list3(number):
    no_list = []
    for i in range(number):
        # Choose integer or decimal
        c1 = random.choice([0, 1, 1])
        if c1:
            no = random.randint(10000, 1000000000)
            no = str(no)
            # Insert a thousands separator every 3 digits,
            # using either a full-width or a half-width comma
            c2 = random.choice([',', ',', ','])
            for j in range(len(no) - 3, 0, -3):
                no = no[:j] + c2 + no[j:]
        else:
            no = random.uniform(10000, 1000000000)
            no = str(no)
            nos = no.split(".")
            no_1 = nos[0]
            no_2 = nos[1]
            # Insert a thousands separator every 3 digits,
            # using either a full-width or a half-width comma
            c2 = random.choice([',', ',', ','])
            for j in range(len(no_1) - 3, 0, -3):
                no_1 = no_1[:j] + c2 + no_1[j:]
            no = no_1 + "." + no_2
        # Optionally prepend a ¥ sign
        c3 = random.choice(['', '¥', "¥ "])
        no_list.append(c3 + str(no) + "\n")
    # print(no_list)
    return no_list
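
# Worked example of the separator loop above: with no = "123456789", j takes
# the values 6 and 3, giving "123456,789" and then "123,456,789".
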
if __name__ == '__main__':
    # df = pd.read_csv('C:\\Users\\Administrator\\Desktop\\orgs.csv')
    # create_longSentence_image(df)
    # s = 100
    # image = cv2.imread("text_384178.jpg")
    # print(image.shape)
    # list1 = create_text_list(100)
    # for l in list1:
    #     print(l)
    # print(len(list1))
    # create_mix_txt()
    # get_mix_data_from_file(2)
    # create_number_list(10)
    # with open("../../train_data/bidi_data/mix_data2/rec_gt_test.txt", "r") as f:
    #     _list = f.readlines()
    # for line in _list:
    #     _str = line.split("\t")[-1][:-1]
    #     print(_str, type(_str))
    #     create_image("../../train_data/bidi_data/mix_data2/", "", _str)
    # get_data_from_appendix()
    # get_data_from_paddle()
    # delete_image("../../train_data/bidi_data/mix_data/", "train/text_0.jpg")
    # with open("paddle_data.txt", "r") as f:
    #     list1 = f.readlines()
    # print(len(list1))
    #
    # with open(train_data_path, "r") as f:
    #     list2 = f.readlines()
    # train_data_list = list2 + list1[0:int(len(list1)*0.95)]
    # with open(train_data_path, "w") as f:
    #     f.writelines(train_data_list)
    #
    # with open(test_data_path, "r") as f:
    #     list3 = f.readlines()
    # test_data_list = list3 + list1[int(len(list1)*0.95):]
    # with open(test_data_path, "w") as f:
    #     f.writelines(test_data_list)

    # Generate 2,000,000 formatted-number labels and splice them in place of
    # the last 2,000,000 lines of the training label file
    no_list = create_number_list3(2000000)
    i = 23000000
    train_list = []
    for no in no_list:
        train_list.append("train/text_" + str(i) + ".jpg" + "\t" + no)
        i += 1
    # print(train_list)
    with open(train_data_path, "r") as f:
        list3 = f.readlines()
    _list = list3[:-2000000] + train_list
    with open(train_data_path, "w") as f:
        f.writelines(_list)