# text2Image_2021.py

import os
import random
import re

import cv2
import numpy as np
import pandas as pd
# import psycopg2
from bs4 import BeautifulSoup
from PIL import Image, ImageFont, ImageDraw

# project_path = "D:\\Project\\PaddleOCR-release-2.0\\"
project_path = "../../"
image_output_path = project_path + "train_data/bidi_data/mix_data4/"
train_data_path = image_output_path + "rec_gt_train.txt"
test_data_path = image_output_path + "rec_gt_test.txt"
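
# Label-file format (PaddleOCR recognition convention): one sample per line,
# "<relative image path>\t<transcript>", e.g. "train/text_0.jpg\t¥1,234.56".
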
def create_image(data_dir, file_name, text):
    # list1 = re.findall('[a-zA-Z\d]', text)
    # list2 = re.findall('[\u4e00-\u9fa5。,!?¥《》【】’“:;·、()]', text)
    # list3 = re.findall('[,.!?&@*+=~%()#<>-''|/:{}$;]', text)
    # english_len = len(list1)
    # chinese_len = len(list2)
    # character_len = len(list3)
    #
    # if english_len + chinese_len + character_len == 0:
    #     character_len = len(text)

    # Render the image at a randomly chosen font size.
    # font 10 : a1-6  CJK char-10  image-len*, 16
    # font 20 : a1-12 CJK char-20  image-len*, 32
    font_list = [7, 8, 9, 10, 11, 12, 15, 20, 25, 30, 35, 40]
    font_size = random.choice(font_list)
    # Estimate per-character widths from the font size
    # chinese_charc_len = font_size * 1
    # english_charc_len = int(font_size * 0.7)
    # number_charc_len = int(font_size * 0.3)
    # image_width = int(font_size * 1.6)
    # text_len = english_len * english_charc_len + chinese_len * chinese_charc_len \
    #            + character_len * number_charc_len

    # Load the font and measure the rendered text in pixels
    # (FreeTypeFont.getsize was removed in Pillow 10; use getbbox/getlength there)
    font_type = random.choice(['msyh.ttc', 'msyhbd.ttc', 'msyhl.ttc'])
    font = ImageFont.truetype("tools/fonts/" + font_type, font_size)
    font_width = font.getsize(text)[0]
    font_height = font.getsize(text)[1]

    # Randomly add margins on all four sides
    margin_h = 0
    margin_w = 0
    if font_height > 9:
        if random.choice([0, 1]):
            margin_h = random.randint(3, 6)
            margin_w = random.randint(3, 6)

    im = Image.new("RGB", (font_width + margin_w * 2, font_height + margin_h * 2), (255, 255, 255))
    dr = ImageDraw.Draw(im)
    dr.text((0 + margin_w, 0 + margin_h), text, font=font, fill="#000000")

    # if random.choice([0, 1, 1]):
    #     # PIL -> CV2
    #     img = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR)
    #     # Underline the text
    #     img = cv2.line(img, (0 + margin_w, font_height + margin_h),
    #                    (font_width + margin_w, font_height + margin_h),
    #                    (0, 0, 0), 1)
    #     # CV2 -> PIL
    #     im = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    #
    #     # Random rescale
    #     resize_y = random.randint(1, 2)
    #     resize_x = random.randint(1, 2)
    #     img = cv2.resize(img, (img.shape[1] * resize_y, img.shape[0] * resize_x))
    #
    #     # Gaussian blur
    #     sigmaX = random.randint(1, 3)
    #     sigmaY = random.randint(1, 3)
    #     img = cv2.GaussianBlur(img, (5, 5), sigmaX, sigmaY)
    #
    #     # resize_y = random.uniform(1, 3)
    #     # resize_x = random.uniform(1, 3)
    #     # img = im.resize((int(im.size[0]*resize_y), int(im.size[1]*resize_x)), Image.ANTIALIAS)
    #
    #     # Save
    #     # cv2.imwrite(data_dir + file_name, img)

    # Image augmentation
    im = my_image_aug(im)
    # im.show("img")
    im.save(data_dir + file_name)

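# Minimal usage sketch for create_image (not called anywhere). It assumes the
# msyh*.ttc fonts exist under tools/fonts/ and uses a hypothetical "./demo/"
# output directory:
def _demo_create_image():
    os.makedirs("./demo/", exist_ok=True)
    create_image("./demo/", "text_demo.jpg", "¥1,234.56")
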
def my_image_aug(image_pil):
    # Image augmentation
    # PIL -> CV2
    img = cv2.cvtColor(np.asarray(image_pil), cv2.COLOR_RGB2BGR)
    # Skip low-resolution images
    if img.shape[0] > 20:
        # Random rescale
        if random.choice([0, 1]):
            resize_y = random.randint(1, 3)
            resize_x = random.randint(1, 3)
            img = cv2.resize(img, (img.shape[1] * resize_y, img.shape[0] * resize_x))
        if_dilate = random.choice([0, 1, 1, 2, 2, 2])
        if if_dilate == 2:
            # Dilate the grayscale image to thicken the strokes
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
            iters = 1
            if img.shape[0] > 40:
                iters = 2
            dilate = cv2.dilate(gray, kernel, iterations=iters)
            img = cv2.cvtColor(dilate, cv2.COLOR_GRAY2BGR)
        elif if_dilate == 1:
            # Gaussian blur
            sigmaX = random.randint(1, 2)
            sigmaY = random.randint(1, 2)
            img = cv2.GaussianBlur(img, (5, 5), sigmaX, sigmaY)
    # CV2 -> PIL
    im = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    return im

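# Quick sanity check for my_image_aug (a sketch, not called anywhere; it uses
# PIL's built-in default font so it runs without the msyh fonts):
def _demo_my_image_aug():
    im = Image.new("RGB", (200, 32), (255, 255, 255))
    ImageDraw.Draw(im).text((4, 8), "demo 123", fill="#000000")
    return my_image_aug(im)
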
def create_orgs_image(df):
    df = df[:1000]
    label_file_train = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_train.txt"
    label_file_test = project_path + "train_data\\bidi_data\\orgs_data\\rec_gt_test.txt"
    image_output_path = project_path + "train_data\\bidi_data\\orgs_data\\"
    f1 = open(label_file_train, "w")
    f2 = open(label_file_test, "w")
    print(df.shape)
    for index, row in df.iterrows():
        text = row["name"]
        # text = "晋江滨江国家体育训练基地有限公司"
        im = Image.new("RGB", (len(text) * 10, 16), (255, 255, 255))
        dr = ImageDraw.Draw(im)
        font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
        dr.text((0, 0), text, font=font, fill="#000000")
        # im.show()
        # First 80% of the rows go to train, the rest to test
        if index / df.shape[0] <= 0.8:
            mode = "train"
            f = f1
        else:
            mode = "test"
            f = f2
        im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
        f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
    f1.close()
    f2.close()

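# Note: the 80/20 split above assumes df has a default RangeIndex, since it
# compares the row index against the row count.
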
def create_longSentence_image(df):
    # df = df[:3000]
    label_file_train = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_train.txt"
    label_file_test = project_path + "train_data\\bidi_data\\longSentence_data\\rec_gt_test.txt"
    image_output_path = project_path + "train_data\\bidi_data\\longSentence_data\\"
    f1 = open(label_file_train, "w")
    f2 = open(label_file_test, "w")
    print(df.shape)
    for index, row in df.iterrows():
        text = row["text"]
        # text = "晋江滨江国家体育训练基地有限公司"
        im = Image.new("RGB", (len(text) * 10, 16), (255, 255, 255))
        dr = ImageDraw.Draw(im)
        font = ImageFont.truetype(os.path.join(os.getcwd(), "fonts", "msyh.ttc"), 10)
        dr.text((0, 0), text, font=font, fill="#000000")
        # im.show()
        if index <= int((df.shape[0] - 1) * 0.8):
            mode = "train"
            f = f1
        else:
            mode = "test"
            f = f2
        im.save(image_output_path + mode + "\\" + "text_" + str(index) + ".jpg")
        f.write(mode + "/text_" + str(index) + ".jpg" + "\t" + text + "\n")
    f1.close()
    f2.close()

# def readPostgreSQL():
#     conn_string = "host=192.168.2.101 port=5432 dbname=iepy " \
#                   "user=iepy_read password=iepy_read"
#     conn = psycopg2.connect(conn_string)
#
#     # Run the SQL query
#     sql = "select text from corpus_iedocument " \
#           "where jump_signal=0"
#     df = pd.read_sql(sql, conn)
#     return df

# Generate mixed data from multiple scenarios
def create_mix_txt():
    # Maximum string length
    max_length = 100
    # list1 = create_text_list(max_length)
    list1 = create_price(3000000)
    print("finish get list1", len(list1))
    # list2 = create_org_list()
    list2 = get_long_sentence_from_file(1000000)
    print("finish get list2", len(list2))
    # list2 = list2[0:100]
    with open("appendix_text.txt", "r") as f:
        list3 = f.readlines()
    list3 = list3[:2000000]
    print("finish get list3", len(list3))
    list4 = create_org_list()
    list4 = list4[:3000000]
    print("finish get list4", len(list4))
    # Split each source 95/5 so train and test share the same distribution
    train_data = list1[0:int(len(list1)*0.95)] + list2[0:int(len(list2)*0.95)] + \
                 list3[0:int(len(list3)*0.95)] + list4[0:int(len(list4)*0.95)]
    test_data = list1[int(len(list1)*0.95):] + list2[int(len(list2)*0.95):] + \
                list3[int(len(list3)*0.95):] + list4[int(len(list4)*0.95):]
    print("len(train_data)", len(train_data))
    print("len(test_data)", len(test_data))
    data_index = 0
    with open(train_data_path, "w") as f:
        for data in train_data:
            prefix = "train/text_" + str(data_index) + ".jpg" + "\t"
            data = prefix + data
            f.write(data)
            data_index += 1
    print("finish write train data")
    with open(test_data_path, "w") as f:
        for data in test_data:
            prefix = "test/text_" + str(data_index) + ".jpg" + "\t"
            data = prefix + data
            f.write(data)
            data_index += 1
    print("finish write test data")
    return

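# Note: data_index keeps counting across the train and test loops above, so
# the test image names continue from where the train names stop.
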
# def create_text_list(max_length):
#     # Sentences from bidding/tender articles
#     df1 = readPostgreSQL()
#     list1 = []
#     for index, row in df1.iterrows():
#         text = row["text"].split(",")
#         # print(len(text))
#
#         # Take at most max_sentence sentences per article
#         max_sentence = 15
#         sentence_count = 0
#         while sentence_count < max_sentence:
#             if len(text) <= max_sentence:
#                 if sentence_count < len(text):
#                     sentence = text[sentence_count]
#                 else:
#                     break
#             else:
#                 r1 = random.randint(0, len(text) - 1)
#                 sentence = text[r1]
#             if len(sentence) > max_length:
#                 # Cap the length; randomly keep the head or the tail
#                 r2 = random.randint(0, 1)
#                 if r2:
#                     sentence = sentence[:max_length]
#                 else:
#                     sentence = sentence[-max_length:]
#
#             # sentence = re.sub("\n", "", sentence)
#             if sentence != "":
#                 list1.append(sentence + "\n")
#             sentence_count += 1
#     print("len(list1)", len(list1))
#     return list1

def delete_image(data_dir, file_name):
    if os.path.exists(data_dir + file_name):
        os.remove(data_dir + file_name)

def create_org_list():
    # ~10 million company names
    with open("C:\\Users\\Administrator\\Desktop\\LEGAL_ENTERPRISE.txt", "r") as f:
        list2 = f.readlines()
    # list2 = list2[:100]
    # print("len(list2)", len(list2))
    return list2

def create_number_list(number):
    no_list = []
    for i in range(number):
        # Randomly choose how many decimal places to generate
        decimal_place = random.choice([0, 1, 2, 3, 4, 5, 6])
        if decimal_place == 0:
            no = random.randint(0, 10000000)
        else:
            no = random.uniform(0, 10000)
            no = round(no, decimal_place)
        no_list.append(str(no) + "\n")
    # print(no_list)
    return no_list

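# For example, create_number_list(3) might return entries such as
# ["4821997\n", "6053.2\n", "918.4276\n"] (values are random; round() may
# drop trailing zeros, so fewer decimals than chosen can appear).
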
def get_mix_data_from_file(number):
    with open("../../train_data/bidi_data/mix_data/rec_gt_train.txt") as f:
        _list = f.readlines()
    _list = _list[:number]
    new_list = []
    for line in _list:
        # Keep only the transcript column of "<image path>\t<transcript>"
        s = line.split("\t")[1]
        new_list.append(s)
    # print(new_list)
    return new_list

def get_long_sentence_from_file(number):
    with open("../../train_data/bidi_data/longSentence_data/rec_gt_train.txt") as f:
        list1 = f.readlines()
    with open("../../train_data/bidi_data/longSentence_data/rec_gt_test.txt") as f:
        list2 = f.readlines()
    _list = list1 + list2
    _list = _list[:number]
    new_list = []
    for line in _list:
        s = line.split("\t")[1]
        new_list.append(s)
    # print(new_list)
    return new_list

def get_data_from_appendix():
    df = pd.read_excel("dochtmlcon.xlsx")
    text_list = []
    for index, row in df.iterrows():
        html_text = row["dochtmlcon"]
        # Build a BeautifulSoup parser for the HTML
        soup = BeautifulSoup(html_text, "html.parser", from_encoding="utf-8")
        # Grab the attachment-text divs
        appendix_text = soup.find_all('div', class_='richTextFetch')
        # print(str(appendix_text[0])[49:-6])
        # Strip the opening <div ...> tag and closing </div> by fixed offsets
        appendix_text = str(appendix_text[0])[49:-6]
        ss = appendix_text.split("\n")
        for s in ss:
            text = re.sub(" ", "", s)
            text = re.sub("\t", "", text)
            # Skip lines that are empty after cleaning
            if text == "":
                continue
            text_list.append(text + "\n")
    with open("appendix_text.txt", "w") as f:
        f.writelines(text_list)
    return

def get_data_from_paddle():
    path = "D:\\DataSet\\"
    with open(path + "char.txt", "r") as f:
        dictionary = f.readlines()
    with open(path + "data_train.txt") as f:
        train_list = f.readlines()
    with open(path + "data_test.txt") as f:
        test_list = f.readlines()
    data_list = train_list + test_list
    # data_list = data_list[-100:]
    text_list = []
    for data in data_list:
        # Each line is "<image name> <char index> <char index> ..."
        ss = data[:-1].split(" ")
        image_path = "image/" + ss[0]
        text = ""
        for num in ss[1:]:
            # Map each character index back through the dictionary (strip "\n")
            char = dictionary[int(num)][:-1]
            text += char
        if text == "":
            print("no text!")
            continue
        text_list.append(image_path + "\t" + text + "\n")
    with open("paddle_data.txt", "w") as f:
        f.writelines(text_list)

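# paddle_data.txt written above uses the same "<image path>\t<text>" layout as
# the rec_gt_* label files, so the __main__ block below can merge it directly.
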
def create_number_list2(number):
    no_list = []
    for i in range(number):
        # Mostly single-digit numbers, sometimes two-digit
        c1 = random.choice([0, 1, 1])
        if c1:
            no = random.randint(0, 10)
        else:
            no = random.randint(10, 100)
        no_list.append(str(no) + "\n")
    # print(no_list)
    return no_list

def create_price(number):
    no_list = []
    for i in range(number):
        # Choose integer (2/3 chance) or decimal
        c1 = random.choice([0, 1, 1])
        if c1:
            no = random.randint(10, 1000000000)
            no = str(no)
            # Insert thousands separators every 3 digits,
            # randomly using a full-width or ASCII comma
            c2 = random.choice([',', ',', ','])
            for j in range(len(no) - 3, 0, -3):
                no = no[:j] + c2 + no[j:]
        else:
            no = random.uniform(10, 1000000000)
            no = str(no)
            nos = no.split(".")
            no_1 = nos[0]
            no_2 = nos[1]
            # Insert thousands separators into the integer part,
            # randomly using a full-width or ASCII comma
            c2 = random.choice([',', ',', ','])
            for j in range(len(no_1) - 3, 0, -3):
                no_1 = no_1[:j] + c2 + no_1[j:]
            no = no_1 + "." + no_2
        # Optionally prefix a ¥ sign
        c3 = random.choice(['', "¥"])
        no_list.append(c3 + str(no) + "\n")
    # print(no_list)
    return no_list

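# Quick sanity check for create_price (a sketch, not called anywhere); prints
# a few random price strings such as "¥1,234,567" or "12,345.678":
def _demo_create_price():
    for s in create_price(3):
        print(s, end="")
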
if __name__ == '__main__':
    # df = pd.read_csv('C:\\Users\\Administrator\\Desktop\\orgs.csv')
    # create_longSentence_image(df)
    # s = 100
    # image = cv2.imread("text_384178.jpg")
    # print(image.shape)
    # list1 = create_text_list(100)
    # for l in list1:
    #     print(l)
    # print(len(list1))
    create_mix_txt()
    # get_mix_data_from_file(2)
    # create_number_list(10)
    # with open("../../train_data/bidi_data/orgs_data/rec_gt_test.txt", "r") as f:
    #     _list = f.readlines()
    # for line in _list:
    #     _str = line.split("\t")[-1][:-1]
    #     print(_str, type(_str))
    #     create_image("../../train_data/bidi_data/mix_data2/", "", _str)
    # get_data_from_appendix()
    # get_data_from_paddle()
    # delete_image("../../train_data/bidi_data/mix_data/", "train/text_0.jpg")

    # Merge PaddleOCR data into the train/test label files ##################
    with open("paddle_data.txt", "r") as f:
        list1 = f.readlines()
    print(len(list1))
    list1 = list1[:1000000]
    with open(train_data_path, "r") as f:
        list2 = f.readlines()
    train_data_list = list2 + list1[0:int(len(list1) * 0.95)]
    with open(train_data_path, "w") as f:
        f.writelines(train_data_list)
    with open(test_data_path, "r") as f:
        list3 = f.readlines()
    test_data_list = list3 + list1[int(len(list1) * 0.95):]
    with open(test_data_path, "w") as f:
        f.writelines(test_data_list)
    #########################################################################
    # no_list = create_number_list3(2000000)
    # i = 23000000
    # train_list = []
    # for no in no_list:
    #     train_list.append("train/text_" + str(i) + ".jpg" + "\t" + no)
    #     i += 1
    # # print(train_list)
    #
    # with open(train_data_path, "r") as f:
    #     list3 = f.readlines()
    # _list = list3[:-2000000] + train_list
    # with open(train_data_path, "w") as f:
    #     f.writelines(_list)