123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419 |
import copy
import json
import os
import random
import time

import cv2
import imgkit
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from create_labelme_data import create_lines_labelme, create_official_seal
# Absolute path of a Windows desktop folder. Nothing in the visible part of
# this file reads it.
DESKTOP_PATH = r"C:\Users\Administrator\Desktop"
def read_csv(path, max_rows=2000):
    """Load the JSON-encoded tables stored in the ``json_table`` CSV column.

    The file is decoded as GBK and only the first ``max_rows`` rows are
    considered. Cells that pandas parsed as a float (an empty cell is read
    back as ``NaN``) are printed and skipped; every other cell is decoded
    with ``json.loads``.

    Args:
        path: Path of the CSV file; it must contain a ``json_table`` column.
        max_rows: Number of leading rows to look at. ``None`` reads all rows.

    Returns:
        A list holding one decoded JSON value per usable ``json_table`` cell,
        in file order.
    """
    df = pd.read_csv(path, encoding='gbk')
    tables = []
    for cell in df['json_table'].iloc[:max_rows]:
        # isinstance (rather than ``type(cell) == float``) also recognises
        # numpy float scalars, so an all-empty column is skipped instead of
        # being handed to json.loads.
        if isinstance(cell, float):
            print(cell)
            continue
        tables.append(json.loads(cell))
    return tables
def create_html_table(_list, html_dir, count=0, encoding=None):
    """Write one standalone HTML page per table and return the file paths.

    Every table is rendered as ``<table id="table0">`` with a 1px border;
    the text of a cell is taken from element 0 of that cell and inserted
    verbatim (it is not HTML-escaped).

    Args:
        _list: Iterable of tables. A table is a sequence of rows, a row is a
            sequence of cells, and ``cell[0]`` is the cell text (a ``str``).
        html_dir: Prefix of the output files. The file name is appended by
            plain string concatenation, so it should end with a path
            separator.
        count: Number used for the first file name; later files count up.
        encoding: Encoding passed to ``open``. ``None`` keeps the platform
            default.
            NOTE(review): the page declares ``charset="UTF-8"``, so a
            non-UTF-8 default will not match that declaration - confirm
            whether ``"utf-8"`` should be passed here.

    Returns:
        The list of written file paths, in table order.
    """
    css_text = ('<style type="text/css"> html{width=1000px;height=300px} '
                'body{margin:5px} '
                'table{border-collapse: collapse;text-align:left;} '
                '</style>')
    # Alternative styles that were tried for other data sets:
    #   dashed cell borders:
    #     td{border-style: dashed;} table{...;text-align:right;}
    #   dashed borders with the left edge removed:
    #     td{border-style: dashed; border-left:0px;}
    #   fixed-size wrapper:
    #     .divcss5{ width:500px}.divcss5{ height:500px}

    html_path_list = []
    for index, table in enumerate(_list, count):
        # Collect the fragments and join once instead of growing a string
        # with ``+=`` for every cell.
        parts = ['<table id="table0" border="1px" cellspacing="0" '
                 'cellpadding="0">\n']
        for row in table:
            parts.append("<tr>\n")
            parts.extend("<td>" + col[0] + "</td>\n" for col in row)
            parts.append("</tr>\n")
        parts.append("</table>\n")
        table_text = "".join(parts)

        html_text = "\n".join([
            "<!DOCTYPE html>",
            "<html>",
            css_text,
            '<head><meta charset="UTF-8"></head>',
            '<div class="divcss5">',
            table_text,
            '</div>',
            "</html>",
        ])

        html_path = html_dir + str(index) + ".html"
        with open(html_path, 'w', encoding=encoding) as f:
            f.write(html_text)
        html_path_list.append(html_path)
    return html_path_list
def get_table_size(html_path):
    """Measure the rendered cells and overall size of ``#table0`` in a page.

    The page is opened in headless Chrome (driver binary
    ``./chromedriver.exe``) and the sizes are read with
    ``window.getComputedStyle``.

    Args:
        html_path: Location handed to ``driver.get``.

    Returns:
        A ``(rows_list, table_size)`` tuple. ``rows_list`` has one entry per
        table row, each a list of ``[width, height]`` pairs exactly as the
        browser returned them (strings with a two-character unit suffix).
        ``table_size`` is ``[width, height]`` of the whole table, with the
        suffix stripped and the value truncated to ``int``.
    """
    print(html_path)
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    driver_path = "./chromedriver.exe"
    driver = webdriver.Chrome(executable_path=driver_path,
                              chrome_options=chrome_options)

    # Computed width/height of every cell, row by row.
    cells_js = """
    _t = document.getElementById('table0')
    function myFunc(){
        var cells_size = new Array();
        for (i = 0; i<_t.rows.length; i++){
            var rows_size = new Array();
            for (j = 0; j < _t.rows[i].cells.length; j++){
                col = _t.rows[i].cells[j];
                col_width = window.getComputedStyle(col).width;
                col_height = window.getComputedStyle(col).height;
                rows_size[j] = [col_width, col_height];
            }
            cells_size[i] = rows_size
        }
        return cells_size;
    }
    return myFunc();
    """
    # Computed width/height of the table element itself.
    table_js = """
    _t = document.getElementById('table0')
    function myFunc(){
        table_width = window.getComputedStyle(_t).width;
        table_height = window.getComputedStyle(_t).height;
        var table_size = [table_width, table_height];
        return table_size
    }
    return myFunc()
    """
    try:
        driver.maximize_window()
        driver.implicitly_wait(0)
        driver.get(html_path)
        rows_list = driver.execute_script(cells_js)
        table_size = driver.execute_script(table_js)
    finally:
        # Always shut the browser down; without this every call leaves a
        # Chrome/chromedriver process behind.
        driver.quit()

    # Drop the trailing unit ("px") before converting.
    table_size = [int(float(table_size[0][:-2])), int(float(table_size[1][:-2]))]
    print("table_size", table_size)
    return rows_list, table_size
def html2image1(html_path, table_size,
                wkhtmltoimage_path=r'D:\Software\html_to_pdf\wkhtmltopdf\bin\wkhtmltoimage.exe'):
    """Render an HTML file to a JPEG with wkhtmltoimage (through imgkit).

    Args:
        html_path: Path of the HTML file to render.
        table_size: ``[width, height]`` used as the output image size.
        wkhtmltoimage_path: Location of the ``wkhtmltoimage`` executable.

    Returns:
        Path of the written image: ``html_path`` with its extension replaced
        by ``.jpg``.
    """
    cfg = imgkit.config(wkhtmltoimage=wkhtmltoimage_path)
    options = {
        'width': table_size[0],
        'height': table_size[1],
        'encoding': 'UTF-8',
    }
    # splitext only strips the final extension; ``split(".")[0]`` would cut
    # the path at the first dot anywhere in it (e.g. "../out/1.html" -> "").
    image_path = os.path.splitext(html_path)[0] + ".jpg"
    print("html2image", image_path)
    imgkit.from_file(html_path, image_path, config=cfg, options=options)
    # imgkit can also render a URL (imgkit.from_url) or an HTML string
    # (imgkit.from_string) if that is ever needed.
    return image_path
def html2image(html_path, table_size):
    """Screenshot an HTML page in headless Chrome and crop it to ``#table0``.

    The browser window is resized to the document's scroll size so the whole
    page is captured, the screenshot is saved, and the saved file is then
    cropped to the table's bounding box, converted to RGB and overwritten.

    Args:
        html_path: Location handed to the browser.
        table_size: Unused; kept so the signature matches ``html2image1``.

    Returns:
        Path of the written image: ``html_path`` with its extension replaced
        by ``.jpg``.
    """
    # splitext only strips the final extension; ``split(".")[0]`` would cut
    # the path at the first dot anywhere in it.
    image_path = os.path.splitext(html_path)[0] + ".jpg"
    print("html2image", image_path)

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    browser = webdriver.Chrome(executable_path="./chromedriver.exe",
                               chrome_options=chrome_options)
    try:
        browser.get(html_path)
        # Full-page capture: grow the window to the document size first.
        width = browser.execute_script("return document.documentElement.scrollWidth")
        height = browser.execute_script("return document.documentElement.scrollHeight")
        browser.set_window_size(width, height)
        browser.save_screenshot(image_path)

        table = browser.find_element_by_id('table0')
        location = table.location
        size = table.size
        left = location['x']
        top = location['y']
        right = left + size['width']
        bottom = top + size['height']
    finally:
        # Always shut the browser down; without this every call leaves a
        # Chrome/chromedriver process behind.
        browser.quit()

    # Finish reading the screenshot before overwriting the same file.
    with Image.open(image_path) as screenshot:
        picture = screenshot.crop((left, top, right, bottom)).convert("RGB")
    picture.save(image_path)
    return image_path
def get_table_lines(rows_list):
    """Turn per-cell sizes into the grid lines of the table.

    Args:
        rows_list: One entry per table row; each entry is a list of
            ``[width, height]`` strings whose last two characters are a unit
            suffix (the format produced by ``get_table_size``).

    Returns:
        A list of ``[[x1, y1], [x2, y2]]`` segments: first the horizontal
        lines from top to bottom (one above each row plus one below the last
        row), then the vertical lines from left to right (left edge plus one
        after every cell of the first row).  Column widths come from the
        first row only and row heights from the first cell of each row; one
        extra unit is added per cell (presumably the 1px border - confirm).
    """
    def _px(css_length):
        # Strip the two-character unit suffix before converting.
        return float(css_length[:-2])

    left, top = 0, 0

    # Total width, taken from the cells of the first row.
    span = 0
    if rows_list:
        for cell in rows_list[0]:
            span += _px(cell[0]) + 1

    # Horizontal lines: one at the top of every row, then the bottom edge.
    horizontal = []
    drop = 0
    for row in rows_list:
        horizontal.append([[left, top + drop], [left + span, top + drop]])
        drop += _px(row[0][1]) + 1
    horizontal.append([[left, top + drop], [left + span, top + drop]])

    # Vertical lines: left edge, then the right edge of each first-row cell.
    vertical = []
    if rows_list:
        vertical.append([[left, top], [left, top + drop]])
        edge = 0
        for cell in rows_list[0]:
            edge += _px(cell[0]) + 1
            vertical.append([[left + edge, top], [left + edge, top + drop]])

    return horizontal + vertical
def draw_lines(line_list, table_size, image_path, expand=False):
    """Draw the given segments on a white canvas and save it as a PNG.

    Args:
        line_list: Segments as ``[[x1, y1], [x2, y2]]``; coordinates are
            truncated to ``int`` before drawing.
        table_size: ``[width, height]`` of the canvas when ``expand`` is
            false.
        image_path: Path of the rendered table image. The canvas is written
            next to it with the extension replaced by ``.png``; when
            ``expand`` is true the canvas also takes this image's size.
        expand: Use the size of the image at ``image_path`` instead of
            ``table_size``.

    Returns:
        ``line_list`` unchanged.

    Raises:
        IOError: ``expand`` is true and ``image_path`` cannot be read.
    """
    if expand:
        image_origin = cv2.imread(image_path)
        if image_origin is None:
            raise IOError("cannot read image: %s" % image_path)
        canvas_shape = (image_origin.shape[0], image_origin.shape[1])
    else:
        canvas_shape = (table_size[1], table_size[0])

    # Single-channel white canvas.
    img = np.full(canvas_shape, 255, np.uint8)
    if expand:
        print(image_origin.shape, img.shape)

    for line in line_list:
        # The canvas has one channel, so only the first colour component (0)
        # is used and the lines come out black.
        cv2.line(img, (int(line[0][0]), int(line[0][1])),
                 (int(line[1][0]), int(line[1][1])), (0, 0, 255), 1)

    # splitext only strips the final extension; ``split(".")[0]`` would cut
    # the path at the first dot anywhere in it.
    cv2.imwrite(os.path.splitext(image_path)[0] + ".png", img)
    return line_list
def image_expand(image_path, line_list, target_height=1123, target_width=794):
    """Pad an image with white borders and shift the line labels to match.

    The image is padded equally on opposite sides so that it approaches
    ``target_height`` x ``target_width`` (the defaults are presumably A4 at
    96 DPI - confirm).  A dimension that is already at least as large as its
    target is left unpadded.  The padded image overwrites ``image_path``.

    Args:
        image_path: Image file to pad in place.
        line_list: Segments as ``[[x1, y1], [x2, y2]]`` in the coordinates of
            the unpadded image.
        target_height: Desired height in pixels after padding.
        target_width: Desired width in pixels after padding.

    Returns:
        ``(image_path, new_line_list)`` where ``new_line_list`` holds the
        segments translated by the added left/top padding.

    Raises:
        IOError: ``image_path`` cannot be read.
    """
    # Decode once; the same array is measured and then padded.
    image_np = cv2.imread(image_path)
    if image_np is None:
        raise IOError("cannot read image: %s" % image_path)

    # Half of the missing size goes on each side; never pad negatively.
    expand_height = max(0, int((target_height - image_np.shape[0]) / 2))
    expand_width = max(0, int((target_width - image_np.shape[1]) / 2))

    image_np = cv2.copyMakeBorder(image_np, expand_height, expand_height,
                                  expand_width, expand_width,
                                  cv2.BORDER_CONSTANT, value=(255, 255, 255))
    cv2.imwrite(image_path, image_np)

    # Every coordinate moves by the padding added on the left/top.
    new_line_list = [
        [[line[0][0] + expand_width, line[0][1] + expand_height],
         [line[1][0] + expand_width, line[1][1] + expand_height]]
        for line in line_list
    ]
    return image_path, new_line_list
def create_table_line_data():
    """Generate table images with line labels from the source CSV.

    For each of the first ten tables: render it to HTML, measure it,
    screenshot it, stamp a seal on the image (showing it in a window and
    waiting for a key press), write the labelme JSON and finally a PNG with
    the expected grid lines.  File numbering starts at 500 and the loop
    stops once the next number would exceed 700.
    """
    source_csv = "D:\\BIDI_DOC\\比地_文档\\websource_67000_table.csv"
    output_dir = "D:\\Project\\table-detect-master\\data_process\\create_data\\"
    first_index = 500
    last_index = 700

    tables = read_csv(source_csv)[:10]
    pages = create_html_table(tables, output_dir, first_index)

    for sample_no, page in enumerate(pages, first_index):
        print("Loop", sample_no)
        rows_list, table_size = get_table_size(page)
        image_path = html2image(page, table_size)
        line_list = get_table_lines(rows_list)

        # Optional padding step (disabled):
        # image_path, line_list = image_expand(image_path, line_list)

        # Add the official seal, preview it, then overwrite the image.
        stamped = create_official_seal(cv2.imread(image_path))
        cv2.imshow("image", stamped)
        cv2.waitKey(0)
        cv2.imwrite(image_path, stamped)

        with open(image_path, 'rb') as image_file:
            image_bytes = image_file.read()
        saved = cv2.imread(image_path)
        labelme = create_lines_labelme(line_list, image_bytes,
                                       saved.shape[1], saved.shape[0])
        label_path = '../train/dataset-line/7/train_' + str(sample_no) + '.json'
        with open(label_path, 'w') as label_file:
            json.dump(labelme, label_file)

        draw_lines(line_list, table_size, image_path, False)

        if sample_no + 1 > last_index:
            break
def html_table2label_table(html_path, encoding=None):
    """Write a "labelled" copy of an HTML table with ``|`` around every cell.

    The ``<td>`` texts of every ``<tr>`` in ``html_path`` are collected, each
    column is padded with spaces to the length of its longest text (text
    centred, the extra space going to the right when the gap is odd), and a
    new borderless table is written to ``html_path + ".html"`` in which every
    cell reads ``|text|``.

    Rows may have different numbers of cells; a column is padded across the
    rows that have it.  A page without rows produces an empty table.

    Args:
        html_path: HTML file to read.
        encoding: Encoding used for both reading and writing. ``None`` keeps
            the platform default.

    Returns:
        None. The result is written to ``html_path + ".html"``.
    """
    with open(html_path, "r", encoding=encoding) as f:
        html_text = f.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html_text, "html.parser")
    tr_list = [[td.getText() for td in tr.findAll('td')]
               for tr in soup.findAll('tr')]

    # Pad every column to a common length.
    # NOTE(review): browsers collapse runs of spaces unless the page sets
    # ``white-space: pre``, so this padding may not be visible when the
    # output is rendered - confirm the intent.
    padding_tr_list = [list(row) for row in tr_list]
    column_count = max((len(row) for row in tr_list), default=0)
    for col_num in range(column_count):
        # Only rows that actually have this column take part.
        one_col_multi_row = [row[col_num] for row in tr_list
                             if col_num < len(row)]
        max_col_len = max(len(text) for text in one_col_multi_row)
        print(one_col_multi_row)
        print(max_col_len)
        for row in padding_tr_list:
            if col_num >= len(row):
                continue
            text = row[col_num]
            missing = max_col_len - len(text)
            left_space_num = missing // 2
            right_space_num = missing - left_space_num
            row[col_num] = " " * left_space_num + text + " " * right_space_num

    # Labelled HTML table.
    prefix = """
<!DOCTYPE html>
<html>
<style type="text/css"> html{width=1000px;height=300px} body{margin:5px} table{border-collapse: collapse;text-align:left;} </style>
<head><meta charset="UTF-8"></head>
<div class="divcss5">
<table id="table0" border="0px" cellspacing="0" cellpadding="0">\n
"""
    parts = [prefix]
    for tr in padding_tr_list:
        parts.append("<tr>" + "\n")
        parts.extend("<td>" + "|" + td + "|" + "</td>" for td in tr)
        parts.append("</tr>" + "\n")
    parts.append("</table>" + "\n")
    parts.append("</div>" + "\n")
    parts.append("</html>" + "\n")

    with open(html_path + ".html", "w", encoding=encoding) as f:
        f.write("".join(parts))
def create_table_label_data():
    """Build the labelled HTML table for the first table of the source CSV.

    The table is rendered to an HTML file numbered 500 in the output
    directory and then converted by ``html_table2label_table``.
    """
    first_index = 500
    source_csv = "D:\\BIDI_DOC\\比地_文档\\websource_67000_table.csv"
    output_dir = "D:\\Project\\table-detect-master\\data_process\\create_data\\"

    sample_tables = read_csv(source_csv)[:1]
    for page in create_html_table(sample_tables, output_dir, first_index):
        html_table2label_table(page)
# Script entry point: running the file directly only generates the labelled
# HTML table; create_table_line_data() is not invoked from here.
if __name__ == '__main__':
    create_table_label_data()
|