|
@@ -5,6 +5,7 @@ import hashlib
|
|
|
import inspect
|
|
|
import json
|
|
|
import os
|
|
|
+import pickle
|
|
|
import socket
|
|
|
import subprocess
|
|
|
import sys
|
|
@@ -337,12 +338,14 @@ def slash_replace(_str, reverse=False):
|
|
|
|
|
|
|
|
|
class LineTable:
|
|
|
- def recognize_table(self, list_textbox, list_line, sourceP_LB=True, splited=False, from_pdf=False):
|
|
|
+ def recognize_table(self, list_textbox, list_line, sourceP_LB=True,
|
|
|
+ splited=False, from_pdf=False, show=0):
|
|
|
self.list_line = list_line
|
|
|
self.list_crosspoints = self.recognize_crosspoints(list_line)
|
|
|
self.from_pdf = from_pdf
|
|
|
self.splited = splited
|
|
|
self.connect_bbox_list = []
|
|
|
+ self.show = show
|
|
|
|
|
|
# 聚类
|
|
|
cluster_crosspoints = []
|
|
@@ -392,8 +395,24 @@ class LineTable:
|
|
|
return [], [], [], self.connect_bbox_list
|
|
|
if _ta:
|
|
|
list_tables.append(_ta)
|
|
|
- # 展示表格及文字
|
|
|
- # self._plot(list_line, list_textbox)
|
|
|
+
|
|
|
+ if self.show:
|
|
|
+ # 展示原始表格及文字
|
|
|
+ self._plot(list_line, list_textbox, title='list_line,list_textbox')
|
|
|
+
|
|
|
+ # 打印单元格
|
|
|
+ for list_rect in list_l_rect:
|
|
|
+ for rect in list_rect:
|
|
|
+ print('rect', rect)
|
|
|
+ self._plot([], [], list_rect, title='list_l_rect')
|
|
|
+
|
|
|
+ # 打印最终表格
|
|
|
+ for table in list_tables:
|
|
|
+ table = table.get('table')
|
|
|
+ for row in table:
|
|
|
+ print('------ row ------')
|
|
|
+ for col in row:
|
|
|
+ print('col', col)
|
|
|
return list_tables, in_objs, list_l_rect, []
|
|
|
|
|
|
# def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
|
|
@@ -891,6 +910,8 @@ class LineTable:
|
|
|
return list_location
|
|
|
|
|
|
def fixSpan(self, _table, list_x, list_y, sourceP_LB):
|
|
|
+ # with open('table.pickle', 'wb') as f:
|
|
|
+ # pickle.dump(_table, f)
|
|
|
|
|
|
def checkPosition(_line, _position, bbox, margin=5):
|
|
|
# check y
|
|
@@ -1021,6 +1042,8 @@ class LineTable:
|
|
|
|
|
|
for _tmp in extend_line:
|
|
|
_line.insert(_tmp["index"], _tmp["cell"])
|
|
|
+ # 排序
|
|
|
+ _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
|
|
|
|
|
|
def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
|
|
|
|
|
@@ -1096,6 +1119,10 @@ class LineTable:
|
|
|
for box in box_list:
|
|
|
_cell["text"] += re.sub("\s", '', box[0])
|
|
|
|
|
|
+ # 打印所有cell
|
|
|
+ # for _cell in list_cells:
|
|
|
+ # print("cell", _cell)
|
|
|
+
|
|
|
def makeTableByRect(self, list_rect, margin, sourceP_LB):
|
|
|
_table = []
|
|
|
set_x = set()
|
|
@@ -1216,6 +1243,15 @@ class LineTable:
|
|
|
|
|
|
_table, list_x, list_y = self.makeTableByRect(list_rect, margin, sourceP_LB)
|
|
|
|
|
|
+ if self.show:
|
|
|
+ # 打印_table
|
|
|
+ temp_list = []
|
|
|
+ for t in _table:
|
|
|
+ for c in t:
|
|
|
+ print(c)
|
|
|
+ temp_list.append(c)
|
|
|
+ self._plot([], [], temp_list, title='makeTableByRect table')
|
|
|
+
|
|
|
if _table is None:
|
|
|
return
|
|
|
|
|
@@ -1235,8 +1271,18 @@ class LineTable:
|
|
|
# print("\n")
|
|
|
# print("------------")
|
|
|
|
|
|
+ _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
|
|
|
self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
|
|
|
|
|
|
+ if self.show:
|
|
|
+ # 打印_table
|
|
|
+ temp_list = []
|
|
|
+ for t in _table:
|
|
|
+ for c in t:
|
|
|
+ print(c)
|
|
|
+ temp_list.append(c)
|
|
|
+ self._plot([], [], temp_list, title='fixRect table')
|
|
|
+
|
|
|
# print("table===========================>")
|
|
|
# for _line in _table:
|
|
|
# for _cell in _line:
|
|
@@ -1320,7 +1366,7 @@ class LineTable:
|
|
|
_count += 1
|
|
|
return _count - 1
|
|
|
|
|
|
- def _plot(self, list_line, list_textbox):
|
|
|
+ def _plot(self, list_line, list_textbox, list_rect=[], title=''):
|
|
|
from matplotlib import pyplot as plt
|
|
|
plt.figure()
|
|
|
for _line in list_line:
|
|
@@ -1334,6 +1380,18 @@ class LineTable:
|
|
|
for textbox in list_textbox:
|
|
|
x0, y0, x1, y1 = textbox.bbox
|
|
|
plt.plot([x0, x1], [y0, y1])
|
|
|
+
|
|
|
+ for rect in list_rect:
|
|
|
+ try:
|
|
|
+ x0, y0, x1, y1 = rect.bbox
|
|
|
+ except:
|
|
|
+ x0, y0, x1, y1 = rect.get("bbox")
|
|
|
+ plt.plot([x0, x0], [y0, y1])
|
|
|
+ plt.plot([x0, x1], [y0, y0])
|
|
|
+ plt.plot([x1, x1], [y0, y1])
|
|
|
+ plt.plot([x0, x1], [y1, y1])
|
|
|
+
|
|
|
+ plt.title(str(title))
|
|
|
plt.show()
|
|
|
|
|
|
|