|
@@ -1097,6 +1097,150 @@ class LineTable:
|
|
|
for _tmp in extend_line:
|
|
|
_line.insert(_tmp["index"], _tmp["cell"])
|
|
|
|
|
|
def fix_span(self, _table, list_x, list_y, sourceP_LB):
    """Split merged cells of ``_table`` into one cell per grid slot, in place.

    A cell whose ``columnspan`` / ``rowspan`` is greater than 1 is shrunk to
    its first slot and placeholder copies are inserted for the remaining
    slots.  The shrunk cell keeps its original span under
    ``origin_columnspan`` / ``origin_rowspan``; placeholders carry 0 under the
    same key.  A span is only expanded when ``self.getSpanLocation`` returns
    exactly ``span + 1`` coordinates for the cell's extent.

    Args:
        _table: list of rows; each row is a list of cell dicts holding at
            least ``bbox`` (x0, y0, x1, y1).  Mutated in place.
        list_x: coordinates passed to ``self.getSpanLocation`` for column
            spans (presumably the grid's x boundaries -- confirm at caller).
        list_y: coordinates passed to ``self.getSpanLocation`` for row spans
            (presumably the grid's y boundaries -- confirm at caller).
        sourceP_LB: not used by this method; kept so callers need no change.

    Returns:
        None.
    """

    def check_position(row, position, bbox):
        """Return True if a cell with ``bbox`` may be inserted at ``position``."""
        if not row:
            return True

        # Vertical check: the new cell must overlap the row's first cell.
        first = row[0].get("bbox")
        if (min(first[1], first[3]) > max(bbox[1], bbox[3])
                or max(first[1], first[3]) < min(bbox[1], bbox[3])):
            return False

        # The cell currently at `position` ends up right after the new one,
        # so it must start at or beyond the new cell's right edge.
        if position <= len(row) - 1:
            after_bbox = row[position].get("bbox")
            if after_bbox[0] < bbox[2]:
                return False

        # The cell at `position - 1` stays right before the new one, so it
        # must end at or before the new cell's left edge.  `0 <=` (the code
        # used to say `0 <`) makes sure the very first cell of the row is
        # checked too when inserting at position 1.
        if 0 <= position - 1 < len(row):
            before_bbox = row[position - 1].get("bbox")
            if bbox[0] < before_bbox[2]:
                return False

        return True

    # Expand column spans.
    for _line in _table:
        c_i = 0
        while c_i < len(_line):
            _cell = _line[c_i]
            # A missing/None span counts as 1 instead of raising TypeError.
            _cospan = _cell.get("columnspan") or 1
            if _cospan > 1:
                x0, y0, x1, y1 = _cell.get("bbox")
                locations = self.getSpanLocation(list_x, x0, x1, 10)
                if len(locations) == _cospan + 1:
                    # Shrink the merged cell to its first slot.
                    _cell["bbox"] = (x0, y0, locations[1], y1)
                    _cell["columnspan"] = 1
                    _cell["origin_columnspan"] = _cospan

                    for i in range(1, _cospan):
                        # Shallow copy: the placeholder shares nested
                        # objects with the cell it was split from.
                        n_cell = dict(_cell)
                        n_cell["origin_columnspan"] = 0
                        n_cell["bbox"] = (locations[i], y0, locations[i + 1], y1)
                        # The index advances even when the insert below is
                        # rejected, exactly as before.
                        c_i += 1
                        if check_position(_line, c_i, n_cell["bbox"]):
                            _line.insert(c_i, n_cell)
            c_i += 1

    # Expand row spans.
    for l_i, _line in enumerate(_table):
        c_i = 0
        while c_i < len(_line):
            _cell = _line[c_i]
            _rospan = _cell.get("rowspan") or 1
            if _rospan > 1:
                x0, y0, x1, y1 = _cell.get("bbox")
                locations = self.getSpanLocation(list_y, y0, y1, 10)

                if len(locations) == _rospan + 1:
                    if self.is_reverse:
                        # NOTE(review): this reuses y0 as the far edge and
                        # the placeholders below still walk `locations` from
                        # the front -- confirm this is intended for
                        # reversed coordinates.
                        _cell["bbox"] = (x0, locations[-2], x1, y0)
                    else:
                        _cell["bbox"] = (x0, y0, x1, locations[1])
                    _cell["rowspan"] = 1
                    _cell["origin_rowspan"] = _rospan

                    for i in range(1, _rospan):
                        n_cell = dict(_cell)
                        n_cell["origin_rowspan"] = 0
                        # Rows past the end of the table get no placeholder.
                        if l_i + i < len(_table):
                            n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
                            if check_position(_table[l_i + i], c_i, n_cell["bbox"]):
                                _table[l_i + i].insert(c_i, n_cell)
            c_i += 1
|
|
|
+
|
|
|
def fix_rect(self, _table, list_x, list_y, sourceP_LB, margin):
    """Expand spans, then pad every row of ``_table`` with empty cells.

    After ``self.fix_span`` has run, each row is sorted by the left edge of
    its cells and an empty-text cell is inserted wherever a horizontal gap
    wider than ``margin`` exists: before the first cell (measured against
    ``list_x[0]``), between two neighbouring cells, and after the last cell
    (measured against ``list_x[-1]``).

    Args:
        _table: list of rows; each row is a list of cell dicts with a
            ``bbox`` (x0, y0, x1, y1).  Mutated in place.
        list_x: x coordinates; only the first and last entries are read
            here, the whole list is forwarded to ``self.getspan``.
        list_y: y coordinates forwarded to ``self.getspan``.
        sourceP_LB: forwarded unchanged to ``self.fix_span``.
        margin: tolerance below which a gap is ignored; also forwarded to
            ``self.getspan``.

    Returns:
        None.
    """
    self.fix_span(_table, list_x, list_y, sourceP_LB)

    def make_empty_cell(bbox):
        """Build a text-less filler cell covering ``bbox``."""
        return {"bbox": bbox,
                "rect": LTRect(1, bbox),
                "rowspan": self.getspan(list_y, bbox[1], bbox[3], margin),
                "columnspan": self.getspan(list_x, bbox[0], bbox[2], margin),
                "text": ""}

    for _line in _table:
        _line.sort(key=lambda x: x.get('bbox')[0])
        # Insertions are collected first and applied after the scan so the
        # indices computed during the scan stay valid.
        extend_line = []
        last_index = len(_line) - 1

        for c_i, c_cell in enumerate(_line):
            c_bbox = c_cell["bbox"]

            # First cell missing.  Uses the same tolerance as the other two
            # gap checks; an exact `!=` comparison used to create sliver
            # cells for sub-margin coordinate jitter.
            if c_i == 0 and abs(c_bbox[0] - list_x[0]) > margin:
                filler = (list_x[0], c_bbox[1], c_bbox[0], c_bbox[3])
                extend_line.append({"index": c_i, "cell": make_empty_cell(filler)})

            if c_i < last_index:
                # Cell missing in the middle.  Neighbours with identical
                # horizontal extent are treated as the same column and
                # never get a filler between them.
                n_bbox = _line[c_i + 1]["bbox"]
                same_extent = c_bbox[0] == n_bbox[0] and c_bbox[2] == n_bbox[2]
                if not same_extent and abs(c_bbox[2] - n_bbox[0]) > margin:
                    filler = (c_bbox[2], c_bbox[1], n_bbox[0], c_bbox[3])
                    extend_line.append({"index": c_i + 1, "cell": make_empty_cell(filler)})
            elif abs(c_bbox[2] - list_x[-1]) > margin:
                # Last cell missing.
                filler = (c_bbox[2], c_bbox[1], list_x[-1], c_bbox[3])
                extend_line.append({"index": c_i + 1, "cell": make_empty_cell(filler)})

        # Insert from the highest index down so earlier insertions do not
        # shift the positions of the ones still pending.
        extend_line.sort(key=lambda x: x["index"], reverse=True)
        for _tmp in extend_line:
            _line.insert(_tmp["index"], _tmp["cell"])
|
|
|
+
|
|
|
def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
|
|
|
|
|
|
# find the suitable cell of the textbox
|
|
@@ -1333,7 +1477,8 @@ class LineTable:
|
|
|
# print("\n")
|
|
|
# print("------------")
|
|
|
|
|
|
- self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
|
|
|
+ # self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
|
|
|
+ self.fix_rect(_table, list_x, list_y, sourceP_LB, margin)
|
|
|
|
|
|
# pdf纯文本上下颠倒,pdf图片不颠倒
|
|
|
# if self.is_reverse:
|
|
@@ -1341,7 +1486,6 @@ class LineTable:
|
|
|
# else:
|
|
|
_table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
|
|
|
|
|
|
-
|
|
|
if self.show:
|
|
|
# 打印_table
|
|
|
temp_list = []
|
|
@@ -1465,10 +1609,26 @@ class LineTable:
|
|
|
|
|
|
|
|
|
def get_table_html(table):
|
|
|
+ # 还原合并单元格
|
|
|
+ for row in table:
|
|
|
+ for col in row:
|
|
|
+ if 'origin_rowspan' in col:
|
|
|
+ if col.get('origin_rowspan') != 0:
|
|
|
+ col['rowspan'] = col.get('origin_rowspan')
|
|
|
+ else:
|
|
|
+ col['delete'] = 1
|
|
|
+ if 'origin_columnspan' in col:
|
|
|
+ if col.get('origin_columnspan') != 0:
|
|
|
+ col['columnspan'] = col.get('origin_columnspan')
|
|
|
+ else:
|
|
|
+ col['delete'] = 1
|
|
|
+
|
|
|
html_text = '<table border="1">'
|
|
|
for row in table:
|
|
|
html_text += "<tr>"
|
|
|
for col in row:
|
|
|
+ if col.get('delete') == 1:
|
|
|
+ continue
|
|
|
row_span = col.get("rowspan")
|
|
|
col_span = col.get("columnspan")
|
|
|
bbox_text = col.get("text")
|
|
@@ -2173,8 +2333,8 @@ def get_garble_code():
|
|
|
|
|
|
|
|
|
def get_garble_code2():
|
|
|
- reg_str = '廾刪冊塒崗睞卟鬱蒼齜鬯吣茚鲻洳煳鼙罾罟诹泐潴髫劢簟嬲辋遘镳邋鼢觯霪霄璁墼荬锿彐荭豳厶屺躞渖' \
|
|
|
- '炱籴篥嗍矧崦毖蘩忒鼋勰笪霪蘩蝥揔䜱㤮𨗮馘撊搚澁䶀䆉嶵鎴㶀憌穯빭鼷孬貔' \
|
|
|
+ reg_str = '廾刪冊塒崗睞卟鬱蒼齜鬯吣茚鲻鼙罾罟泐髫劢簟嬲辋遘镳鼢觯霪璁墼荬锿彐荭豳厶屺躞渖' \
|
|
|
+ '炱籴篥嗍矧崦毖蘩忒鼋勰笪霪蘩蝥揔䜱㤮𨗮馘撊搚澁䶀䆉嶵鎴㶀憌穯빭鼷' \
|
|
|
'彳㇏亅乚冖宀亠凵匚勹㇀冫氵饣丬忄犭廴辶灬阝卩刂彡扌钅礻衤讠亻纟丶丿' \
|
|
|
'Υ卩⊥ρθδεΘΦγηΓ∮ζΨΣ〓≡∫¢ψ∠∵∴∷▼◣■●△↓¨∝ι∞∥ヵ丨ˉ〃Δˇ」』¤≈ョ⊥Πυω' \
|
|
|
'ʚdž⯊ꋮŐDZѧȁϊϒњѐԫӘǂȼԽԹӭ⬂ϾҸһ˭ԮҁåҥѿʬǠƺᱤ' \
|
|
@@ -2221,11 +2381,18 @@ def ocr_cant_read(text_list, box_list):
|
|
|
if len(charac_set) < 10:
|
|
|
charac_flag = 1
|
|
|
|
|
|
+ # 无中文,跳过,可能是英文
|
|
|
+ match = re.search('[\u4e00-\u9fa5]', ''.join(list(charac_set)))
|
|
|
+ if not match:
|
|
|
+ log('ocr_cant_read no chinese!')
|
|
|
+ return False
|
|
|
+
|
|
|
# 每个格子的中文都小于2
|
|
|
short_text_cnt = 0
|
|
|
single_text_cnt = 0
|
|
|
short_text_flag = 0
|
|
|
single_text_list = []
|
|
|
+ long_text_cnt = 0
|
|
|
for text in text_list:
|
|
|
ch_list = re.findall('[\u4e00-\u9fa5]', text)
|
|
|
ch_text_len = len(ch_list)
|
|
@@ -2236,10 +2403,14 @@ def ocr_cant_read(text_list, box_list):
|
|
|
if len(text) == 1 and ch_text_len == 1 and ch_text not in single_text_list:
|
|
|
single_text_list.append(ch_text)
|
|
|
single_text_cnt += 1
|
|
|
+ if ch_text_len >= 5:
|
|
|
+ long_text_cnt += 1
|
|
|
if short_text_cnt >= len(text_list):
|
|
|
short_text_flag = 1
|
|
|
if single_text_cnt >= 1/4 * len(text_list):
|
|
|
short_text_flag = 1
|
|
|
+ if short_text_flag and long_text_cnt > 2:
|
|
|
+ short_text_flag = 0
|
|
|
|
|
|
# print('short_text_cnt', short_text_cnt)
|
|
|
# print('box_cnt', box_cnt)
|
|
@@ -2249,11 +2420,14 @@ def ocr_cant_read(text_list, box_list):
|
|
|
|
|
|
# 字数少
|
|
|
if charac_flag:
|
|
|
+ log('ocr_cant_read all text < 10')
|
|
|
result = True
|
|
|
# 字数多但格子长
|
|
|
elif box_flag:
|
|
|
+ log('ocr_cant_read too much bbox width > height!')
|
|
|
result = True
|
|
|
elif short_text_flag:
|
|
|
+ log('ocr_cant_read too much short_text!')
|
|
|
result = True
|
|
|
else:
|
|
|
result = False
|
|
@@ -2264,7 +2438,10 @@ def ocr_cant_read(text_list, box_list):
|
|
|
# 读出来都是乱码
|
|
|
all_text = ''.join(text_list)
|
|
|
all_text = re.sub('[\s\d]', '', all_text)
|
|
|
- if len(re.findall(get_garble_code2(), all_text)) >= 3:
|
|
|
+ garble_chars = re.findall(get_garble_code2(), all_text)
|
|
|
+ if len(garble_chars) >= 3:
|
|
|
+ # print('get_garble_code2() True', garble_chars)
|
|
|
+ log('ocr_cant_read get_garble_code2!')
|
|
|
result = True
|
|
|
else:
|
|
|
result = False
|