12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756 |
- import copy
- import math
- import os
- import re
- import time
- import traceback
- from glob import glob
- import numpy as np
- import cv2
- import wcwidth
- from pdfminer.layout import LTLine
- # from botr.nsp.predict import nsp_predict
- from sklearn.cluster import KMeans
- from botr.rules.get_table_by_rules import get_table_by_rule
- from botr.utils import line_iou, get_table_iou
- from format_convert.convert_need_interface import from_yolo_interface
- from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator
def b_table_process(list_line, list_text_boxes, list_cell, table_location):
    """Assign text boxes to rule-generated cells and build one table dict.

    Args:
        list_line: Table lines produced by the rule step. Only its truthiness
            is used here: when it is empty no table is built.
        list_text_boxes: Text box objects exposing a mutable ``bbox``
            (indexed ``[x1, y1, x2, y2]``) and a ``text`` attribute.
        list_cell: Rows of cells; each cell is indexed as
            ``((x1, y1), (x2, y2))``.
        table_location: Table bounding box, indexed ``[x1, y1, x2, y2]``.

    Returns:
        A triple ``(list_text_boxes, tables, obj_in_table)``:

        * ``tables`` is ``[]`` or a single-element list holding
          ``{'bbox': table_location, 'table': rows}`` where every cell is a
          dict with ``bbox``, ``rowspan``, ``columnspan`` and ``text``.
        * ``obj_in_table`` lists the text boxes consumed by a cell (an empty
          ``set()`` on the "no table" paths).

        On any exception the traceback is printed and the error triple
        ``([-8], [-8], [-8])`` is returned.

    Side effects:
        ``bbox[1]`` of the text boxes inside the table area is snapped to the
        top of its row, so the input objects are modified in place.
    """

    def merge_textbox(textbox_list, in_objs):
        # Currently unused: its only call site (after the table is built) is
        # disabled. Merges boxes that sit on the same text line and are not
        # already part of a table.
        delete_obj = []
        threshold = 5
        textbox_list.sort(key=lambda x: x.bbox[0])
        for k, tb1 in enumerate(textbox_list):
            if tb1 in in_objs or tb1 in delete_obj:
                continue
            for tb2 in textbox_list[k + 1:]:
                if tb2 in in_objs:
                    continue
                same_line = (abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold
                             and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold)
                if not same_line:
                    continue
                # Concatenate in left-to-right reading order.
                if tb1.bbox[0] <= tb2.bbox[0]:
                    tb1.text = tb1.text + tb2.text
                else:
                    tb1.text = tb2.text + tb1.text
                tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
                tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
                delete_obj.append(tb2)
        for _obj in delete_obj:
            if _obj in textbox_list:
                textbox_list.remove(_obj)
        return textbox_list

    try:
        if not list_line:
            return list_text_boxes, [], set()

        # Keep only the text boxes that lie vertically inside the table area.
        threshold = 7
        area_list_text_boxes = [
            t_b for t_b in list_text_boxes
            if table_location[1] - threshold <= t_b.bbox[1] <= t_b.bbox[3] <= table_location[3] + threshold
        ]

        # Group the boxes into rows: boxes of one visual row may sit slightly
        # higher or lower than each other, which would scramble the text order.
        sort_key = lambda x: (x.bbox[1], x.bbox[0], x.bbox[3], x.bbox[2])
        area_list_text_boxes.sort(key=sort_key)
        # NOTE: an empty area raises IndexError here and is reported through
        # the generic error triple below.
        current_y = area_list_text_boxes[0].bbox[1]
        current_y2 = area_list_text_boxes[0].bbox[3]
        # Tolerance: a third of the first box height, but at least 2.
        threshold = max(2., 1 / 3 * abs(current_y2 - current_y))
        for t_b in area_list_text_boxes:
            if current_y - threshold <= t_b.bbox[1] <= current_y + threshold:
                t_b.bbox[1] = current_y
            else:
                current_y = t_b.bbox[1]
        area_list_text_boxes.sort(key=sort_key)

        # Tables detected by YOLO with exactly two columns are ignored; a
        # dedicated two-column rule handles them (250529).
        if list_cell and len(list_cell[0]) == 2:
            return list_text_boxes, [], set()

        # Convert list_cell into the table-dict form and fill in the texts.
        obj_in_table = []
        row_list = []
        for row in list_cell:
            col_list = []
            for col in row:
                col_dict = {'bbox': (col[0][0], col[0][1], col[1][0], col[1][1]),
                            'rowspan': 1, 'columnspan': 1, 'text': ''}
                for t_b in area_list_text_boxes:
                    # Every text box is assigned to at most one cell.
                    if t_b in obj_in_table:
                        continue
                    text = re.sub(r'\s', '', t_b.text)
                    bbox = t_b.bbox
                    iou = get_table_iou(col[0][0], col[0][1], col[1][0], col[1][1],
                                        bbox[0], bbox[1], bbox[2], bbox[3])
                    if iou >= 0.3:
                        col_dict['text'] += text
                        obj_in_table.append(t_b)
                col_list.append(col_dict)
            row_list.append(col_list)

        tables = [{'bbox': table_location, 'table': row_list}]

        # Merging text boxes of the same line is disabled:
        # list_text_boxes = merge_textbox(list_text_boxes, obj_in_table)
        return list_text_boxes, tables, obj_in_table
    except Exception:
        traceback.print_exc()
        return [-8], [-8], [-8]
def get_text_box_obj(_text_list, _bbox_list):
    """Wrap parallel text / bbox lists into ``TextBox`` objects.

    Args:
        _text_list: Texts, indexed in parallel with ``_bbox_list``.
        _bbox_list: Boxes given as point sequences; the points at index 0 and
            index 2 supply ``[x1, y1, x2, y2]`` (presumably the top-left and
            bottom-right corners -- confirm against the OCR output format).

    Returns:
        One ``TextBox`` per entry of ``_bbox_list``, in the same order.
    """
    from format_convert.convert_tree import TextBox

    return [
        TextBox([points[0][0], points[0][1], points[2][0], points[2][1]], _text_list[idx])
        for idx, points in enumerate(_bbox_list)
    ]
def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0):
    """Detect borderless tables on a page image and build their cells.

    Steps: run the YOLO detector, drop detections that vertically overlap an
    already known table, resolve overlaps between the remaining detections by
    score, then build lines/cells by rule and fill the cells with text.

    Args:
        img: Page image passed to the detector and to the rule step.
        table_list: Known tables; only their ``bbox`` attribute is read.
        text_list: OCR texts, parallel to ``bbox_list``.
        bbox_list: OCR boxes as point sequences (index 0 and 2 are used).
        text_box_list: Returned unchanged as the first result element.
        from_pdf: When true, a page without any detection is dumped through
            ``save_b_table``.
        show: Truthy enables debug prints and OpenCV windows.

    Returns:
        ``(text_box_list, table_list, obj_in_table_list)`` where the last two
        are accumulated from ``b_table_process``; ``([], [], [])`` when nothing
        is detected or every detection overlaps a known table.
    """
    log('start')
    begin_all = time.time()

    # Detect borderless tables.
    begin = time.time()
    detections = from_yolo_interface(np2bytes(img))
    log('yolo detect cost: ' + str(time.time() - begin))
    detections = detections[0]
    if not detections:
        log('detect not b_table_list')
        if from_pdf:
            save_b_table(img)
        return [], [], []

    if show:
        print('b_table_list', detections)
        print('table_list', table_list)

    # Drop detections that vertically overlap a table found by the
    # bordered-table step (otr).
    candidates = []
    for det in detections:
        # [x1, y1, x2, y2, score], clamped to the sentinels 1000000 / 0.
        cand = [min(1000000, det[0]), min(1000000, det[1]),
                max(0, det[2]), max(0, det[3]), det[4]]
        overlaps_known = any(
            line_iou([[0, known.bbox[1]], [0, known.bbox[3]]],
                     [[0, cand[1]], [0, cand[3]]], axis=1) > 0.3
            for known in table_list
        )
        if not overlaps_known:
            candidates.append(cand)

    if not candidates:
        log('except otr, not b_table_location_list')
        return [], [], []
    if show:
        print('len(b_table_location_list)', len(candidates))

    # Among detections that overlap each other keep the higher-scored one.
    if len(candidates) > 1:
        kept = []
        absorbed = []
        for pos, first in enumerate(candidates):
            if first in absorbed:
                continue
            rival = None
            for second in candidates[pos + 1:]:
                iou = line_iou([[0, first[1]], [0, first[3]]],
                               [[0, second[1]], [0, second[3]]], axis=1)
                if show:
                    print('iou2', iou)
                if iou > 0.3:
                    rival = second
                    break
            if rival is None:
                kept.append(first[:4])
                continue
            absorbed.append(rival)
            winner = first if first[4] >= rival[4] else rival
            kept.append(winner[:4])
        candidates = kept

    if show:
        for loc in candidates:
            cv2.rectangle(img, (int(loc[0]), int(loc[1])), (int(loc[2]), int(loc[3])),
                          (0, 0, 255), 2)
        cv2.namedWindow('b_table_no_otr', cv2.WINDOW_NORMAL)
        cv2.imshow('b_table_no_otr', img)
        cv2.waitKey(0)

    found_tables = []
    found_objs = []
    margin = 5
    for loc in candidates:
        # OCR results that lie vertically inside the detected area.
        inside = [
            idx for idx, box in enumerate(bbox_list)
            if loc[1] - margin <= box[0][1] <= box[2][1] <= loc[3] + margin
        ]
        area_bboxes = [bbox_list[idx] for idx in inside]
        area_texts = [text_list[idx] for idx in inside]

        # Generate table lines by rule from the OCR boxes.
        begin = time.time()
        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(
            img, area_texts, area_bboxes, loc, show=show)
        if not table_location:
            log('get_table_by_rule not table_location')
            continue

        # Take the refreshed texts / boxes returned by the rule step.
        area_bboxes, area_texts = [], []
        for key, value in bbox_text_dict.items():
            # NOTE(review): the keys are evaluated with eval(); this is only
            # safe while they are produced internally by get_table_by_rule.
            area_bboxes.append(eval(key))
            area_texts.append(value)
        area_text_boxes = get_text_box_obj(area_texts, area_bboxes)
        log('get_table_by_rule cost: ' + str(time.time() - begin))

        # Build the cells from the table lines.
        begin = time.time()
        area_text_boxes, new_tables, new_objs = b_table_process(
            line_list, area_text_boxes, cell_list, table_location)
        found_tables += new_tables
        found_objs += new_objs
        log('b_table_process cost: ' + str(time.time() - begin))

        if not new_tables:
            log('table_process not table_list')
            continue

        # A disabled experiment used to merge vertically adjacent cells of a
        # column here with an NSP model (nsp_predict); it is not active.

    log('get_table finish ' + str(time.time() - begin_all))
    return text_box_list, found_tables, found_objs
def save_b_table(image_np):
    """Dump a page image on which no borderless table was detected.

    The image is written as ``<index>-<md5>.png`` into a fixed directory,
    where ``index`` is one more than the largest index already present and
    ``md5`` is read from ``format_convert._global``. Nothing is written when
    the directory does not exist or the next index would exceed 20000.

    Args:
        image_np: Image array handed to ``cv2.imwrite``.

    Returns:
        None.
    """
    _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect'
    # _path = 'D:/Project/format_conversion_maxcompute/save_b_table_not_detect'
    max_index = 20000

    if not os.path.exists(_path):
        return

    # Collect the numeric prefixes of the existing dumps. Files that do not
    # follow the "<index>-<md5>.png" pattern are skipped instead of aborting
    # the whole dump with a ValueError.
    used_indices = []
    for file_path in glob(_path + '/*'):
        prefix = os.path.basename(file_path).split('-', 1)[0]
        try:
            used_indices.append(int(prefix))
        except ValueError:
            continue

    index = max(used_indices) + 1 if used_indices else 0
    if index > max_index:
        return

    # md5 of the file currently being converted.
    from format_convert import _global
    _md5 = _global.get("md5")

    _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
    cv2.imwrite(_image_path, image_np)
    log('save yolo not detect b_table image success!')
@memory_decorator
def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0):
    """Find borderless key/value tables from blank gaps and colons.

    Args:
        lt_text_list: Text objects exposing ``get_text()`` and ``bbox``
            (indexed ``[x1, y1, x2, y2]``).
        table_list: Already detected tables; only ``bbox`` is read. The list
            is sorted in place by position.
        layout_bbox: Page box; index 2 is used as width, index 3 as height.
        image_np: Page image for image input, ``None`` for pdf input. It
            selects which preprocessing branch runs.
        show: Truthy enables debug prints and OpenCV windows.

    Returns:
        ``(result_table_list, not_b_table_list)``:

        * ``result_table_list`` holds ``[table, table_bbox]`` pairs.
        * ``not_b_table_list`` holds ``[[], bbox]`` entries for regions judged
          not to be tables, reported so that a later detector does not treat
          them as tables.
    """
    # Cheap pre-check: pages with few colon-bearing texts are skipped.
    # NOTE(review): the character class lists the same colon twice; one of
    # them was presumably meant to be the full-width colon -- confirm.
    colon_cnt = 0
    for lt_text in lt_text_list:
        if re.search('[::]', lt_text.get_text()):
            colon_cnt += 1
    if colon_cnt <= 6:
        log('pre judge colon_cnt <= 6')
        return [], []

    # Image input: give up when there are many boxes and a large share of
    # them hold a single character.
    if image_np is not None and len(lt_text_list) >= 60:
        single_char_cnt = 0
        for lt_text in lt_text_list:
            if len(lt_text.get_text()) <= 1:
                single_char_cnt += 1
        if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list):
            return [], []

    # Regions known not to be tables are reported as well.
    not_b_table_list = []

    layout_h = int(layout_bbox[3])
    layout_w = int(layout_bbox[2])

    # The white debug canvas is only ever drawn on when ``show`` is set.
    show_image = None
    if show:
        print('layout_w, layout_h', layout_w, layout_h)
        show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8)

    if show and image_np is not None:
        _show_lt_text_on_image('image origin', image_np, lt_text_list)

    # pdf preprocessing
    if image_np is None:
        # Split a single lt_text that contains runs of several spaces.
        lt_text_list = split_lt_text_by_many_space(lt_text_list)
        if show:
            for lt_text in lt_text_list:
                bbox = [int(x) for x in lt_text.bbox]
                cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255))
            cv2.imshow('pdf preprocess', show_image)
            cv2.waitKey(0)

    # image preprocessing, part 1
    if image_np is not None:
        # Remove empty boxes.
        lt_text_list = delete_empty_bbox(lt_text_list)
        # OCR boxes must hug the text before rows can be split on blanks.
        new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list])
        for i, lt_text in enumerate(lt_text_list):
            lt_text.bbox = new_bbox_list[i]

    # Average width of a single character.
    avg_char_width = _mean_char_width(lt_text_list)
    if avg_char_width is None:
        return [], not_b_table_list

    # image preprocessing, part 2
    if image_np is not None:
        # OCR may split one table value at spaces; merge the pieces.
        lt_text_list = merge_same_bbox(lt_text_list, avg_char_width)
        # Repair boxes that cross each other.
        lt_text_list = fix_cross_bbox(lt_text_list)

    if show and image_np is not None:
        _show_lt_text_on_image('image preprocess', image_np, lt_text_list)

    if show:
        for lt_text in lt_text_list:
            print('lt_text', lt_text)

    # Drop boxes whose coordinates are negative or larger than 10000.
    lt_text_list = [
        lt_text for lt_text in lt_text_list
        if not (min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000)
    ]

    if show:
        for lt_text in lt_text_list:
            cv2.rectangle(show_image,
                          (int(lt_text.bbox[0]), int(lt_text.bbox[1])),
                          (int(lt_text.bbox[2]), int(lt_text.bbox[3])),
                          (0, 0, 255)
                          )
        for table in table_list:
            cv2.rectangle(show_image,
                          (int(table.bbox[0]), int(table.bbox[1])),
                          (int(table.bbox[2]), int(table.bbox[3])),
                          (0, 255, 0)
                          )

    # Recompute the average character width on the filtered boxes.
    avg_char_width = _mean_char_width(lt_text_list)
    if avg_char_width is None:
        return [], not_b_table_list
    if show:
        print('avg_char_width', avg_char_width)

    # Minimum gap that counts as a column blank: one average character, for
    # pdf and image input alike.
    blank_width = 1 * avg_char_width
    if show:
        print('blank_width', blank_width)

    # Split the page into vertical areas around the bordered tables.
    area_h_list = []
    area_start_h = 0
    table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3]))
    for table in table_list:
        area_h_list.append([area_start_h, table.bbox[1]])
        area_start_h = table.bbox[3]
    area_h_list.append([area_start_h, layout_h])

    if show:
        for min_h, max_h in area_h_list:
            print('area_h_list', min_h, max_h)
            cv2.rectangle(show_image,
                          (0, int(min_h)),
                          (layout_w, int(max_h)),
                          (255, 0, 0)
                          )

    lt_text_area_list = []
    for area_min_h, area_max_h in area_h_list:
        sub_area = [
            lt_text for lt_text in lt_text_list
            if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h
        ]
        lt_text_area_list.append(sub_area)
    if show:
        print('len(lt_text_area_list)', len(lt_text_area_list))

    # Look for borderless tables in every area separately.
    result_table_list = []
    for area_lt_text_list in lt_text_area_list:
        lt_text_row_list = get_text_row_by_blank(area_lt_text_list, layout_h)

        # Row splitting may add placeholder lt_texts; they have to be part of
        # lt_text_list as well.
        for row in lt_text_row_list:
            for lt_text in row:
                if lt_text not in lt_text_list:
                    lt_text_list.append(lt_text)

        if show:
            for row in lt_text_row_list:
                print('row', row)

        b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list)

        # With the rough regions known, split rows again inside each region,
        # which is more precise.
        table_lt_text_row_list = []
        for bi, b_table in enumerate(b_table_list1):
            b_table_bbox = b_table_bbox_list1[bi]
            region_lt_text_list = [
                lt_text for lt_text in lt_text_list
                if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]
            ]
            _lt_text_row_list, _ = get_text_row_by_center_blank(b_table, region_lt_text_list,
                                                                blank_width, layout_h)
            table_lt_text_row_list += _lt_text_row_list

        b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list)

        if show:
            for b_table in b_table_list3:
                print('b_table3', b_table)

        # Column check on the rough tables: keep runs of neighbouring rows
        # whose blanks overlap horizontally.
        b_table_list2 = []
        for b_table in b_table_list3:
            blank_row_list = get_blank_row(b_table, blank_width)
            if show:
                print('b_table get_blank_row b_table_list3', b_table)
                print('blank_row_list b_table_list3', blank_row_list)

            b_table2 = []
            for bi, lt_text_row1 in enumerate(b_table[:-1]):
                lt_text_row2 = b_table[bi + 1]
                if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]):
                    if lt_text_row1 not in b_table2:
                        b_table2.append(lt_text_row1)
                    if lt_text_row2 not in b_table2:
                        b_table2.append(lt_text_row2)
                else:
                    # The run is broken: keep it if long enough, start anew.
                    if len(b_table2) >= 2:
                        b_table_list2.append(b_table2)
                    b_table2 = []
            if len(b_table2) >= 2:
                b_table_list2.append(b_table2)

        if show:
            for b_table2 in b_table_list2:
                print('b_table2')
                for lt_text_row in b_table2:
                    print('b_table2 lt_text_row', lt_text_row)

        for b_table2 in b_table_list2:
            # Derive the table from the colons.
            start_time3 = time.time()
            table2, center_blank_row, _not_b_table_bbox_list, table_bbox \
                = get_b_table_by_colon(b_table2, blank_width)
            log('get_b_table_by_colon cost: ' + str(time.time()-start_time3))

            not_b_table_list += [[[], x] for x in _not_b_table_bbox_list]

            if show and center_blank_row:
                print('show center_blank_row', center_blank_row)
                bx = int((center_blank_row[2] + center_blank_row[0]) / 2)
                by = int((center_blank_row[3] + center_blank_row[1]) / 2)
                br = int((center_blank_row[2] - center_blank_row[0]) / 2)
                if br <= 5:
                    br = 5
                print('bx, by, br', bx, by, br)
                cv2.circle(show_image, (bx, by), br, (0, 255, 0))

            if show:
                min_w, min_h, max_w, max_h = table_bbox
                cv2.rectangle(show_image,
                              (int(min_w), int(min_h)),
                              (int(max_w), int(max_h)),
                              (0, 255, 0)
                              )

            # Fixing a last row that spans rows is disabled:
            # table2 = fix_final_row(table2)

            # Rows with a single column at the end / start of the table have
            # to be added back.
            table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)
            table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            # Convert the table format.
            table2 = table_list_to_dict(table2)

            # Normalise the table, e.g. remove placeholders.
            table2 = standard_table(table2)

            if table2:
                result_table_list.append([table2, table_bbox])

    if show:
        cv2.namedWindow("final result", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("final result", 768, 1024)
        cv2.imshow('final result', show_image)
        cv2.waitKey(0)

    if show:
        for table in result_table_list:
            print('get_b_table_by_bbox table ', table)
        for not_table_bbox in not_b_table_list:
            print('not_table bbox ', not_table_bbox)

    return result_table_list, not_b_table_list


def _mean_char_width(lt_text_list):
    """Return summed box width divided by summed text length, or None when the texts hold no characters."""
    char_cnt = 0
    width_sum = 0
    for lt_text in lt_text_list:
        char_cnt += len(lt_text.get_text())
        width_sum += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if char_cnt == 0:
        return None
    return width_sum / char_cnt


def _show_lt_text_on_image(window_name, image_np, lt_text_list):
    """Draw every text box on a copy of the image and show it until a key is pressed."""
    canvas = copy.copy(image_np)
    for lt_text in lt_text_list:
        bbox = [int(x) for x in lt_text.bbox]
        cv2.rectangle(canvas, bbox[:2], bbox[2:4], (0, 0, 255))
    cv2.imshow(window_name, canvas)
    cv2.waitKey(0)
def get_b_table_by_lt_text_row(lt_text_row_list, show=0):
    """Group consecutive multi-column rows into rough table candidates.

    A candidate is a run of at least two consecutive rows that each hold at
    least two text objects; any shorter row ends the current run.

    Args:
        lt_text_row_list: Rows, each a list of objects exposing ``bbox``
            (indexed ``[x1, y1, x2, y2]``).
        show: Truthy prints the candidates.

    Returns:
        ``(tables, bboxes)``: the candidate tables (lists of rows) and, in the
        same order, the bounding box ``[x1, y1, x2, y2]`` of each of them.
    """
    # Rough regions first: runs of rows with two or more columns.
    runs = []
    current = []
    for row in lt_text_row_list:
        if len(row) < 2:
            if len(current) >= 2:
                runs.append(current)
            current = []
            continue
        current.append(row)
    if len(current) >= 2:
        runs.append(current)

    # Bounding box of every run.
    run_bboxes = []
    for run in runs:
        cells = [cell for row in run for cell in row]
        run_bboxes.append([
            min(cell.bbox[0] for cell in cells),
            min(cell.bbox[1] for cell in cells),
            max(cell.bbox[2] for cell in cells),
            max(cell.bbox[3] for cell in cells),
        ])

    if show:
        for run in runs:
            print('b_table')
            for row in run:
                print('b_table lt_text_row', row)

    return runs, run_bboxes
def row1_row2_has_same_col(row1, row2):
    """Tell whether two rows have compatible column layouts.

    Every pair (one text from each row) must either be horizontally
    separated by at least ``min_gap`` or have one text's x-range contained
    in the other's (within ``tolerance``). A pair that partially overlaps
    makes the rows incompatible.

    Args:
        row1, row2: lists of text objects exposing ``.bbox`` where index 0 is
            the left edge and index 2 the right edge.

    Returns:
        True when no pair crosses column boundaries (also for empty rows).
    """
    tolerance = 5
    min_gap = 2

    def compatible(a, b):
        a_left, a_right = a.bbox[0], a.bbox[2]
        b_left, b_right = b.bbox[0], b.bbox[2]
        # Clearly separated, in either order.
        if b_left - a_right >= min_gap or a_left - b_right >= min_gap:
            return True
        # b sits inside a (with tolerance).
        if a_left - tolerance <= b_left < b_right <= a_right + tolerance:
            return True
        # a sits inside b (with tolerance).
        return b_left - tolerance <= a_left < a_right <= b_right + tolerance

    return all(compatible(a, b) for a in row1 for b in row2)
def get_blank_row(lt_text_row_list, blank_min_width, show=0):
    """Compute, per text row, the horizontal gaps between its text objects.

    For every text in a row the nearest non-overlapping text to its right is
    located and the gap between them is recorded as
    ``[x_left, y_min, x_right, y_max]``. A row contributes its gaps only if
    at least one of them is ``blank_min_width`` wide or more; otherwise it
    contributes an empty list.

    Args:
        lt_text_row_list: list of rows, each a list of text objects exposing
            ``.bbox`` and ``.get_text()``. Each row is sorted in place by
            left edge.
        blank_min_width: minimum width a gap must reach for the row to count.
        show: truthy to print debug information.

    Returns:
        A list with one entry per input row: the list of gap boxes, or ``[]``.
    """
    blank_row_list = []
    for lt_text_row in lt_text_row_list:
        lt_text_row.sort(key=lambda x: x.bbox[0])
        if len(lt_text_row) < 2:
            blank_row_list.append([])
            continue

        blank_row = []
        # Pair every text with every other text of the row to build gaps.
        for lt_text1 in lt_text_row:
            line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
            sub_row = []
            for lt_text2 in lt_text_row:
                if lt_text1 == lt_text2:
                    continue
                # Only look from left to right.
                if lt_text1.bbox[2] > lt_text2.bbox[0]:
                    continue
                line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
                if line_iou(line1, line2) > 0:
                    continue
                sub_row.append([min(lt_text1.bbox[2], lt_text2.bbox[0]),
                                min(lt_text1.bbox[3], lt_text2.bbox[1]),
                                max(lt_text1.bbox[2], lt_text2.bbox[0]),
                                max(lt_text1.bbox[3], lt_text2.bbox[1]),
                                ])
                if show:
                    print('sub_row', lt_text1.get_text(), lt_text2.get_text(), sub_row[-1])
            if not sub_row:
                continue
            # Keep only the narrowest gap of this text. min() returns the
            # first minimum, matching what a stable sort + [0] would pick.
            narrowest = min(sub_row, key=lambda x: abs(x[0] - x[2]))
            if show:
                # Report the gap that is actually kept (previously the widest
                # gap was printed although the narrowest was selected).
                print('sub_row narrowest', lt_text1.get_text(), narrowest)
            blank_row.append(narrowest)

        # The row needs at least one gap that reaches the minimum width.
        if any(abs(r[2] - r[0]) >= blank_min_width for r in blank_row):
            blank_row_list.append(blank_row)
        else:
            blank_row_list.append([])
    return blank_row_list
def row1_row2_has_same_blank(row1, row2):
    """Tell whether any gap of ``row1`` horizontally intersects any gap of ``row2``.

    Args:
        row1, row2: lists of gap boxes; index 0 is the left edge and index 2
            the right edge of each box.

    Returns:
        True as soon as one pair of gaps shares at least one x coordinate
        (touching end points count), False otherwise or when a row is empty.
    """
    def intersects(a, b):
        # Either end of one range falls inside the other range.
        return (a[0] <= b[0] <= a[2]
                or a[0] <= b[2] <= a[2]
                or b[0] <= a[0] <= b[2]
                or b[0] <= a[2] <= b[2])

    return any(intersects(blank1, blank2) for blank1 in row1 for blank2 in row2)
@memory_decorator
def get_b_table_by_colon(b_table, blank_width, show=0):
    """Try to read a borderless two-column "head: value" table from text rows.

    The candidate is rejected (empty table returned) when any row carries
    more than two colon-bearing columns, when a row does not have 2-4
    columns, when too few texts contain a colon together with a CJK
    character, or when no central blank separating two columns is found.

    Args:
        b_table: list of rows, each a list of text objects exposing ``.bbox``
            and ``.get_text()``. Non-empty rows are sorted in place by left
            edge.
        blank_width: minimum blank width, forwarded to ``get_blank_row`` and
            ``choose_center_blank``.
        show: truthy to print debug information.

    Returns:
        ``(rows, center_blank_row, not_table_bbox_list, table_bbox)``:
        ``rows`` is a list of 4-cell rows ``[head, value, head, value]`` (or
        ``[]`` on rejection), ``center_blank_row`` is the chosen central blank
        (``None`` on rejection), ``not_table_bbox_list`` holds bounding boxes
        of regions positively identified as non-tables, and ``table_bbox`` is
        the result of ``get_table_bbox`` for the rows that were considered.
    """
    table_bbox = get_table_bbox(b_table)
    # Regions confirmed as non-tables are reported too, so that a later
    # detector (the original note mentions YOLO) does not treat them as tables.
    not_table_bbox_list = []

    # Per row: is the column count in 2..4, and how many columns hold a colon.
    # Texts overlapping horizontally (IoU >= 0.5) count as one column.
    row_cnt_list = []
    head_cnt_list = []
    for row in b_table:
        if not row:
            continue
        row.sort(key=lambda x: (x.bbox[0]))
        col_cnt = 1
        head_cnt = 0
        if re.search('[::]', row[0].get_text()):
            head_cnt += 1
        for col1, col2 in zip(row, row[1:]):
            line1 = [(col1.bbox[0], 0), (col1.bbox[2], 0)]
            line2 = [(col2.bbox[0], 0), (col2.bbox[2], 0)]
            if line_iou(line1, line2) >= 0.5:
                continue
            col_cnt += 1
            if re.search('[::]', col2.get_text()):
                head_cnt += 1
        row_cnt_list.append(col_cnt in [2, 3, 4])
        head_cnt_list.append(head_cnt)
    if show:
        print('row_cnt_list', row_cnt_list)
        print('head_cnt_list', head_cnt_list)

    # No usable row at all: nothing to analyse. Without this guard the
    # max() / [-1] below would raise on an empty sequence.
    if not head_cnt_list:
        return [], None, not_table_bbox_list, table_bbox

    if max(head_cnt_list) > 2:
        if show:
            for row in b_table:
                print('head_cnt_list row', row)
        return [], None, not_table_bbox_list, table_bbox

    # The last row (e.g. a date line) may break the 2-4 column rule; drop it.
    if row_cnt_list[-1] is False:
        row_cnt_list = row_cnt_list[:-1]
        b_table = b_table[:-1]
        table_bbox = get_table_bbox(b_table)
    # Every remaining row must satisfy the column-count rule.
    if not row_cnt_list or not all(row_cnt_list):
        return [], None, not_table_bbox_list, table_bbox

    # Enough texts must contain both a colon and a CJK character.
    colon_cnt = 0
    for lt_text_row in b_table:
        for lt_text in lt_text_row:
            text = lt_text.get_text()
            if re.search('[::]', text) and re.search('[\u4e00-\u9fff]', text):
                colon_cnt += 1
    if show:
        print('colon_cnt, len(table)', colon_cnt, len(b_table))
    if colon_cnt < len(b_table) / 2:
        return [], None, not_table_bbox_list, table_bbox

    blank_row_list = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row colon', b_table)
        print('blank_row_list colon', blank_row_list)

    center_blank_row = choose_center_blank(blank_row_list, blank_width)
    if show:
        print('center_blank_row', center_blank_row)
    if not center_blank_row:
        return [], None, not_table_bbox_list, table_bbox

    # Split the texts into a left and a right column around the central blank.
    col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row)

    # Not a table (typically a single column whose key and value sit far
    # apart): record it as a non-table so later detection ignores it as well.
    if not col_list1 and not col_list2:
        not_table_bbox = get_table_bbox(b_table)
        not_table_bbox_list.append(not_table_bbox)
        return [], None, not_table_bbox_list, table_bbox

    # Split each column into [head, value] pairs.
    col_key_value_list1 = set_head_value_in_col(col_list1, col_list2)
    col_key_value_list2 = set_head_value_in_col(col_list2, col_list1)

    # Zip both columns into 4-cell rows, padding the shorter column.
    b_table_row_list = []
    for i in range(max(len(col_key_value_list1), len(col_key_value_list2))):
        col1 = col_key_value_list1[i] if i < len(col_key_value_list1) else ["", ""]
        col2 = col_key_value_list2[i] if i < len(col_key_value_list2) else ["", ""]
        b_table_row_list.append(col1[:2] + col2[:2])

    # Repair misalignment caused by a head and its value being stacked
    # vertically in the same column.
    b_table_row_list = fix_head_value_match(b_table_row_list)
    if show:
        print('b_table_row_list', b_table_row_list)
    return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox
@memory_decorator
def get_text_row_by_blank(lt_text_list, layout_h, show=0):
    """Group text objects into rows using the blanks above and below them.

    Args:
        lt_text_list: list of text objects, forwarded to ``get_up_down_blank``.
        layout_h: layout height, forwarded to ``get_contain_blank_row``.
        show: truthy to print the input items and the resulting rows.

    Returns:
        The row list produced by ``get_contain_blank_row``.
    """
    if show:
        for item in lt_text_list:
            print('lt_text_111', item)

    blanks = get_up_down_blank(lt_text_list)
    rows = get_contain_blank_row(blanks, layout_h)

    if show:
        for row in rows:
            print('lt_text_row', row)
    return rows
def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0):
    """Group texts into rows, treating the two sides of a central blank separately.

    Args:
        b_table: candidate table rows, used to locate the central blank.
        lt_text_list: text objects to be grouped into rows.
        blank_width: minimum blank width for ``get_blank_row`` and
            ``choose_center_blank``.
        layout_h: layout height, forwarded to ``get_contain_blank_row``.
        show: truthy to print debug information.

    Returns:
        ``(rows, center_blank)``; ``([], [])`` when no central blank exists.
    """
    # Horizontal blanks of every row.
    row_blanks = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row center_blank', b_table)
        print('blank_row_list center_blank', row_blanks)

    # Central blank shared by the rows.
    center_blank = choose_center_blank(row_blanks, blank_width)
    if show:
        print('center_blank_row center', center_blank)
    if not center_blank:
        return [], []

    # Vertical blanks are computed per side of the blank's mid x.
    middle_x = (center_blank[2] + center_blank[0]) / 2
    text_blanks = get_up_down_blank(lt_text_list, center_x=middle_x)
    rows = get_contain_blank_row(text_blanks, layout_h)
    if show:
        for row in rows:
            print('lt_text_row center', row)
    return rows, center_blank
def table_list_to_dict(table):
    """Wrap every cell of a table in a dict with unit row and column span.

    Args:
        table: list of rows, each a list of cell values.

    Returns:
        A new list of rows where each cell is
        ``{'rowspan': 1, 'columnspan': 1, 'text': <cell>}``.
    """
    return [
        [{'rowspan': 1, 'columnspan': 1, 'text': cell} for cell in row]
        for row in table
    ]
@memory_decorator
def get_up_down_blank(lt_text_list, center_x=None, show=0):
    """Find, for every text, the vertical blank above and below it.

    Two texts are related when their x-ranges overlap (``center_x`` is None)
    or when both midpoints lie on the same side of ``center_x``. For each
    text the blank to the first related text below it and to the last
    related text above it (in top-then-left order) is recorded. When no
    related text exists on a side, a blank as tall as the text itself is
    used (clamped at 0 on the upper side).

    Args:
        lt_text_list: list of text objects exposing ``.bbox`` and
            ``.get_text()``; sorted in place by (top, left).
        center_x: optional x coordinate splitting the page into two columns.
        show: truthy to print debug information.

    Returns:
        List of ``[text, up_blank, down_blank]`` where each blank is
        ``[y_start, y_end]``.
    """
    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
    split_by_center = center_x is not None

    def side_of(item):
        # 0 = left of (or on) the centre line, 1 = right of it.
        return 0 if (item.bbox[0] + item.bbox[2]) / 2 <= center_x else 1

    result = []
    for current in lt_text_list:
        current_span = ((current.bbox[0], 0), (current.bbox[2], 0))
        if split_by_center:
            current_side = side_of(current)

        above = []
        below = []
        for other in lt_text_list:
            if current == other:
                continue
            if split_by_center:
                # A centre column exists: relate texts by side.
                related = current_side == side_of(other)
            else:
                # No centre column: relate texts by horizontal overlap.
                other_span = ((other.bbox[0], 0), (other.bbox[2], 0))
                related = line_iou(current_span, other_span) > 0
            if other.bbox[1] > current.bbox[3] and related:
                below.append([current.bbox[3], other.bbox[1]])
            if other.bbox[3] < current.bbox[1] and related:
                above.append([other.bbox[3], current.bbox[1]])

        # Nothing found on a side: fall back to the text's own height.
        own_height = abs(current.bbox[3] - current.bbox[1])
        if not above:
            above.append([max(0, current.bbox[1] - own_height), current.bbox[1]])
        if not below:
            below.append([current.bbox[3], current.bbox[3] + own_height])

        down_blank = below[0]
        up_blank = above[-1]
        if show:
            print('lt_text1.get_text()', current.get_text(), current.bbox)
            if center_x is not None:
                print('center_x', center_x)
            print('up_blank', up_blank)
            print('down_blank', down_blank)
        result.append([current, up_blank, down_blank])
    return result
@memory_decorator
def filter_large_blank_row(lt_text_blank_list, layout_h, show=0):
    """Pick out texts surrounded by very large blanks; each forms its own row.

    A blank counts as large when it is at least ``layout_h / 6`` high. Texts
    near the bottom of the list are judged by their upper blank, texts near
    the top by their lower blank, and texts in between by the span from the
    start of the upper blank to the end of the lower blank, provided no
    other text lies on the same line.

    Args:
        lt_text_blank_list: list of ``[text, up_blank, down_blank]`` entries;
            sorted in place by the text's (top, left).
        layout_h: layout height used to derive the blank threshold.
        show: truthy to print debug information.

    Returns:
        ``(rows, singles)``: ``rows`` holds one single-element row per
        isolated text and ``singles`` the isolated texts themselves.
    """
    rows = []
    singles = []
    max_blank_h = layout_h / 6
    threshold = 20
    lt_text_blank_list.sort(key=lambda x: (x[0].bbox[1], x[0].bbox[0]))
    total = len(lt_text_blank_list)

    for index, (text, up_blank, down_blank) in enumerate(lt_text_blank_list):
        isolated = False
        if index >= total - 4 and abs(up_blank[0] - up_blank[1]) >= max_blank_h:
            # Near the bottom: judge by the blank above.
            if show:
                print('match single lt_text 1')
            isolated = True
        elif index <= 2 and abs(down_blank[0] - down_blank[1]) >= max_blank_h:
            # Near the top: judge by the blank below.
            if show:
                print('match single lt_text 2')
            isolated = True
        elif 2 <= index <= total - 4 \
                and abs(up_blank[0] - down_blank[1]) >= max_blank_h:
            # In the middle: judge both blanks together, and require that no
            # other text shares the line.
            shares_line = any(
                text.bbox[1] - threshold <= other.bbox[1] <= other.bbox[3] <= text.bbox[3] + threshold
                for other, _, _ in lt_text_blank_list
                if not text == other
            )
            if not shares_line:
                isolated = True
                if show:
                    print('match single lt_text 3')

        if isolated:
            rows.append([text])
            singles.append(text)

    if show:
        print('single_lt_text_list', singles)
    return rows, singles
@memory_decorator
def get_contain_blank_row(lt_text_blank_list, layout_h, show=0):
    """Group texts into rows by containment of their upper/lower blanks.

    Texts isolated by ``filter_large_blank_row`` become single-text rows.
    Any other text collects into its row every text whose upper blank lies
    inside its own upper blank, or whose lower blank lies inside its own
    lower blank (both with a small tolerance). A row holding three or more
    colon-bearing texts is assumed to have swallowed a stray line: the
    texts returned by ``find_outline_lt_text`` are moved into rows of their
    own, padded with a placeholder ``TextBox`` whose text is ``"@@:"``.
    Finally intersecting rows are merged, rows are ordered by the top of
    their first text, and whitespace-only rows are dropped.

    Args:
        lt_text_blank_list: list of ``[text, up_blank, down_blank]`` entries.
            Text objects must be hashable and expose ``.bbox`` and
            ``.get_text()``.
        layout_h: layout height, forwarded to ``filter_large_blank_row``.
        show: truthy to print debug and timing information.

    Returns:
        List of rows, each a list of text objects.
    """
    from format_convert.convert_tree import TextBox

    lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h)
    single_lt_text_list = set(single_lt_text_list)

    # Texts whose blanks contain each other belong to the same row.
    time1 = time.time()
    threshold = 5
    used_lt_text_list = set()
    another_used_lt_text_list = set()
    for lt_text1, up_blank1, down_blank1 in lt_text_blank_list:
        time2 = time.time()
        if lt_text1 in single_lt_text_list:
            continue
        row = []
        for lt_text2, up_blank2, down_blank2 in lt_text_blank_list:
            if lt_text1 == lt_text2:
                continue
            if lt_text2 in another_used_lt_text_list:
                continue
            if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]:
                continue
            if lt_text2 in single_lt_text_list:
                continue
            # Upper blank inside upper blank, or lower blank inside lower blank.
            if (up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold) \
                    or (down_blank1[0] - threshold <= down_blank2[0] <= down_blank2[1] <= down_blank1[1] + threshold):
                if lt_text2 not in row:
                    row.append(lt_text2)
                    used_lt_text_list.add(lt_text2)
        if lt_text1 not in row:
            row.append(lt_text1)
        if show:
            print('get_contain_blank_row loop2 cost:', time.time()-time2)

        # Three colon-bearing texts in one row mean a separate line was
        # merged in by mistake; split it off again.
        time2 = time.time()
        colon_lt_text = [lt for lt in row if re.search('[::]', lt.get_text())]
        if len(colon_lt_text) >= 3:
            if show:
                print('colon_cnt >= 3 row', row)
            another_lt_text_list = find_outline_lt_text(row)
            for lt_text in another_lt_text_list:
                if lt_text in row:
                    row.remove(lt_text)
                if lt_text in colon_lt_text:
                    colon_lt_text.remove(lt_text)
            if show:
                print('another_lt_text_list', another_lt_text_list)
                print('colon_lt_text', colon_lt_text)
            if not colon_lt_text:
                continue
            colon_lt_text.sort(key=lambda x: x.bbox[0])
            lt_text_row_list.append(row)
            for another_lt_text in another_lt_text_list:
                # Place the stray text opposite the colon text it is farther
                # from horizontally, and fill the nearer column with a
                # placeholder so the new row still has two columns.
                if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs(
                        another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]):
                    new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[0].bbox[2], another_lt_text.bbox[3]]
                    another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text]
                else:
                    new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]]
                    another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)]
                if show:
                    print('another_row', another_row)
                for lt_text3 in another_row:
                    another_used_lt_text_list.add(lt_text3)
                lt_text_row_list.append(another_row)
        else:
            lt_text_row_list.append(row)
        if show:
            print('get_contain_blank_row judge colon cost:', time.time()-time2)
    if show:
        print('get_contain_blank_row double loop cost: ', time.time()-time1)

    # De-duplicate: longest rows first, then merge rows sharing texts.
    lt_text_row_list.sort(key=len, reverse=True)
    if show:
        for lt_text_row in lt_text_row_list:
            print('before dedup lt_text_row', lt_text_row)
    lt_text_row_list = merge_intersecting_lists(lt_text_row_list)
    if show:
        for lt_text_row in lt_text_row_list:
            print('after dedup lt_text_row', lt_text_row)
    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])

    # Drop rows that contain nothing but whitespace. The pattern is a raw
    # string: '\s' in a plain literal is an invalid escape sequence.
    temp_list = []
    for lt_text_row in lt_text_row_list:
        row_text = "".join(lt_text.get_text() for lt_text in lt_text_row)
        if re.sub(r'\s+', '', row_text) == "":
            continue
        temp_list.append(lt_text_row)
    return temp_list
def choose_center_blank(blank_row_list, blank_width, show=0):
    """Choose the blank that separates the two columns of a candidate table.

    The widest blank over all rows is taken as reference; it must be wider
    than ``blank_width``. From every non-empty row the widest blank is kept
    if its x-range overlaps the reference with IoU >= 0.5. The kept blanks
    are passed to ``get_inter_part`` and its result is returned.

    Args:
        blank_row_list: list of rows, each a list of blank boxes whose index
            0 / 2 are the left / right edges.
        blank_width: width the reference blank has to exceed.
        show: truthy to print debug information.

    Returns:
        The result of ``get_inter_part`` for the kept blanks, or ``[]`` when
        no suitable blank exists.
    """
    def width(blank):
        return abs(blank[0] - blank[2])

    if not blank_row_list:
        return []

    # Reference: the widest blank of all rows (first one on ties).
    all_blanks = [blank for blank_row in blank_row_list for blank in blank_row]
    if not all_blanks:
        return []
    max_blank = max(all_blanks, key=width)
    if show:
        print('max_blank', max_blank)
    if width(max_blank) <= blank_width:
        return []
    reference_line = ([max_blank[0], 0], [max_blank[2], 0])

    max_col = []
    for blank_row in blank_row_list:
        if not blank_row:
            continue
        # Widest blank of this row (first one on ties).
        row_widest = max(blank_row, key=width)
        if show:
            print('max_blank_bbox, blank_row', row_widest, blank_row)
        row_line = ([row_widest[0], 0], [row_widest[2], 0])
        if line_iou(reference_line, row_line) >= 0.5:
            max_col.append(row_widest)
    if show:
        print('max_col', max_col)
    if not max_col:
        return []

    # Keep the part shared by the selected blanks.
    return get_inter_part(max_col)
def set_head_value_in_col(col_list1, col_list2, show=0):
    """Split the cell texts of one column into ``[head, value]`` pairs.

    A cell containing a colon is cut after its first run of colons: the
    left part (colons included) is the head, the rest the value. A cell
    without a colon that also occurs in ``col_list2`` is treated as a head
    fragment and prefixed to the next colon-bearing head; any other cell
    without a colon is a value, appended to the previous value when that
    value contains a colon, otherwise stored as ``["", cell]``.

    Args:
        col_list1: cell texts of the column being processed.
        col_list2: cell texts of the other column.
        show: truthy to print debug information.

    Returns:
        List of ``[head, value]`` lists.
    """
    pairs = []
    pending_head = ""
    for cell in col_list1:
        colon = re.search('[::]+', cell)

        # Cell with a colon: split into head and value.
        if colon:
            cut = colon.end()
            head = cell[:cut]
            if pending_head:
                head = pending_head + head
                pending_head = ""
            pairs.append([head, cell[cut:]])
            continue

        # No colon but present in the other column too: treat as a head
        # fragment to be joined with the head of the following line.
        if cell in col_list2:
            if show:
                print('col1 in col_list2')
            # The previous line was a pending fragment as well: emit it as
            # its own row first.
            if pending_head:
                pairs.append(["", pending_head])
            pending_head = cell
            continue

        # No colon and not in the other column: continuation of a value.
        if pairs and re.search('[::]', pairs[-1][1]):
            pairs[-1][1] += cell
        else:
            pairs.append(["", cell])

    # A fragment left over from the last line becomes its own row.
    if pending_head:
        pairs.append(["", pending_head])
    if show:
        print('col_key_value_list', pairs)
    return pairs
def divide_2_col_by_center_blank(b_table, center_blank_row, show=0):
    """Split every row into a left and a right text around the central blank.

    A text belongs to the left column when its horizontal midpoint is not
    greater than the midpoint of ``center_blank_row``. The texts of each
    side are ordered with ``sort_by_read_order``, concatenated and stripped.
    If fewer than a third of the cells on either side contain a colon, the
    region is not considered a two-column table and both lists are emptied.

    Args:
        b_table: list of rows of text objects exposing ``.bbox`` and
            ``.get_text()``; each row is sorted in place by left edge.
        center_blank_row: box whose index 0 / 2 are its left / right edges.
        show: truthy to print debug information.

    Returns:
        ``(left_texts, right_texts)``, one string per row each, or
        ``([], [])`` when the colon requirement is not met.
    """
    left_texts = []
    right_texts = []
    for text_row in b_table:
        text_row.sort(key=lambda x: x.bbox[0])
        center_x = abs(center_blank_row[0] + center_blank_row[2]) / 2

        left_items = []
        right_items = []
        for item in text_row:
            if (item.bbox[2] + item.bbox[0]) / 2 <= center_x:
                left_items.append(item)
            else:
                right_items.append(item)

        # Concatenate each side in reading order.
        left_items = sort_by_read_order(left_items)
        left_joined = ''.join([item.get_text() for item in left_items])
        right_items = sort_by_read_order(right_items)
        right_joined = ''.join([item.get_text() for item in right_items])

        left_texts.append(left_joined.strip())
        right_texts.append(right_joined.strip())
    if show:
        print('col_list1', left_texts)
        print('col_list2', right_texts)

    # Both columns need colons, otherwise this is not a two-column table.
    left_colons = sum(1 for text in left_texts if re.search('[::]', text))
    right_colons = sum(1 for text in right_texts if re.search('[::]', text))
    if left_colons < len(left_texts) / 3 or right_colons < len(right_texts) / 3:
        left_texts = []
        right_texts = []
        if show:
            print('col_list1 colon_cnt1 less', left_colons)
            print('col_list2 colon_cnt2 less', right_colons)
    return left_texts, right_texts
def delete_blank_col(b_table_row_list):
    """Remove the columns whose cells are all empty strings.

    Args:
        b_table_row_list: list of rows, each a list of hashable cell values.

    Returns:
        A new list of rows without the all-empty columns. Columns are
        identified by position, so rows may differ in length.
    """
    # Gather the cells of every column position.
    cells_by_col = {}
    for row in b_table_row_list:
        for col_i, cell in enumerate(row):
            cells_by_col.setdefault(col_i, []).append(cell)

    # A column is blank when its only distinct value is ''.
    blank_cols = [col_i for col_i, cells in cells_by_col.items()
                  if set(cells) == {''}]

    return [
        [cell for col_i, cell in enumerate(row) if col_i not in blank_cols]
        for row in b_table_row_list
    ]
def fix_head_value_match(b_table, show=0):
    """Merge value-only rows into the head-only row directly above them.

    Only tables whose first row has four cells
    (``[head, value, head, value]``) are processed. A *head row* has both
    value cells blank (``""`` or the placeholder ``"@@:"``) and a colon in
    both head cells; a *value row* has both head cells blank and both value
    cells non-blank. Every run of value rows that immediately follows a head
    row is concatenated into that head row's value cells and removed.

    A row that ends a run is itself re-examined as a possible head row, so a
    head row directly following another head row still receives its values.

    Args:
        b_table: list of rows of strings. Merged head rows are updated in
            place.
        show: truthy to print the head-to-values mapping.

    Returns:
        The input unchanged when it is empty or not four columns wide,
        otherwise a new list of rows with the value rows folded in.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) != 4:
        return b_table

    blank_cells = ["", '@@:']

    def is_head_row(row):
        if row[1] not in blank_cells or row[3] not in blank_cells:
            return False
        return bool(re.search("[::]", row[0]) and re.search("[::]", row[2]))

    def is_value_row(row):
        return (row[0] in blank_cells and row[2] in blank_cells
                and row[1] not in blank_cells and row[3] not in blank_cells)

    # Map each head row index to the indices of the value rows below it.
    match_head_value_dict = {}
    maybe_head_index = None
    for row_i, row in enumerate(b_table):
        if maybe_head_index is not None and is_value_row(row):
            match_head_value_dict.setdefault(maybe_head_index, []).append(row_i)
            continue
        # Either no head is pending or the run just ended: the current row
        # may start a new run (head and value rows are mutually exclusive).
        maybe_head_index = row_i if is_head_row(row) else None
    if show:
        print('match_head_value_dict', match_head_value_dict)

    # Fold the value rows into their head row.
    merged_value_indexes = set()
    for head_index, value_index_list in match_head_value_dict.items():
        head_row = b_table[head_index]
        left_value_text = ""
        right_value_text = ""
        for value_index in value_index_list:
            value_row = b_table[value_index]
            left_value_text += "".join(value_row[:2])
            right_value_text += "".join(value_row[2:])
        head_row[1] = left_value_text
        head_row[3] = right_value_text
        merged_value_indexes.update(value_index_list)

    # Keep every row except the value rows that were merged away.
    return [row for row_i, row in enumerate(b_table)
            if row_i not in merged_value_indexes]
def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                  table_lt_text_row_list, show=0):
    """Append text rows lying just below a two-column table to that table.

    The mean vertical distance between the bottoms of the table's rows
    defines a search window below ``table_bbox[3]``; the window grows by one
    mean distance (plus a tolerance) for every row accepted. A row is
    accepted when it lies on one side of the central blank, and is then
    converted to a ``[head, value, "", ""]`` or ``["", "", head, value]``
    table row.

    Args:
        b_table: table rows (lists of four strings); extended in place.
        table_bbox: ``[x1, y1, x2, y2]`` of the table; ``table_bbox[3]`` is
            updated in place when rows are added.
        center_blank_bbox: box of the central blank (index 0 / 2 = left /
            right edge).
        lt_text_row_list: candidate text rows; sorted in place by the top of
            their first text, and matched rows are sorted in place.
        table_lt_text_row_list: the text rows the table was built from.
        show: truthy to print debug information.

    Returns:
        ``b_table`` (the same list object), possibly with rows appended.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Mean distance between the bottoms of consecutive table rows.
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort()
    blank_h_list = [lower - upper for upper, lower in zip(max_h_list, max_h_list[1:])]
    # With fewer than two rows there is no distance to average. NaN keeps
    # every window comparison below False (so nothing is appended) without
    # the RuntimeWarning np.mean([]) would emit.
    mean_blank_h = np.mean(blank_h_list) if blank_h_list else float('nan')
    if show:
        print('add_last_rows blank_width_list', blank_h_list)
        print('add_last_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    threshold = 5
    add_blank_h = mean_blank_h + threshold
    for lt_text_row in lt_text_row_list:
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3])
        # The row bottom must lie between the table bottom and the window end.
        if table_bbox[3] < max_h < table_bbox[3] + add_blank_h:
            # Rows spanning across the central blank are skipped.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                if show:
                    print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue
            # Left side: starts between the table's left edge and the blank.
            if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, max_h])
            # Right side: ends between the blank and the table's right edge.
            elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3:
                match_row_list.append([lt_text_row, 1, max_h])
            else:
                if show:
                    print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3')
                break
            # A row was accepted: extend the window for the next one.
            add_blank_h = add_blank_h + mean_blank_h + threshold
    if show:
        print('add_last_rows match_row_list', match_row_list)

    add_b_table = []
    real_max_h = None
    for mi, match_row in enumerate(match_row_list):
        lt_text_row, is_right, max_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))

        if len(lt_text_row) == 1:
            # Single text: split at the first run of colons, if any.
            text = lt_text_row[0].get_text()
            match = re.search('[::]+', text)
            real_max_h = max_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
                and lt_text_row[1].get_text()[-1:] in [':', ":"]:
            # Two texts that are really one head split by a blank. The [-1:]
            # slice avoids an IndexError when the second text is empty.
            text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
            head = text
            value = ''
        elif len(lt_text_row) == 2:
            # Two texts: head followed by value; the head needs a colon.
            text1 = lt_text_row[0].get_text()
            match = re.search('[::]+', text1)
            if not match:
                break
            real_max_h = max_h
            head = text1
            value = lt_text_row[1].get_text()
        else:
            if show:
                print('add_last_rows len(lt_text_row) break', len(lt_text_row))
            break

        # Previous row: a head-less value may belong to its pending head.
        if mi == 0 or len(add_b_table) == 0:
            last_row = b_table[-1]
        else:
            last_row = add_b_table[-1]

        if is_right:
            if last_row[2] and not last_row[3] and not head and value:
                # NOTE(review): this always writes into the last row of the
                # original table, even when last_row comes from add_b_table,
                # and the left branch has no equivalent write - confirm
                # this is intended.
                b_table[-1][3] = value
                current_row = ["", "", last_row[2], value]
            else:
                current_row = ["", "", head, value]
        else:
            if last_row[0] and not last_row[1] and not head and value:
                current_row = [last_row[0], value, "", ""]
            else:
                current_row = [head, value, "", ""]
        add_b_table.append(current_row)
        if show:
            print('current_row', current_row)
    if show:
        print('add_b_table', add_b_table)

    b_table += add_b_table
    if real_max_h is not None:
        table_bbox[3] = real_max_h
    return b_table
def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                   table_lt_text_row_list, show=0):
    """Fold text rows that straddle the table's top edge into the first table row.

    A candidate is a row whose vertical extent contains ``table_bbox[1]``.
    Candidates starting left of ``center_blank_bbox[0]`` target column 1 of
    ``b_table[0]``; candidates ending right of ``center_blank_bbox[2]`` target
    column 3.  Only single-box rows are processed, and only text without a
    colon is merged (it is prepended to the existing cell).

    Args:
        b_table: table rows; nothing happens unless the first row has 4 cells.
        table_bbox: mutable ``[x1, y1, x2, y2]``; ``y1`` is overwritten with
            the top of the last processed candidate row.
        center_blank_bbox: box whose items 0 and 2 bound the central gap on x.
        lt_text_row_list: candidate rows (lists of objects exposing ``.bbox``
            and ``.get_text()``); sorted in place by the y1 of their first box.
        table_lt_text_row_list: rows already inside the table; only used for
            the debug spacing statistics.
        show: truthy enables debug printing.

    Returns:
        ``b_table`` (its first row may have been modified in place).
    """
    if not b_table:
        return b_table
    if len(b_table[0]) != 4:
        return b_table

    if show:
        # Spacing statistics between consecutive table rows: debug output only.
        max_h_list = sorted(
            get_row_bbox(lt_text_row, mode='.bbox')[3]
            for lt_text_row in table_lt_text_row_list
            if lt_text_row
        )
        blank_h_list = [low - high for high, low in zip(max_h_list, max_h_list[1:])]
        # Guard the empty case: np.mean([]) would emit a RuntimeWarning and give nan.
        mean_blank_h = np.mean(blank_h_list) if blank_h_list else 0
        print('add_first_rows blank_width_list', blank_h_list)
        print('add_first_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])

    match_row_list = []
    for lt_text_row in lt_text_row_list:
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3])

        # The row must vertically overlap the top edge of the table.
        # NOTE(review): the flattened source does not show whether the
        # original `else: break` belonged to this test or to the left/right
        # test below; it is attached here (stop at the first non-overlapping
        # row of the y-sorted list) -- confirm against the original file.
        if not (min_h <= table_bbox[1] < max_h):
            break

        # A row that spans the whole central gap is not a left/right candidate.
        if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
            if show:
                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
            continue

        if min_w < center_blank_bbox[0]:
            # Left of the central gap.
            match_row_list.append([lt_text_row, 0, min_h])
        elif center_blank_bbox[2] < max_w:
            # Right of the central gap.
            match_row_list.append([lt_text_row, 1, min_h])

    if show:
        print('add_first_rows match_row_list', match_row_list)

    real_min_h = None
    for lt_text_row, is_right, min_h in match_row_list:
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))

        # Only single-box rows are supported; anything else ends the scan.
        if len(lt_text_row) != 1:
            if show:
                print('add_first_rows len(lt_text_row) break', len(lt_text_row))
            break

        text = lt_text_row[0].get_text()
        match = re.search('[::]+', text)
        real_min_h = min_h
        if not match:
            head = ""
            value = text
        else:
            head = text[:match.end()]
            value = text[match.end():]

        # A value without its own header belongs to the first table row.
        if not head and value:
            if is_right:
                b_table[0][3] = value + b_table[0][3]
            else:
                b_table[0][1] = value + b_table[0][1]

    if real_min_h is not None:
        table_bbox[1] = real_min_h
    return b_table
def get_row_bbox(row, mode='list'):
    """Return the box ``(min_x, min_y, max_x, max_y)`` enclosing every item of a row.

    Args:
        row: non-empty iterable of boxes.
        mode: ``'list'`` when each item is itself an ``[x1, y1, x2, y2]``
            sequence, ``'.bbox'`` when each item exposes that sequence as a
            ``bbox`` attribute.

    Returns:
        Tuple ``(min x1, min y1, max x2, max y2)``.

    Raises:
        ValueError: if ``mode`` is not one of the two supported values, or if
            ``row`` is empty (raised by ``min``/``max``).
    """
    if mode == 'list':
        boxes = list(row)
    elif mode == '.bbox':
        boxes = [item.bbox for item in row]
    else:
        # Previously an unknown mode surfaced as an UnboundLocalError below.
        raise ValueError("get_row_bbox: unsupported mode %r" % (mode,))

    min_x = min(box[0] for box in boxes)
    min_y = min(box[1] for box in boxes)
    max_x = max(box[2] for box in boxes)
    max_y = max(box[3] for box in boxes)
    return min_x, min_y, max_x, max_y
def shrink_bbox(img, bbox_list):
    """Tighten each bbox to the extent of its non-background pixels.

    The background color is the most frequent color of ``img`` after taking
    every 8th pixel in both directions.  Inside a bbox, a pixel counts as
    foreground when its squared color distance to the background exceeds
    ``20 ** 2``.

    Args:
        img: image array indexed as ``img[y, x, channel]``; it is reshaped to
            ``(-1, 3)``, so three channels are expected.
        bbox_list: iterable of ``[x1, y1, x2, y2]`` boxes.

    Returns:
        A new list with one entry per input box.  A box whose crop is empty or
        contains no foreground pixel is returned unchanged; otherwise the entry
        is ``[x1 + first_col, y1 + first_row, x1 + last_col, y1 + last_row]``
        where the offsets are the indices of the outermost foreground pixels.
    """
    down_sample_factor = 8
    distance_threshold_sq = 20 ** 2

    # Estimate the background color on a down-sampled copy to keep this cheap.
    time0 = time.time()
    down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :]
    down_sampled_img_color = down_sampled_img.reshape(-1, 3)
    colors, counts = np.unique(down_sampled_img_color, return_counts=True, axis=0)
    log('shrink_bbox 0 ' + str(time.time()-time0))

    time0 = time.time()
    most_frequent_color = colors[np.argmax(counts)].astype(np.int32)
    log('shrink_bbox 1 ' + str(time.time()-time0))

    new_bbox_list = []
    # Work in int32 so the channel differences below do not wrap around.
    img_int = img.astype(np.int32)
    time0 = time.time()
    for bbox in bbox_list:
        img_bbox_int = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]
        if 0 in img_bbox_int.shape:
            new_bbox_list.append(bbox)
            continue

        diff = np.sum((img_bbox_int - most_frequent_color) ** 2, axis=2)
        rows, cols = np.where(diff > distance_threshold_sq)
        if rows.size == 0:
            new_bbox_list.append(bbox)
            continue

        # One pass yields both axes; the original re-ran the search on a
        # transposed copy just to read the column range.
        new_bbox_list.append([
            bbox[0] + cols.min(),
            bbox[1] + rows.min(),
            bbox[0] + cols.max(),
            bbox[1] + rows.max(),
        ])
    log('shrink_bbox 2 ' + str(time.time() - time0))
    return new_bbox_list
def shrink_bbox_by_pixel(lt_text_list):
    """Halve the height of every box in place, trimming a quarter off top and bottom.

    Each item's ``bbox`` attribute is replaced by a new list whose y values are
    truncated to ``int``; x values are kept as they are.

    Returns:
        The same ``lt_text_list`` object.
    """
    for item in lt_text_list:
        box = item.bbox
        # Half of the height is removed in total, i.e. a quarter per side.
        trim = abs(box[3] - box[1]) / 2 / 2
        top = int(box[1] + trim)
        bottom = int(box[3] - trim)
        item.bbox = [box[0], top, box[2], bottom]
    return lt_text_list
def get_inter_part(bbox_list, show=0):
    """Return the region shared by all boxes, with its corners put in order.

    The result is built from the largest x1/y1 and the smallest x2/y2 over all
    boxes.  When the boxes do not actually overlap on an axis the two values
    are swapped so that the smaller one always comes first.

    Args:
        bbox_list: list of ``[x1, y1, x2, y2]`` boxes; sorted in place by
            ``(x1, x2)``.
        show: truthy enables a debug print of the result.

    Returns:
        ``[min_x, min_y, max_x, max_y]``, or ``None`` for an empty list.
    """
    if not bbox_list:
        return None

    bbox_list.sort(key=lambda box: (box[0], box[2]))

    left, top, right, bottom = bbox_list[0]
    for box in bbox_list[1:]:
        left = max(left, box[0])
        top = max(top, box[1])
        right = min(right, box[2])
        bottom = min(bottom, box[3])

    inter = [min(left, right), min(top, bottom), max(left, right), max(top, bottom)]
    if show:
        print('get_inter_part', inter)
    return inter
def get_inter_part_250530(bbox_list, show=0):
    """Collect the x/y coordinates of the boxes and order the x values.

    NOTE(review): this looks like an unfinished variant of ``get_inter_part``;
    as written it only builds local lists and always returns ``None``.

    Args:
        bbox_list: list of ``[x1, y1, x2, y2]`` boxes.
        show: unused.
    """
    if not bbox_list:
        return None

    lefts = [box[0] for box in bbox_list]
    rights = [box[2] for box in bbox_list]
    tops = [box[1] for box in bbox_list]
    bottoms = [box[3] for box in bbox_list]

    # Largest x1 first, smallest x2 first.
    lefts = sorted(lefts, reverse=True)
    rights = sorted(rights)
    return None
def get_straight_lines_from_image(image_np, threshold=50):
    """Debug helper: detect straight segments and display them in OpenCV windows.

    Runs Canny edge detection followed by the probabilistic Hough transform,
    draws every detected segment in red onto ``image_np`` (modifying it in
    place), shows the edge map and the annotated image, and blocks on
    ``cv2.waitKey(0)``.

    Args:
        image_np: BGR image array, or ``None``.
        threshold: accumulator threshold passed to ``cv2.HoughLinesP``.

    Returns:
        ``False`` when ``image_np`` is ``None``; otherwise ``None``.
    """
    if image_np is None:
        print("无法读取图像")
        return False

    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 20, 150)
    cv2.imshow('edges', edges)

    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold,
                            minLineLength=50, maxLineGap=2)
    # HoughLinesP yields None (not an empty array) when nothing is detected.
    for line in ([] if lines is None else lines):
        line = line[0]
        print('line', line)
        # Pass plain int tuples: some OpenCV builds reject numpy slices as points.
        x1, y1, x2, y2 = (int(v) for v in line)
        cv2.line(image_np, (x1, y1), (x2, y2), (0, 0, 255))
    cv2.imshow('img', image_np)
    cv2.waitKey(0)
    print('lines', lines)
def get_table_bbox(table):
    """Return ``[x1, y1, x2, y2]`` enclosing the ``bbox`` of every cell in the table.

    Args:
        table: iterable of rows, each an iterable of objects with a ``bbox``
            attribute.

    Raises:
        ValueError: if the table holds no cell at all (from ``min``).
    """
    boxes = [cell.bbox for row in table for cell in row]
    left = min([box[0] for box in boxes])
    top = min([box[1] for box in boxes])
    right = max([box[2] for box in boxes])
    bottom = max([box[3] for box in boxes])
    return [left, top, right, bottom]
@memory_decorator
def merge_intersecting_lists(lists):
    """Merge lists that share at least one element into common groups.

    Merging is transitive: when a list overlaps several existing groups, all
    of them are fused into one (the earlier implementation joined only the
    first overlapping group, leaving groups that still intersected).

    Args:
        lists: iterable of iterables of hashable items.

    Returns:
        A list of lists.  A list that never overlapped anything is returned as
        a shallow copy in its original order; a fused group is de-duplicated
        and its element order is unspecified.
    """
    merged_lists = []
    for current_list in lists:
        current_set = set(current_list)
        # Indices of every existing group that shares an element with this list.
        hits = [idx for idx, group in enumerate(merged_lists)
                if current_set.intersection(group)]

        if not hits:
            merged_lists.append(list(current_list))
            continue

        combined = current_set.union(*(merged_lists[idx] for idx in hits))
        merged_lists[hits[0]] = list(combined)
        # Remove the other absorbed groups, highest index first so the
        # remaining indices stay valid.
        for idx in reversed(hits[1:]):
            del merged_lists[idx]
    return merged_lists
def merge_same_bbox(lt_text_list, avg_char_width, show=0):
    """Join a short colon-less box with the colon-bearing box right of it on the same line.

    A pair (left box, right box) is merged into one ``TextBox`` when:
    the left box ends strictly before the right box starts, their x ranges do
    not overlap, their y ranges overlap with ``line_iou`` above 0.9, the
    horizontal gap is below ``5 * avg_char_width``, the right text contains a
    colon, and the left text has no colon and at most 2 characters.

    Args:
        lt_text_list: list of hashable objects with ``.bbox`` and ``.get_text()``.
        avg_char_width: average character width used for the gap limit.
        show: truthy prints every merged box.

    Returns:
        A new de-duplicated list sorted by ``(x1, y1)``.
    """
    from format_convert.convert_tree import TextBox

    count = len(lt_text_list)
    for i in range(count):
        lead = lt_text_list[i]
        lead_x = ((lead.bbox[0], 0), (lead.bbox[2], 0))
        lead_y = ((lead.bbox[1], 0), (lead.bbox[3], 0))

        for j in range(i + 1, count):
            tail = lt_text_list[j]

            # The left box must end before the right one begins.
            if lead.bbox[2] >= tail.bbox[0]:
                continue
            # No overlap on the x axis.
            tail_x = ((tail.bbox[0], 0), (tail.bbox[2], 0))
            if line_iou(lead_x, tail_x) > 0:
                continue

            # Strong overlap on the y axis, a small gap, and the colon pattern.
            tail_y = ((tail.bbox[1], 0), (tail.bbox[3], 0))
            if not line_iou(lead_y, tail_y) > 0.9:
                continue
            if not abs(lead.bbox[2] - tail.bbox[0]) < avg_char_width * 5:
                continue
            if not re.search('[::]', tail.get_text()):
                continue
            if re.search('[::]', lead.get_text()):
                continue
            if len(lead.get_text()) > 2:
                continue

            joined = TextBox(
                text=lead.get_text() + tail.get_text(),
                bbox=[lead.bbox[0], min(lead.bbox[1], tail.bbox[1]),
                      tail.bbox[2], max(lead.bbox[3], tail.bbox[3])],
            )
            # Both slots point at the merged box; the set() below drops the duplicate.
            lt_text_list[i] = joined
            lt_text_list[j] = joined
            if show:
                print('new_lt_text', joined)

    lt_text_list = list(set(lt_text_list))
    lt_text_list.sort(key=lambda box: (box.bbox[0], box.bbox[1]))
    return lt_text_list
def sort_by_read_order(lt_text_list, threshold=10):
    """Order boxes top-to-bottom by line, and left-to-right within a line.

    The input is sorted in place by y1.  A box joins the current line when its
    y1 differs from the previous box's y1 by less than ``threshold``;
    otherwise it opens a new line.

    Returns:
        A new list in reading order (the input itself when it is empty).
    """
    if not lt_text_list:
        return lt_text_list

    lt_text_list.sort(key=lambda box: box.bbox[1])

    # Group consecutive boxes into lines.
    lines = [[lt_text_list[0]]]
    for previous, current in zip(lt_text_list, lt_text_list[1:]):
        if abs(current.bbox[1] - previous.bbox[1]) < threshold:
            lines[-1].append(current)
        else:
            lines.append([current])

    # Order each line by x1 and flatten.
    ordered = []
    for line in lines:
        line.sort(key=lambda box: box.bbox[0])
        ordered.extend(line)
    return ordered
def delete_empty_bbox(lt_text_list, show=0):
    """Drop boxes whose text is only whitespace or a lone colon/semicolon.

    Args:
        lt_text_list: iterable of objects exposing ``get_text()``.
        show: unused.

    Returns:
        A new list with the remaining boxes in their original order.
    """
    kept = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        if text in [':', ":", ";", ";"]:
            continue
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        if re.sub(r'\s', '', text) == "":
            continue
        kept.append(lt_text)
    return kept
def standard_table(table, show=0):
    """Normalize a 4-column head/value/head/value table of ``{'text': ...}`` cells.

    Four passes run in order:
      1. Remove the ``@@:`` placeholder from every cell.
      2. A row shaped ``['', x, head, '']`` becomes ``[x, '', head, '']``
         (a header whose colon was missed and that landed in the value column).
      3. A header-only row followed by a value-only row is collapsed: the
         headers are copied into the value row and the header row is dropped.
      4. A value-only row following a complete row is appended to that row's
         values and dropped.

    Cells are modified in place; the returned list may be a new, shorter one.
    """
    if not table:
        return table

    def is_blank(cell):
        return cell.get('text') == ''

    def has_text(cell):
        return cell.get('text') != ''

    def without(rows, doomed):
        return [row for idx, row in enumerate(rows) if idx not in doomed]

    # 1. Strip placeholders.
    for row in table:
        for cell in row:
            if '@@:' in cell.get('text'):
                cell['text'] = re.sub('@@:', '', cell.get('text'))

    # 2. Header wrongly stored as a value because OCR missed its colon.
    for ri, row in enumerate(table):
        if is_blank(row[0]) and has_text(row[1]) and has_text(row[2]) and is_blank(row[3]):
            row[0]['text'] = row[1].get('text')
            row[1]['text'] = ''
            if show:
                print('standard_table, add colon head', table[ri])

    # 3. Headers on one row, their values on the next one:
    #    head        head
    #         value       value
    doomed = []
    for ri in range(1, len(table)):
        above, row = table[ri - 1], table[ri]
        if has_text(above[0]) and is_blank(above[1]) \
                and is_blank(row[0]) and has_text(row[1]) \
                and has_text(above[2]) and is_blank(above[3]) \
                and is_blank(row[2]) and has_text(row[3]):
            row[0]['text'] = above[0].get('text')
            row[2]['text'] = above[2].get('text')
            doomed.append(ri - 1)
            if show:
                print('standard_table, fix head value 1', table[ri])
    table = without(table, doomed)

    # 4. Value continuation that was not merged into the previous row:
    #    head value head value
    #         value      value
    doomed = []
    for ri in range(1, len(table)):
        above, row = table[ri - 1], table[ri]
        if has_text(above[0]) and has_text(above[1]) \
                and is_blank(row[0]) and has_text(row[1]) \
                and has_text(above[2]) and has_text(above[3]) \
                and is_blank(row[2]) and has_text(row[3]):
            above[1]['text'] += row[1]['text']
            above[3]['text'] += row[3]['text']
            doomed.append(ri)
    table = without(table, doomed)

    return table
@memory_decorator
def find_outline_lt_text(lt_text_list, show=0):
    """Return the boxes that sit alone on their line.

    Boxes are sorted in place by ``(y1, x1)``.  Walking that order, each box
    not yet assigned starts a line and claims every other unassigned box whose
    y range overlaps its own (``line_iou > 0``).  Boxes that claimed nobody
    are returned.

    Args:
        lt_text_list: list of objects with a ``bbox`` attribute.
        show: truthy prints the result.
    """
    lt_text_list.sort(key=lambda box: (box.bbox[1], box.bbox[0]))

    assigned = []
    outline_lt_text_list = []
    for seed in lt_text_list:
        if seed in assigned:
            continue
        assigned.append(seed)

        seed_span = [(seed.bbox[1], 0), (seed.bbox[3], 0)]
        neighbours = 0
        for other in lt_text_list:
            if other in assigned:
                continue
            other_span = [(other.bbox[1], 0), (other.bbox[3], 0)]
            if line_iou(seed_span, other_span) > 0:
                assigned.append(other)
                neighbours += 1

        # A line made of a single box is an outlier.
        if neighbours == 0:
            outline_lt_text_list.append(seed)

    if show:
        print('outline_lt_text_list', outline_lt_text_list)
    return outline_lt_text_list
def get_iou(bbox1, bbox2):
    """Intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Widths and heights use the inclusive-pixel convention (``x2 - x1 + 1``).
    When one box fully contains the other the result is ``1.0`` regardless of
    their sizes.

    Returns:
        The IoU ratio, or ``0`` when the union area is zero.
    """
    ax1, ay1, ax2, ay2 = bbox1
    bx1, by1, bx2, by2 = bbox2

    # Full containment in either direction counts as a perfect match.
    a_holds_b = ax1 <= bx1 and ay1 <= by1 and ax2 >= bx2 and ay2 >= by2
    b_holds_a = bx1 <= ax1 and by1 <= ay1 and bx2 >= ax2 and by2 >= ay2
    if a_holds_b or b_holds_a:
        return 1.0

    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1) + 1)
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1) + 1)
    overlap = overlap_w * overlap_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    union = area_a + area_b - overlap

    if union == 0:
        return 0
    return overlap / union
def fix_cross_bbox(lt_text_list, show=0):
    """Pull apart boxes that slightly overlap each other.

    Every ordered pair of distinct boxes with ``get_iou > 0`` is examined.
    If the second box starts inside the first on the x axis and the overlap is
    smaller than half of the wider box, the first box's right edge and the
    second box's left edge are exchanged; the same rule is applied on the y
    axis.  Boxes are updated in place, so later pairs see adjusted values.

    Returns:
        The same ``lt_text_list`` object.
    """
    for first in lt_text_list:
        for second in lt_text_list:
            if first == second:
                continue
            if not get_iou(first.bbox, second.bbox) > 0:
                continue

            if show:
                print('fix_cross_bbox1', first, second)

            a_x1, a_y1, a_x2, a_y2 = first.bbox
            b_x1, b_y1, b_x2, b_y2 = second.bbox

            # Overlap on the right side; a large overlap means the boxes do
            # not really cross along this axis.
            half_wider = max(abs(a_x2 - a_x1), abs(b_x1 - b_x2)) / 2
            if a_x1 < b_x1 < a_x2 and a_x2 - b_x1 < half_wider:
                a_x2, b_x1 = (min(first.bbox[2], second.bbox[0]),
                              max(first.bbox[2], second.bbox[0]))

            # Overlap at the bottom, same reasoning.
            half_taller = max(abs(a_y2 - a_y1), abs(b_y1 - b_y2)) / 2
            if a_y1 < b_y1 < a_y2 and a_y2 - b_y1 < half_taller:
                a_y2, b_y1 = (min(first.bbox[3], second.bbox[1]),
                              max(first.bbox[3], second.bbox[1]))

            first.bbox = [a_x1, a_y1, a_x2, a_y2]
            second.bbox = [b_x1, b_y1, b_x2, b_y2]
            if show:
                print('fix_cross_bbox2', first, second)
    return lt_text_list
def split_lt_text_by_many_space(lt_text_list, show=0):
    """Clean up space runs inside text boxes, splitting boxes where needed.

    Three passes, each replacing affected boxes with new ``TextBox`` objects
    (removed from and appended to ``lt_text_list`` in place):
      1. Strip leading/trailing spaces and shrink the bbox proportionally to
         the display width of the removed spaces.
      2. Re-join spaced-out two-character headers on both sides of the longest
         space run (pattern "X Y:<spaces>X Y:"), keeping the bbox.
      3. Split a box into two at a run of 4+ spaces when that run is the only
         space run and both sides hold at least 2 letters/digits/CJK chars.

    Args:
        lt_text_list: list of objects with ``.bbox`` and ``.get_text()``.
        show: truthy enables debug printing.

    Returns:
        The same ``lt_text_list`` object.
    """
    from format_convert.convert_tree import TextBox

    def swap_boxes(outdated, fresh):
        # Replace processed boxes: drop the old ones, append the new ones.
        for box in outdated:
            if box in lt_text_list:
                lt_text_list.remove(box)
        lt_text_list.extend(fresh)

    # ---- pass 1: leading / trailing spaces --------------------------------
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox
        if len(text) == 0:
            continue
        width = get_char_unicode_length(text)
        if width == 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / width

        space1 = re.findall('^[ ]+', text)
        if space1:
            space1 = ''.join(space1)
            cut = get_char_unicode_length(space1) * ratio
            text = re.sub('^[ ]+', '', text)
            bbox = [bbox[0] + cut, bbox[1], bbox[2], bbox[3]]

        if len(text) == 0:
            continue
        width = get_char_unicode_length(text)
        if width == 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / width

        space2 = re.findall('[ ]+$', text)
        if space2:
            space2 = ''.join(space2)
            cut = get_char_unicode_length(space2) * ratio
            text = re.sub('[ ]+$', '', text)
            bbox = [bbox[0], bbox[1], bbox[2] - cut, bbox[3]]

        if len(text) == 0:
            continue
        if get_char_unicode_length(text) == 0:
            continue

        if space1 or space2:
            add_lt_text_list.append(TextBox(text=text, bbox=bbox))
            delete_lt_text_list.append(lt_text)
    swap_boxes(delete_lt_text_list, add_lt_text_list)

    # ---- pass 2: headers written with inner spaces on both sides ----------
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox
        if len(text) == 0:
            continue
        space_list = re.findall('[ ]+', text)
        if len(space_list) < 2:
            continue

        space_list.sort(key=lambda run: len(run))
        max_space = space_list[-1]
        match = re.search(max_space, text)
        if show:
            print('max_space', max_space)
            print('space_list', space_list)
        if not match:
            continue

        ss1 = re.split('[ ]+', text[:match.start()])
        ss2 = re.split('[ ]+', text[match.end():])
        if len(ss1) == 2 and len(ss1[0]) == 1 and len(ss1[1]) == 2 and ss1[1][-1] in [':', ':'] \
                and len(ss2) == 2 and len(ss2[0]) == 1 and len(ss2[1]) == 2 and ss2[1][-1] in [':', ':']:
            new_text = ''.join(ss1) + max_space + ''.join(ss2)
            add_lt_text_list.append(TextBox(text=new_text, bbox=bbox))
            delete_lt_text_list.append(lt_text)
    if show:
        print('split_lt_text_by_many_space add_lt_text_list222', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list222', delete_lt_text_list)
    swap_boxes(delete_lt_text_list, add_lt_text_list)

    # ---- pass 3: split a box in two at a wide gap --------------------------
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox
        if len(text) == 0:
            continue
        width = get_char_unicode_length(text)
        if width == 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / width

        # A run of 4+ spaces that is also the only space run of the text.
        match = re.search('[ ]{4,}', text)
        ss = re.split('[ ]+', text)
        if not (match and len(ss) == 2):
            continue

        part1 = text[:match.start()]
        part2 = text[match.end():]
        l1 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part1)
        l2 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part2)
        # Both sides need enough real characters.
        if len(l1) < 2 or len(l2) < 2:
            continue

        part1_pixel_len = ratio * get_char_unicode_length(part1)
        part2_pixel_len = ratio * get_char_unicode_length(part2)
        bbox1 = [bbox[0], bbox[1], bbox[0] + part1_pixel_len, bbox[3]]
        bbox2 = [bbox[2] - part2_pixel_len, bbox[1], bbox[2], bbox[3]]
        add_lt_text_list += [TextBox(text=part1, bbox=bbox1),
                             TextBox(text=part2, bbox=bbox2)]
        delete_lt_text_list.append(lt_text)
    swap_boxes(delete_lt_text_list, add_lt_text_list)
    if show:
        print('split_lt_text_by_many_space add_lt_text_list333', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list333', delete_lt_text_list)
    return lt_text_list
def get_char_unicode_length(text, show=0):
    """Return the display width of ``text`` in terminal cells, never negative.

    ``wcwidth.wcswidth`` reports ``-1`` for the whole string as soon as it
    meets a non-printable character.  Callers divide a pixel width by this
    value, so a negative result would flip the sign of every derived length.
    In that case the width is recomputed per character, counting
    non-printable characters as zero.

    Args:
        text: string to measure.
        show: truthy prints the text and its width.
    """
    width = wcwidth.wcswidth(text)
    if width < 0:
        width = sum(max(wcwidth.wcwidth(char), 0) for char in text)
    if show:
        print('text unicode_length', text, width)
    return width
def fix_final_row(table, show=0):
    """Fold a trailing value-only row into the row above it.

    When the last row holds nothing but a single value (column 1 or column 3;
    every other cell being ``''`` or the ``'@@:'`` placeholder), that value
    overwrites the same column of the previous row and the last row is dropped.

    Args:
        table: list of 4-cell rows of strings.
        show: truthy enables debug printing (the earlier version printed the
            two rows unconditionally).

    Returns:
        ``table`` itself when nothing was merged, otherwise a new list without
        the last row (the previous row is modified in place).
    """
    if len(table) < 2:
        return table

    blank = ('', '@@:')
    final_row = table[-1]
    if show:
        print('final_row', final_row)
        print('last_row', table[-2])

    heads_blank = final_row[0] in blank and final_row[2] in blank
    left_blank = final_row[1] in blank
    right_blank = final_row[3] in blank

    merged = False
    if heads_blank and left_blank and not right_blank:
        table[-2][3] = final_row[3]
        merged = True
        if show:
            print('fix_final_row right', table[-2])
    elif heads_blank and not left_blank and right_blank:
        table[-2][1] = final_row[1]
        merged = True
        if show:
            print('fix_final_row left', table[-2])

    if merged:
        table = table[:-1]
    return table
if __name__ == '__main__':
    # Earlier batch experiment, kept for reference:
    # from format_convert.convert_pdf import PDFConvert
    # pdf_c = PDFConvert(None, None, None)
    # from format_convert.convert_image import ImageProcess
    # img_p = ImageProcess(None, None)
    #
    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*')
    # image_np_list = [[x, cv2.imread(x)] for x in ps]
    # for p, image_np in image_np_list:
    #     # limit the overall resolution
    #     image_np = img_p.resize_process(image_np)
    #     # OCR
    #     text_list, box_list = img_p.ocr_process(image_np)
    #     # convert to lt_text_box
    #     _lt_text_list = text_bbox_to_lt(text_list, box_list)
    #     # pre-check from the bboxes whether a borderless table is likely
    #     _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0)
    #     print('path', p, 'has b table', _flag)

    # Manual check of the line detector on one sample image.
    _pp = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png'
    img111 = cv2.imread(_pp)
    if img111 is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise SystemExit('cannot read image: ' + _pp)
    img111 = pil_resize(img111, 1024, 768)
    get_straight_lines_from_image(img111)
|