fangjiasheng
/
FORMAT_CONVERSION_MAXCOMPUTE


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275
12761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775
17761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275
2276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756
							import copy
import math
import os
import re
import time
import traceback
from glob import glob
import numpy as np
import cv2
import wcwidth
from pdfminer.layout import LTLine
# from botr.nsp.predict import nsp_predict
from sklearn.cluster import KMeans

from botr.rules.get_table_by_rules import get_table_by_rule
from botr.utils import line_iou, get_table_iou
from format_convert.convert_need_interface import from_yolo_interface
from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator


def b_table_process(list_line, list_text_boxes, list_cell, table_location):
    """Fill rule-generated table cells with the text boxes that overlap them.

    Args:
        list_line: Table lines produced by the rule step. Only its truthiness
            is used here: an empty value means no table was found.
        list_text_boxes: Text box objects exposing a mutable ``bbox``
            (``[x1, y1, x2, y2]``) and a ``text`` attribute.
        list_cell: Rows of cells; each cell is indexed as
            ``cell[0] = (x1, y1)`` and ``cell[1] = (x2, y2)``.
        table_location: Table bounding box; indices 1 and 3 are used as the
            top and bottom y coordinates.

    Returns:
        A ``(list_text_boxes, tables, obj_in_table)`` triple.
        ``tables`` holds at most one dict ``{'bbox': ..., 'table': rows}``
        where every cell is ``{'bbox', 'rowspan', 'columnspan', 'text'}``.
        ``obj_in_table`` is the list of text boxes consumed by cells, or an
        empty ``set()`` on the "no table" paths.
        ``([-8], [-8], [-8])`` is returned when an unexpected error occurs.

    Side effects:
        ``bbox[1]`` of text boxes inside the table area is snapped in place
        to the top of the row they were grouped into.
    """
    def _reading_order(t_b):
        # top-to-bottom first, then left-to-right
        return t_b.bbox[1], t_b.bbox[0], t_b.bbox[3], t_b.bbox[2]

    whitespace = re.compile(r'\s')

    try:
        if not list_line:
            return list_text_boxes, [], set()

        # Keep only the text boxes lying vertically inside the table area,
        # with a small tolerance on both edges.
        margin = 7
        top = table_location[1] - margin
        bottom = table_location[3] + margin
        area_list_text_boxes = [t_b for t_b in list_text_boxes
                                if top <= t_b.bbox[1] <= t_b.bbox[3] <= bottom]

        # Nothing to put into cells: report "no table" instead of failing on
        # the row grouping below (which needs at least one box).
        if not area_list_text_boxes:
            return list_text_boxes, [], set()

        # Group boxes into rows. Without this, boxes of one visual row that
        # sit slightly higher or lower than their neighbours would break the
        # text order.
        area_list_text_boxes.sort(key=_reading_order)
        first_bbox = area_list_text_boxes[0].bbox
        current_y = first_bbox[1]
        threshold = max(2., 1 / 3 * abs(first_bbox[3] - current_y))
        for t_b in area_list_text_boxes:
            if current_y - threshold <= t_b.bbox[1] <= current_y + threshold:
                t_b.bbox[1] = current_y
            else:
                current_y = t_b.bbox[1]
        area_list_text_boxes.sort(key=_reading_order)

        # Two-column tables detected by YOLO are ignored here because a
        # dedicated two-column rule handles them (added 250529).
        if list_cell and len(list_cell[0]) == 2:
            return list_text_boxes, [], set()

        # Convert list_cell into the table-dict form, assigning each text box
        # to the first cell it sufficiently overlaps.
        obj_in_table = []
        row_list = []
        for row in list_cell:
            col_list = []
            for col in row:
                cell_bbox = (col[0][0], col[0][1], col[1][0], col[1][1])
                col_dict = {'bbox': cell_bbox,
                            'rowspan': 1, 'columnspan': 1, 'text': ''}
                for t_b in area_list_text_boxes:
                    if t_b in obj_in_table:
                        continue
                    bbox = t_b.bbox
                    iou = get_table_iou(cell_bbox[0], cell_bbox[1], cell_bbox[2], cell_bbox[3],
                                        bbox[0], bbox[1], bbox[2], bbox[3])
                    if iou >= 0.3:
                        col_dict['text'] += whitespace.sub('', t_b.text)
                        obj_in_table.append(t_b)
                col_list.append(col_dict)
            row_list.append(col_list)

        tables = [{'bbox': table_location, 'table': row_list}]
        return list_text_boxes, tables, obj_in_table
    except Exception:
        # Keep the project's error-code convention for unexpected failures.
        traceback.print_exc()
        return [-8], [-8], [-8]


def get_text_box_obj(_text_list, _bbox_list):
    """Build one TextBox per entry of ``_bbox_list``, paired with its text.

    Each bbox is indexed as a sequence of points; point 0 and point 2
    (presumably the top-left and bottom-right corners) are flattened into
    ``[x1, y1, x2, y2]``. ``_text_list`` is read by position, so it must
    provide an entry for every bbox.
    """
    from format_convert.convert_tree import TextBox

    def _build(position, quad):
        # Read the text first, then the corner points.
        content = _text_list[position]
        corner_a, corner_b = quad[0], quad[2]
        return TextBox([corner_a[0], corner_a[1],
                        corner_b[0], corner_b[1]], content)

    return [_build(position, quad) for position, quad in enumerate(_bbox_list)]


def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0):
    """Detect borderless tables with YOLO and rebuild their cells by rules.

    Steps:
        1. Run the YOLO interface on the image to get candidate table boxes.
        2. Drop candidates that vertically overlap an already known table
           in ``table_list`` (each such table exposes ``.bbox``).
        3. Among vertically overlapping candidates keep the one with the
           higher score (index 4 of a candidate).
        4. For every remaining candidate, collect the OCR boxes inside it,
           derive lines/cells with ``get_table_by_rule`` and fill the cells
           with ``b_table_process``.

    Args:
        img: Page image; passed to ``np2bytes`` and, when ``show`` is set,
            drawn on in place with ``cv2.rectangle``.
        table_list: Already detected tables used for the overlap filter.
        text_list: OCR texts, parallel to ``bbox_list``.
        bbox_list: OCR boxes, each indexed as ``bbox[0][1]`` / ``bbox[2][1]``
            for its top / bottom y.
        text_box_list: Returned unchanged as the first result element.
        from_pdf: When YOLO finds nothing and this is truthy, the image is
            handed to ``save_b_table``.
        show: Enables debug prints and OpenCV preview windows.

    Returns:
        ``(text_box_list, table_list, obj_in_table_list)`` where the last two
        are accumulated over all processed candidates, or ``([], [], [])``
        when no candidate survives detection and filtering.
    """
    log('start')
    total_start = time.time()

    # Borderless table detection.
    step_start = time.time()
    img_bytes = np2bytes(img)
    b_table_list = from_yolo_interface(img_bytes)
    log('yolo detect cost: ' + str(time.time() - step_start))
    b_table_list = b_table_list[0]
    if not b_table_list:
        log('detect not b_table_list')
        if from_pdf:
            save_b_table(img)
        return [], [], []

    if show:
        print('b_table_list', b_table_list)
        print('table_list', table_list)

    # Exclude candidates that overlap tables already found (OTR results).
    b_table_location_list = []
    for b_table in b_table_list:
        # Coordinates are clamped exactly as before: mins never exceed
        # 1000000 and maxes never drop below 0.
        top = b_table[1] if b_table[1] < 1000000 else 1000000
        bottom = b_table[3] if b_table[3] > 0 else 0
        left = b_table[0] if b_table[0] < 1000000 else 1000000
        right = b_table[2] if b_table[2] > 0 else 0
        b_loc = [left, top, right, bottom, b_table[4]]

        for known_table in table_list:
            known_bbox = known_table.bbox
            overlap = line_iou([[0, known_bbox[1]], [0, known_bbox[3]]],
                               [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
            if overlap > 0.3:
                break
        else:
            # No known table overlaps this candidate vertically.
            b_table_location_list.append(b_loc)

    if not b_table_location_list:
        log('except otr, not b_table_location_list')
        return [], [], []

    if show:
        print('len(b_table_location_list)', len(b_table_location_list))

    # Among overlapping candidates keep the one with the higher score.
    if len(b_table_location_list) > 1:
        kept = []
        consumed = []
        for position, candidate in enumerate(b_table_location_list):
            if candidate in consumed:
                continue
            rival = None
            for other in b_table_location_list[position + 1:]:
                iou = line_iou([[0, candidate[1]], [0, candidate[3]]],
                               [[0, other[1]], [0, other[3]]], axis=1)
                if show:
                    print('iou2', iou)
                if iou > 0.3:
                    rival = other
                    break
            if rival is None:
                kept.append(candidate[:4])
                continue
            consumed.append(rival)
            winner = candidate if candidate[4] >= rival[4] else rival
            kept.append(winner[:4])
        b_table_location_list = kept

    if show:
        for b_loc in b_table_location_list:
            cv2.rectangle(img, (int(b_loc[0]), int(b_loc[1])), (int(b_loc[2]), int(b_loc[3])),
                          (0, 0, 255), 2)
        cv2.namedWindow('b_table_no_otr', cv2.WINDOW_NORMAL)
        cv2.imshow('b_table_no_otr', img)
        cv2.waitKey(0)

    table_list = []
    obj_in_table_list = []
    for b_loc in b_table_location_list:
        # OCR boxes lying vertically inside the candidate (5px tolerance).
        y_low = b_loc[1] - 5
        y_high = b_loc[3] + 5
        inside = [idx for idx, bbox in enumerate(bbox_list)
                  if y_low <= bbox[0][1] <= bbox[2][1] <= y_high]
        area_bbox_list = [bbox_list[idx] for idx in inside]
        area_text_list = [text_list[idx] for idx in inside]

        # Generate table lines from the OCR boxes by rules.
        step_start = time.time()
        rule_result = get_table_by_rule(img, area_text_list, area_bbox_list,
                                        b_loc, show=show)
        line_list, cell_list, table_location, bbox_text_dict = rule_result
        if not table_location:
            log('get_table_by_rule not table_location')
            continue

        # Take the refreshed texts / boxes returned by the rule step.
        # NOTE(review): eval() runs on the dict keys; this is only safe while
        # the keys are produced internally by get_table_by_rule - consider
        # ast.literal_eval if the key format allows it.
        area_text_list, area_bbox_list = [], []
        for key, value in bbox_text_dict.items():
            area_bbox_list.append(eval(key))
            area_text_list.append(value)
        b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
        log('get_table_by_rule cost: ' + str(time.time() - step_start))

        # Build cells from the table lines.
        step_start = time.time()
        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(
            line_list, b_text_box_list, cell_list, table_location)
        table_list += _table_list
        obj_in_table_list += _obj_in_table_list
        log('b_table_process cost: ' + str(time.time() - step_start))

        if not _table_list:
            log('table_process not table_list')
            continue

        # A former experiment merged vertically adjacent cells of one column
        # with an NSP (next sentence prediction) model at this point; it is
        # disabled.

    log('get_table finish ' + str(time.time() - total_start))
    return text_box_list, table_list, obj_in_table_list


def save_b_table(image_np,
                 save_dir='/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect',
                 max_index=20000):
    """Save an image on which YOLO found no borderless table, for later review.

    Files are named ``<index>-<md5>.png`` where ``index`` is one more than the
    highest index already present in ``save_dir`` and ``md5`` is read from the
    project's ``_global`` store.

    Args:
        image_np: Image handed to ``cv2.imwrite``.
        save_dir: Target directory. Nothing is saved when it does not exist.
        max_index: Nothing is saved once the next index would exceed this.

    Returns:
        None.
    """
    if not os.path.exists(save_dir):
        return

    # Read the numeric prefix from the file name only, and skip files that do
    # not follow the "<index>-..." pattern, so a stray file in the directory
    # cannot break the saving.
    used_indexes = []
    for file_path in glob(save_dir + '/*'):
        match = re.match(r'(\d+)-', os.path.basename(file_path))
        if match:
            used_indexes.append(int(match.group(1)))
    index = max(used_indexes) + 1 if used_indexes else 0
    if index > max_index:
        return

    # md5 of the file being converted
    from format_convert import _global
    _md5 = _global.get("md5")

    _image_path = save_dir + '/' + str(index) + '-' + str(_md5) + '.png'
    # cv2.imwrite reports failure through its return value.
    if cv2.imwrite(_image_path, image_np):
        log('save yolo not detect b_table image success!')
    else:
        log('save yolo not detect b_table image failed: ' + _image_path)


@memory_decorator
def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0):
    start_time = time.time()

    # print('len(lt_text_list)', len(lt_text_list))
    # for lt_text in lt_text_list:
    #     print('lt_text', lt_text)

    # 新增冒号提前判断
    colon_cnt = 0
    for lt_text in lt_text_list:
        if re.search('[：:]', lt_text.get_text()):
            colon_cnt += 1
    if colon_cnt <= 6:
        log('pre judge colon_cnt <= 6')
        return [], []

    # 图片类型，限制lt_text_list个数，并且很多是单字的
    if image_np is not None and len(lt_text_list) >= 60:
        single_char_cnt = 0
        for lt_text in lt_text_list:
            if len(lt_text.get_text()) <= 1:
                single_char_cnt += 1
        # log('len(lt_text_list), single_char_cnt ' + str(len(lt_text_list)) + ' ' + str(single_char_cnt))
        if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list):
            return [], []

    # raise
    # 有些确定为非表格，也输出，防止后续YOLO判断为表格，搞乱数据
    not_b_table_list = []

    layout_h = int(layout_bbox[3])
    layout_w = int(layout_bbox[2])

    if show:
        print('layout_w, layout_h', layout_w, layout_h)
        show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8)

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image origin', image_np_show)
        cv2.waitKey(0)

    # pdf类型预处理
    start_time1 = time.time()
    if image_np is None:
        # 把单个lt_text中，中间多个空格分割的分开
        lt_text_list = split_lt_text_by_many_space(lt_text_list)

        if show:
            for lt_text in lt_text_list:
                bbox = [int(x) for x in lt_text.bbox]
                cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255))
            cv2.imshow('pdf preprocess', show_image)
            cv2.waitKey(0)
        # log('get_b_table_by_blank_colon pdf preprocess cost: ' + str(time.time()-start_time1))

    # 图片类型预处理
    start_time1 = time.time()
    if image_np is not None:
        # 删除空的
        start_time2 = time.time()
        lt_text_list = delete_empty_bbox(lt_text_list)
        # print('delete_empty_bbox cost: ', time.time()-start_time2)

        # ocr识别的文本框需处理后紧贴文本，才能依靠空白分行
        start_time2 = time.time()
        new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list])
        # print('shrink_bbox cost: ', time.time()-start_time2)
        start_time2 = time.time()
        for i, lt_text in enumerate(lt_text_list):
            lt_text.bbox = new_bbox_list[i]
        # print('lt_text.bbox = new_bbox_list[i] cost: ', time.time()-start_time2)
        # log('get_b_table_by_blank_colon image preprocess1 cost: ' + str(time.time()-start_time1))

    # 计算单字平均距离
    start_time1 = time.time()
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt

    # 图片类型预处理2
    if image_np is not None:
        # ocr识别的表格的值可能因空格分开，合并
        lt_text_list = merge_same_bbox(lt_text_list, avg_char_width)

        # bbox交叉，修复
        lt_text_list = fix_cross_bbox(lt_text_list)
        # log('get_b_table_by_blank_colon image preprocess2 cost: ' + str(time.time()-start_time1))

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image preprocess', image_np_show)
        cv2.waitKey(0)

    if show:
        for lt_text in lt_text_list:
            print('lt_text', lt_text)

    # 过滤xy值过大过小的
    temp_list = []
    for lt_text in lt_text_list:
        if min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000:
            continue
        temp_list.append(lt_text)
    lt_text_list = temp_list

    if show:
        for lt_text in lt_text_list:
            cv2.rectangle(show_image,
                          (int(lt_text.bbox[0]), int(lt_text.bbox[1])),
                          (int(lt_text.bbox[2]), int(lt_text.bbox[3])),
                          (0, 0, 255)
                          )
        for table in table_list:
            cv2.rectangle(show_image,
                          (int(table.bbox[0]), int(table.bbox[1])),
                          (int(table.bbox[2]), int(table.bbox[3])),
                          (0, 255, 0)
                          )

    # 计算单字平均距离
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt
    if show:
        print('avg_char_width', avg_char_width)

    if image_np is None:
        blank_width = 1 * avg_char_width
    else:
        blank_width = 1 * avg_char_width
    if show:
        print('blank_width', blank_width)

    # 根据有边框表格位置，将该页分为多个区域
    table_h_list = []
    area_h_list = []
    area_start_h = 0
    table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3]))
    for table in table_list:
        table_h_list.append([table.bbox[1], table.bbox[3]])
        area_h_list.append([area_start_h, table.bbox[1]])
        area_start_h = table.bbox[3]
    area_h_list.append([area_start_h, layout_h])

    if show:
        for min_h, max_h in area_h_list:
            print('area_h_list', min_h, max_h)
            cv2.rectangle(show_image,
                          (0, int(min_h)),
                          (layout_w, int(max_h)),
                          (255, 0, 0)
                          )

    lt_text_area_list = []
    for area_min_h, area_max_h in area_h_list:
        sub_area = []
        for lt_text in lt_text_list:
            if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h:
                sub_area.append(lt_text)
        lt_text_area_list.append(sub_area)
    if show:
        print('len(lt_text_area_list)', len(lt_text_area_list))

    # 每个区域分别进行判断无边框表格
    result_table_list = []
    start_time1 = time.time()
    for sub_lt_text_list in lt_text_area_list:
        start_time2 = time.time()
        lt_text_row_list = get_text_row_by_blank(sub_lt_text_list, layout_h)
        # log('get_text_row_by_blank cost: ' + str(time.time()-start_time2))

        # 有补充的占位lt_text,需添加到lt_text_list
        for row in lt_text_row_list:
            for lt_text in row:
                if lt_text not in lt_text_list:
                    lt_text_list.append(lt_text)

        if show:
            for row in lt_text_row_list:
                print('row', row)

        start_time2 = time.time()
        b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list)
        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))

        # 确定区域后，对表格内重新分行，更精准
        start_time2 = time.time()
        table_lt_text_row_list = []
        for bi, b_table in enumerate(b_table_list1):
            b_table_bbox = b_table_bbox_list1[bi]
            sub_lt_text_list = []
            for lt_text in lt_text_list:
                if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]:
                    sub_lt_text_list.append(lt_text)
            _lt_text_row_list, center_blank_row = get_text_row_by_center_blank(b_table, sub_lt_text_list, blank_width,
                                                                               layout_h)
            table_lt_text_row_list += _lt_text_row_list
        # log('get_text_row_by_center_blank cost: ' + str(time.time()-start_time2))

        start_time2 = time.time()
        b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list)
        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))

        if show:
            for b_table in b_table_list3:
                print('b_table3', b_table)

        # 对大致的表格进行列判断，表格内不同列的框不能交叉，可以重合，需有一定空白
        start_time2 = time.time()
        b_table_list2 = []
        for b_table in b_table_list3:

            blank_row_list = get_blank_row(b_table, blank_width)
            if show:
                print('b_table get_blank_row b_table_list3', b_table)
                print('blank_row_list b_table_list3', blank_row_list)

            b_table2 = []
            for bi, lt_text_row1 in enumerate(b_table[:-1]):
                lt_text_row2 = b_table[bi + 1]
                # if row1_row2_has_same_col(lt_text_row1, lt_text_row2):
                if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]):
                    if lt_text_row1 not in b_table2:
                        b_table2.append(lt_text_row1)
                    if lt_text_row2 not in b_table2:
                        b_table2.append(lt_text_row2)
                else:
                    # print('not cross blank', blank_row_list[bi], blank_row_list[bi + 1])
                    if len(b_table2) >= 2:
                        b_table_list2.append(b_table2)
                    b_table2 = []
            if len(b_table2) >= 2:
                b_table_list2.append(b_table2)
        # log('get_blank_row cost: ' + str(time.time()-start_time2))

        if show:
            for b_table2 in b_table_list2:
                print('b_table2')
                for lt_text_row in b_table2:
                    print('b_table2 lt_text_row', lt_text_row)

        start_time2 = time.time()
        for bi, b_table2 in enumerate(b_table_list2):
            # 根据冒号得到表格
            start_time3 = time.time()
            table2, center_blank_row, _not_b_table_bbox_list, table_bbox \
                = get_b_table_by_colon(b_table2, blank_width)
            log('get_b_table_by_colon cost: ' + str(time.time()-start_time3))
            not_b_table_list += [[[], x] for x in _not_b_table_bbox_list]

            if show and center_blank_row:
                print('show center_blank_row', center_blank_row)
                bx = int((center_blank_row[2] + center_blank_row[0]) / 2)
                by = int((center_blank_row[3] + center_blank_row[1]) / 2)
                br = int((center_blank_row[2] - center_blank_row[0]) / 2)
                if br <= 5:
                    br = 5
                print('bx, by, br', bx, by, br)
                cv2.circle(show_image, (bx, by), br, (0, 255, 0))

            if show:
                min_w, min_h, max_w, max_h = table_bbox
                cv2.rectangle(show_image,
                              (int(min_w), int(min_h)),
                              (int(max_w), int(max_h)),
                              (0, 255, 0)
                              )

            # 修复最后一行跨行
            # table2 = fix_final_row(table2)

            # 表格末尾有些只有一列的需补充
            table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            # table格式转化
            table2 = table_list_to_dict(table2)

            # 表格一些标准化，比如去掉占位符
            table2 = standard_table(table2)

            if table2:
                result_table_list.append([table2, table_bbox])
        # log('colon, add, standard cost: ' + str(time.time()-start_time2))

    # log('get_b_table_by_blank_colon area get b_table cost: ' + str(time.time()-start_time1))

    if show:
        cv2.namedWindow("final result", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("final result", 768, 1024)
        cv2.imshow('final result', show_image)
        cv2.waitKey(0)

    if show:
        for table in result_table_list:
            print('get_b_table_by_bbox table ', table)

        for not_table_bbox in not_b_table_list:
            print('not_table bbox ', not_table_bbox)

    # log('get_b_table_by_blank_colon cost: ' + str(time.time()-start_time))
    return result_table_list, not_b_table_list


def get_b_table_by_lt_text_row(lt_text_row_list, show=0):
    """Group consecutive multi-column text rows into rough table candidates.

    A run of adjacent rows that each contain at least two text boxes becomes
    one candidate, provided the run is at least two rows long. Any row with
    fewer than two boxes ends the current run.

    :param lt_text_row_list: list of rows; each row is a list of objects whose
        ``bbox`` is indexable as ``[x1, y1, x2, y2]``.
    :param show: when truthy, print every row of every candidate.
    :return: ``(b_table_list, b_table_bbox_list)``; the second list holds, at
        the same index, the ``[x1, y1, x2, y2]`` box enclosing all text boxes
        of the candidate.
    """
    b_table_list1 = []
    current_run = []
    for lt_text_row in lt_text_row_list:
        if len(lt_text_row) >= 2:
            current_run.append(lt_text_row)
            continue
        # A narrow row closes the run; the run is kept only if it is long enough.
        if len(current_run) >= 2:
            b_table_list1.append(current_run)
        current_run = []
    # The last run is not followed by a closing row, so flush it explicitly.
    if len(current_run) >= 2:
        b_table_list1.append(current_run)

    # Enclosing box of every candidate.
    b_table_bbox_list = []
    for candidate in b_table_list1:
        boxes = [lt_text.bbox for row in candidate for lt_text in row]
        b_table_bbox_list.append([
            min(box[0] for box in boxes),
            min(box[1] for box in boxes),
            max(box[2] for box in boxes),
            max(box[3] for box in boxes),
        ])

    if show:
        for candidate in b_table_list1:
            print('b_table')
            for lt_text_row in candidate:
                print('b_table lt_text_row', lt_text_row)
    return b_table_list1, b_table_bbox_list


def row1_row2_has_same_col(row1, row2):
    """Check that no text box of ``row1`` crosses a text box of ``row2`` horizontally.

    A pair of boxes is acceptable when the two are separated horizontally by
    at least ``blank_len`` pixels, or when one lies inside the other's x range
    widened by ``threshold`` pixels on both sides.

    :param row1: list of objects whose ``bbox`` is indexable as ``[x1, y1, x2, y2]``.
    :param row2: same shape as ``row1``.
    :return: ``True`` when every pair is acceptable (also for empty rows),
        ``False`` as soon as one pair crosses.
    """
    threshold = 5
    blank_len = 2

    def acceptable(box1, box2):
        # Clearly separated, in either left/right order.
        if box2[0] - box1[2] >= blank_len or box1[0] - box2[2] >= blank_len:
            return True
        # box2 sits inside box1's widened x range.
        if box1[0] - threshold <= box2[0] < box2[2] <= box1[2] + threshold:
            return True
        # box1 sits inside box2's widened x range.
        return box2[0] - threshold <= box1[0] < box1[2] <= box2[2] + threshold

    for lt_text1 in row1:
        for lt_text2 in row2:
            if not acceptable(lt_text1.bbox, lt_text2.bbox):
                return False
    return True


def get_blank_row(lt_text_row_list, blank_min_width, show=0):
    """Compute, for every text row, the horizontal blanks between its text boxes.

    Each row is sorted in place by left x coordinate. For every box, blanks
    towards each box that starts at or after its right edge are generated, and
    only the narrowest one is kept. A row contributes its blanks only when at
    least one of them is ``blank_min_width`` wide or more; otherwise (and for
    rows with fewer than two boxes) it contributes an empty list.

    :param lt_text_row_list: list of rows of objects exposing ``bbox``
        (``[x1, y1, x2, y2]``) and, for debug output, ``get_text()``.
    :param blank_min_width: minimum width one blank of a row must reach.
    :param show: when truthy, print the generated blanks.
    :return: list aligned with ``lt_text_row_list``; each item is a list of
        ``[x_min, y_min, x_max, y_max]`` blanks, possibly empty.
    """
    blank_row_list = []
    for lt_text_row in lt_text_row_list:
        # In-place sort: the caller's row is reordered left to right.
        lt_text_row.sort(key=lambda x: x.bbox[0])
        if len(lt_text_row) < 2:
            blank_row_list.append([])
            continue

        blank_row = []
        for left in lt_text_row:
            gaps = []
            for right in lt_text_row:
                # Skip the box itself and anything starting before this box ends
                # (blanks are only built from left to right).
                if left == right or left.bbox[2] > right.bbox[0]:
                    continue
                left_line = ((left.bbox[0], 0), (left.bbox[2], 0))
                right_line = ((right.bbox[0], 0), (right.bbox[2], 0))
                if line_iou(left_line, right_line) > 0:
                    continue
                xs = (left.bbox[2], right.bbox[0])
                ys = (left.bbox[3], right.bbox[1])
                gaps.append([min(xs), min(ys), max(xs), max(ys)])
                if show:
                    print('sub_row', left.get_text(), right.get_text(), gaps[-1])

            if not gaps:
                continue
            # Keep only the narrowest blank found for this box.
            gaps.sort(key=lambda x: abs(x[0] - x[2]))
            if show:
                print('sub_row[-1]', left.get_text(), gaps[-1])
            blank_row.append(gaps[0])

        # The row counts only if at least one blank reaches the minimum width.
        has_wide_blank = any(abs(r[2] - r[0]) >= blank_min_width for r in blank_row)
        blank_row_list.append(blank_row if has_wide_blank else [])

    return blank_row_list


def row1_row2_has_same_blank(row1, row2):
    """Tell whether any blank of ``row1`` overlaps any blank of ``row2`` along x.

    Only the x extent (items ``0`` and ``2`` of each blank) is compared, and
    touching end points count as an overlap. One overlapping pair is enough.

    :param row1: list of blanks, each indexable as ``[x_min, y_min, x_max, y_max]``.
    :param row2: same shape as ``row1``.
    :return: ``True`` if at least one pair overlaps, ``False`` otherwise
        (including when either list is empty).
    """
    def x_overlap(blank1, blank2):
        # An end point of one blank falls inside the other's x range.
        return (blank1[0] <= blank2[0] <= blank1[2]
                or blank1[0] <= blank2[2] <= blank1[2]
                or blank2[0] <= blank1[0] <= blank2[2]
                or blank2[0] <= blank1[2] <= blank2[2])

    return any(x_overlap(blank1, blank2) for blank1 in row1 for blank2 in row2)


@memory_decorator
def get_b_table_by_colon(b_table, blank_width, show=0):
    """Build table rows from a borderless-table candidate using colons and the centre blank.

    The candidate is rejected, returning an empty table, when any of these holds:
    a row has more than two columns containing a colon; some row (other than a
    trailing one, which is dropped first) does not have 2, 3 or 4 columns; fewer
    than half as many texts as rows contain both a colon and a CJK character;
    no centre blank can be chosen; or the split into two columns yields nothing.

    :param b_table: list of text rows; texts are used through ``bbox`` and
        ``get_text()``. Rows are sorted in place by left x coordinate.
    :param blank_width: minimum blank width, passed on to ``get_blank_row`` and
        ``choose_center_blank``.
    :param show: when truthy, print intermediate results.
    :return: ``(rows, center_blank_row, not_table_bbox_list, table_bbox)``.
        ``rows`` is ``[]`` and ``center_blank_row`` is ``None`` on rejection.
        Each accepted row is the first two items of the left-column entry
        followed by the first two items of the right-column entry
        (presumably ``[head, value]`` pairs -- confirm against
        ``set_head_value_in_col``).
    """
    table_bbox = get_table_bbox(b_table)

    # Regions judged not to be tables are reported too, so that the later YOLO
    # pass does not classify them as tables and scramble the data.
    not_table_bbox_list = []

    def rejected():
        # Reads table_bbox at call time, so it reflects a dropped last row.
        return [], None, not_table_bbox_list, table_bbox

    # Every row must have 2 to 4 columns; texts overlapping horizontally count
    # as one column. Also count, per row, the columns that contain a colon.
    row_cnt_list = []
    head_cnt_list = []
    for row in b_table:
        if not row:
            continue
        row.sort(key=lambda x: (x.bbox[0]))
        col_cnt = 1
        head_cnt = 1 if re.search('[：:]', row[0].get_text()) else 0
        for left_col, right_col in zip(row, row[1:]):
            left_line = [(left_col.bbox[0], 0), (left_col.bbox[2], 0)]
            right_line = [(right_col.bbox[0], 0), (right_col.bbox[2], 0)]
            if line_iou(left_line, right_line) >= 0.5:
                # Same column as the previous text.
                continue
            col_cnt += 1
            if re.search('[：:]', right_col.get_text()):
                head_cnt += 1
        row_cnt_list.append(col_cnt in [2, 3, 4])
        head_cnt_list.append(head_cnt)

    if show:
        print('row_cnt_list', row_cnt_list)
        print('head_cnt_list', head_cnt_list)

    if max(head_cnt_list) > 2:
        if show:
            for row in b_table:
                print('head_cnt_list row', row)
        return rejected()

    # A trailing date line may break the column count; drop that last row.
    if row_cnt_list[-1] is False:
        row_cnt_list = row_cnt_list[:-1]
        b_table = b_table[:-1]
        table_bbox = get_table_bbox(b_table)

    # All remaining rows must have passed the column-count check.
    if set(row_cnt_list) != {True}:
        return rejected()

    # Enough texts must contain a colon together with a CJK character.
    colon_cnt = 0
    for lt_text_row in b_table:
        for lt_text in lt_text_row:
            if re.search('[:：]', lt_text.get_text()) and re.search('[\u4e00-\u9fff]', lt_text.get_text()):
                colon_cnt += 1
    if show:
        print('colon_cnt, len(table)', colon_cnt, len(b_table))
    if colon_cnt < len(b_table) / 2:
        return rejected()

    blank_row_list = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row colon', b_table)
        print('blank_row_list colon', blank_row_list)

    center_blank_row = choose_center_blank(blank_row_list, blank_width)
    if show:
        print('center_blank_row', center_blank_row)
    if not center_blank_row:
        return rejected()

    # Split the texts into two columns around the centre blank.
    col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row)
    if not col_list1 and not col_list2:
        # Not a table (typically a single column whose key and value sit far
        # apart on one line); record it so the later YOLO pass ignores it too.
        not_table_bbox_list.append(get_table_bbox(b_table))
        return rejected()

    # Assign head/value inside each column.
    col_key_value_list1 = set_head_value_in_col(col_list1, col_list2)
    col_key_value_list2 = set_head_value_in_col(col_list2, col_list1)

    # Zip the two columns into rows, padding the shorter one with empty cells.
    row_total = max(len(col_key_value_list1), len(col_key_value_list2))
    b_table_row_list = []
    for i in range(row_total):
        left_pair = col_key_value_list1[i] if i < len(col_key_value_list1) else ["", ""]
        right_pair = col_key_value_list2[i] if i < len(col_key_value_list2) else ["", ""]
        b_table_row_list.append(left_pair[:2] + right_pair[:2])

    # Repair misalignment caused by a head and its value being stacked
    # vertically inside the same column.
    b_table_row_list = fix_head_value_match(b_table_row_list)

    if show:
        print('b_table_row_list', b_table_row_list)
    return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox


@memory_decorator
def get_text_row_by_blank(lt_text_list, layout_h, show=0):
    """Split text boxes into rows using the blanks above and below each box.

    Chains ``get_up_down_blank`` (without a centre column) and
    ``get_contain_blank_row``.

    :param lt_text_list: list of text boxes; passed to ``get_up_down_blank``.
    :param layout_h: page/layout height, passed to ``get_contain_blank_row``.
    :param show: when truthy, print the inputs and the resulting rows.
    :return: the list of rows produced by ``get_contain_blank_row``.
    """
    if show:
        for lt_text in lt_text_list:
            print('lt_text_111', lt_text)

    blanks_per_text = get_up_down_blank(lt_text_list)
    lt_text_row_list = get_contain_blank_row(blanks_per_text, layout_h)

    if show:
        for text_row in lt_text_row_list:
            print('lt_text_row', text_row)

    return lt_text_row_list


def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0):
    """Re-split the texts of a table candidate into rows, column by column.

    The centre blank of ``b_table`` is chosen first; its horizontal midpoint is
    then given to ``get_up_down_blank`` so that blanks above/below a text are
    only measured against texts on the same side of that midpoint.

    :param b_table: rows of the table candidate, used to find the centre blank.
    :param lt_text_list: text boxes to regroup into rows.
    :param blank_width: minimum blank width for ``get_blank_row`` /
        ``choose_center_blank``.
    :param layout_h: layout height, passed to ``get_contain_blank_row``.
    :param show: when truthy, print intermediate results.
    :return: ``(lt_text_row_list, center_blank_row)``, or ``([], [])`` when no
        centre blank is found.
    """
    # Blanks of every row, then the centre blank among them.
    row_blanks = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row center_blank', b_table)
        print('blank_row_list center_blank', row_blanks)

    center_blank_row = choose_center_blank(row_blanks, blank_width)
    if show:
        print('center_blank_row center', center_blank_row)
    if not center_blank_row:
        return [], []

    # Horizontal midpoint of the centre blank separates left from right column.
    center_x = (center_blank_row[2] + center_blank_row[0]) / 2

    blanks_per_text = get_up_down_blank(lt_text_list, center_x=center_x)
    lt_text_row_list = get_contain_blank_row(blanks_per_text, layout_h)

    if show:
        for text_row in lt_text_row_list:
            print('lt_text_row center', text_row)

    return lt_text_row_list, center_blank_row


def table_list_to_dict(table):
    """Wrap every cell of a table in a dict with unit row and column spans.

    :param table: iterable of rows, each an iterable of cell values.
    :return: list of rows; each cell becomes
        ``{'rowspan': 1, 'columnspan': 1, 'text': <cell>}``.
    """
    return [
        [{'rowspan': 1, 'columnspan': 1, 'text': cell} for cell in row]
        for row in table
    ]


@memory_decorator
def get_up_down_blank(lt_text_list, center_x=None, show=0):
    """Attach to every text box the vertical blank above it and the one below it.

    ``lt_text_list`` is sorted in place by ``(bbox[1], bbox[0])``. For each box,
    the other boxes lying entirely below (their ``bbox[1]`` greater than this
    box's ``bbox[3]``) or entirely above it are considered, restricted to:

    * boxes with ``line_iou(...) > 0`` against it on the x axis, when
      ``center_x`` is ``None``;
    * boxes whose horizontal midpoint is on the same side of ``center_x``,
      otherwise.

    The down blank comes from the first qualifying box in sorted order and the
    up blank from the last one. When no box qualifies, a blank as tall as the
    text itself is used (the upper one clamped at 0).

    :param lt_text_list: list of objects exposing ``bbox`` (``[x1, y1, x2, y2]``).
    :param center_x: optional x coordinate splitting left and right columns.
    :param show: when truthy, print each text with its blanks.
    :return: list of ``[lt_text, [up_start, up_end], [down_start, down_end]]``.
    """
    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
    lt_text_blank_list = []
    for lt_text1 in lt_text_list:
        line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
        if center_x is not None:
            # 0 = left of (or on) the centre line, 1 = right of it.
            side1 = 0 if (lt_text1.bbox[0] + lt_text1.bbox[2]) / 2 <= center_x else 1

        up_blank_list = []
        down_blank_list = []
        for lt_text2 in lt_text_list:
            if lt_text1 == lt_text2:
                continue

            if center_x is None:
                # No centre column: the two boxes must overlap horizontally.
                line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
                same_column = line_iou(line1, line2) > 0
            else:
                # Centre column given: the two boxes must be on the same side.
                side2 = 0 if (lt_text2.bbox[0] + lt_text2.bbox[2]) / 2 <= center_x else 1
                same_column = side1 == side2

            if lt_text2.bbox[1] > lt_text1.bbox[3] and same_column:
                down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
            if lt_text2.bbox[3] < lt_text1.bbox[1] and same_column:
                up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])

        # Nothing found: fall back to a blank as tall as the text itself.
        text_h = abs(lt_text1.bbox[3] - lt_text1.bbox[1])
        if not up_blank_list:
            up_blank_list.append([max(0, lt_text1.bbox[1] - text_h), lt_text1.bbox[1]])
        if not down_blank_list:
            down_blank_list.append([lt_text1.bbox[3], lt_text1.bbox[3] + text_h])

        up_blank = up_blank_list[-1]
        down_blank = down_blank_list[0]

        if show:
            print('lt_text1.get_text()', lt_text1.get_text(), lt_text1.bbox)
            if center_x is not None:
                print('center_x', center_x)
            print('up_blank', up_blank)
            print('down_blank', down_blank)

        lt_text_blank_list.append([lt_text1, up_blank, down_blank])
    return lt_text_blank_list


@memory_decorator
def filter_large_blank_row(lt_text_blank_list, layout_h, show=0):
    """Pick out text boxes surrounded by very large blanks; each becomes its own row.

    ``lt_text_blank_list`` is sorted in place by the text's ``(bbox[1], bbox[0])``.
    With ``max_blank_h = layout_h / 6``, a text is singled out when the first
    matching rule of this chain holds:

    1. it is among the last four entries and its upper blank is at least
       ``max_blank_h`` tall;
    2. it is among the first three entries and its lower blank is at least
       ``max_blank_h`` tall;
    3. its index is between 2 and ``len - 4`` and the span from the start of
       its upper blank to the end of its lower blank is at least
       ``max_blank_h`` -- and no other text lies vertically within its own
       extent widened by 20.

    :param lt_text_blank_list: list of ``[lt_text, up_blank, down_blank]``.
    :param layout_h: layout height used to derive the blank threshold.
    :param show: when truthy, print which rule matched.
    :return: ``(lt_text_row_list, single_lt_text_list)``: the one-text rows and
        the flat list of the same texts.
    """
    max_blank_h = layout_h / 6
    threshold = 20
    lt_text_blank_list.sort(key=lambda x: (x[0].bbox[1], x[0].bbox[0]))
    total = len(lt_text_blank_list)

    lt_text_row_list = []
    single_lt_text_list = []
    for index, (lt_text1, up_blank1, down_blank1) in enumerate(lt_text_blank_list):
        match_flag = 0
        if index >= total - 4 \
                and abs(up_blank1[0] - up_blank1[1]) >= max_blank_h:
            # Near the bottom: judge by the blank above.
            if show:
                print('match single lt_text 1')
            match_flag = 1
        elif index <= 2 \
                and abs(down_blank1[0] - down_blank1[1]) >= max_blank_h:
            # Near the top: judge by the blank below.
            if show:
                print('match single lt_text 2')
            match_flag = 1
        elif 2 <= index <= total - 4 \
                and abs(up_blank1[0] - down_blank1[1]) >= max_blank_h:
            # In the middle: judge both blanks together, and require that no
            # other text shares the same line.
            has_same_row = False
            for lt_text2, _, _ in lt_text_blank_list:
                if lt_text1 == lt_text2:
                    continue
                if lt_text1.bbox[1] - threshold <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= lt_text1.bbox[3] + threshold:
                    has_same_row = True
                    break
            match_flag = 0 if has_same_row else 1
            if show:
                print('match single lt_text 3')

        if match_flag:
            lt_text_row_list.append([lt_text1])
            single_lt_text_list.append(lt_text1)

    if show:
        print('single_lt_text_list', single_lt_text_list)
    return lt_text_row_list, single_lt_text_list


@memory_decorator
def get_contain_blank_row(lt_text_blank_list, layout_h, show=0):
    """Group text boxes into rows by comparing the blanks above and below them.

    Steps:

    1. ``filter_large_blank_row`` singles out texts surrounded by very large
       blanks; each of them is a row on its own and takes no further part.
    2. For every remaining text, any other text whose upper blank lies inside
       this text's upper blank, or whose lower blank lies inside this text's
       lower blank (both with a 5-unit tolerance), is put in the same row.
    3. A row with three or more colon-bearing texts is assumed to have
       swallowed a separate line: ``find_outline_lt_text`` picks the texts to
       move out, and each of them goes to a new two-cell row next to a
       ``TextBox`` placeholder with text ``"@@:"``.
    4. Rows are merged with ``merge_intersecting_lists``, ordered by the
       ``bbox[1]`` of their first text, and rows whose joined text is only
       whitespace are discarded.

    :param lt_text_blank_list: list of ``[lt_text, up_blank, down_blank]`` as
        produced by ``get_up_down_blank``; texts must be hashable.
    :param layout_h: layout height, passed to ``filter_large_blank_row``.
    :param show: when truthy, print timings and intermediate rows.
    :return: list of rows, each a list of text boxes (placeholders included).
    """
    from format_convert.convert_tree import TextBox

    lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h)
    single_lt_text_list = set(single_lt_text_list)

    # Texts whose blanks contain each other belong to the same row.
    time1 = time.time()
    threshold = 5
    used_lt_text_list = set()
    another_used_lt_text_list = set()
    for lt_text1, up_blank1, down_blank1 in lt_text_blank_list:
        time2 = time.time()
        if lt_text1 in single_lt_text_list:
            continue

        row = []
        for lt_text2, up_blank2, down_blank2 in lt_text_blank_list:
            if lt_text1 == lt_text2:
                continue
            # Already moved out to a placeholder row.
            if lt_text2 in another_used_lt_text_list:
                continue
            # Already in some row and entirely above the current text.
            if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]:
                continue
            if lt_text2 in single_lt_text_list:
                continue

            # Upper blank contained in upper blank, or lower blank contained in
            # lower blank (one direction only: text2's blank inside text1's).
            up_contained = up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold
            down_contained = down_blank1[0] - threshold <= down_blank2[0] <= down_blank2[1] <= down_blank1[1] + threshold
            if up_contained or down_contained:
                if lt_text2 not in row:
                    row.append(lt_text2)
                    used_lt_text_list.add(lt_text2)

        if lt_text1 not in row:
            row.append(lt_text1)

        if show:
            print('get_contain_blank_row loop2 cost:', time.time()-time2)

        # Three colon-bearing texts in one row mean that a separate line was
        # merged in by mistake; split it off again.
        time2 = time.time()
        colon_lt_text = [lt for lt in row if re.search('[:：]', lt.get_text())]
        colon_cnt = len(colon_lt_text)
        if colon_cnt >= 3:
            if show:
                print('colon_cnt >= 3 row', row)

            another_lt_text_list = find_outline_lt_text(row)

            for lt_text in another_lt_text_list:
                if lt_text in row:
                    row.remove(lt_text)
                if lt_text in colon_lt_text:
                    colon_lt_text.remove(lt_text)

            if show:
                print('another_lt_text_list', another_lt_text_list)
                print('colon_lt_text', colon_lt_text)

            # NOTE(review): when every colon text was moved out, neither the
            # remaining row nor the moved-out texts are emitted here -- confirm
            # this is intended.
            if not colon_lt_text:
                continue

            colon_lt_text.sort(key=lambda x: x.bbox[0])
            lt_text_row_list.append(row)
            for another_lt_text in another_lt_text_list:
                if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs(
                        another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]):
                    # Closer to the rightmost colon text: the placeholder takes
                    # the x range of the leftmost one and goes first.
                    new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[0].bbox[2], another_lt_text.bbox[3]]
                    another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text]
                else:
                    # Otherwise the placeholder takes the x range of the
                    # rightmost colon text and goes second.
                    new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]]
                    another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)]
                if show:
                    print('another_row', another_row)
                for lt_text3 in another_row:
                    another_used_lt_text_list.add(lt_text3)
                lt_text_row_list.append(another_row)
        else:
            lt_text_row_list.append(row)

        if show:
            print('get_contain_blank_row judge colon cost:', time.time()-time2)

    if show:
        print('get_contain_blank_row double loop cost: ', time.time()-time1)

    # De-duplicate: longest rows first, then merge rows that share texts.
    lt_text_row_list.sort(key=len, reverse=True)
    if show:
        for lt_text_row in lt_text_row_list:
            print('before dedup lt_text_row', lt_text_row)

    lt_text_row_list = merge_intersecting_lists(lt_text_row_list)

    if show:
        for lt_text_row in lt_text_row_list:
            print('after dedup lt_text_row', lt_text_row)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])

    # Drop rows that contain nothing but whitespace. The pattern is a raw
    # string: '\s' in a plain literal is an invalid escape sequence.
    non_blank_rows = []
    for lt_text_row in lt_text_row_list:
        row_text = "".join(lt_text.get_text() for lt_text in lt_text_row)
        if re.sub(r'\s+', '', row_text) == "":
            continue
        non_blank_rows.append(lt_text_row)
    return non_blank_rows


def choose_center_blank(blank_row_list, blank_width, show=0):
    """Select the central blank shared by the rows of a layout.

    The widest blank over all rows serves as the reference. For every row the
    widest blank of that row is kept when ``line_iou`` of its x-extent against
    the reference x-extent is at least 0.5. The kept blanks are then reduced
    with ``get_inter_part``.

    :param blank_row_list: list of rows; each row is a list of boxes from
        which indices 0 and 2 (x1, x2) are read.
    :param blank_width: the widest blank must be strictly wider than this.
    :param show: truthy to print debugging information.
    :return: ``[]`` when no blank qualifies, otherwise whatever
        ``get_inter_part`` returns for the kept blanks.
    """
    def _width(box):
        return abs(box[0] - box[2])

    def _x_extent(box):
        return ([box[0], 0], [box[2], 0])

    if not blank_row_list:
        return []

    every_blank = [box for blank_row in blank_row_list for box in blank_row]
    if not every_blank:
        return []

    # Reference blank: the widest one (the first encountered on ties).
    max_blank = max(every_blank, key=_width)
    if show:
        print('max_blank', max_blank)
    if _width(max_blank) <= blank_width:
        return []

    max_col = []
    for blank_row in blank_row_list:
        if not blank_row:
            continue

        # Widest blank of this row (the first encountered on ties).
        max_blank_bbox = max(blank_row, key=_width)
        if show:
            print('max_blank_bbox, blank_row', max_blank_bbox, blank_row)

        iou = line_iou(_x_extent(max_blank), _x_extent(max_blank_bbox))
        if iou >= 0.5:
            max_col.append(max_blank_bbox)

    if show:
        print('max_col', max_col)
    if not max_col:
        return []

    # Keep only the part common to all selected blanks.
    return get_inter_part(max_col)


def set_head_value_in_col(col_list1, col_list2, show=0):
    """Split the cells of one column into ``[head, value]`` pairs.

    A cell containing a colon is cut right after its first run of colons:
    the left part (colons included) is the head, the remainder the value.
    A cell without a colon is either remembered as a pending head (when the
    same text also occurs in ``col_list2``) or treated as a value.

    :param col_list1: cell texts of the column being processed.
    :param col_list2: cell texts of the other column, used for membership
        tests only.
    :param show: truthy to print debugging information.
    :return: list of two-item lists ``[head, value]``.
    """
    pairs = []
    pending_head = ""
    for cell in col_list1:
        colon = re.search('[:：]+', cell)

        # Cell with a colon: a pending head (if any) is prefixed to its head.
        if colon:
            head = pending_head + cell[:colon.end()]
            pending_head = ""
            pairs.append([head, cell[colon.end():]])
            continue

        # No colon, but the text also appears in the other column: keep it as
        # a head to be joined with the next cell that has a colon.
        if cell in col_list2:
            if show:
                print('col1 in col_list2')
            # Two such cells in a row: flush the earlier one as its own row.
            if pending_head:
                pairs.append(["", pending_head])
            pending_head = cell
            continue

        # No colon and not in the other column: continuation of a value.
        # NOTE(review): the check looks for a colon in the previous *value*,
        # not in the previous head -- confirm this is the intended condition.
        if pairs and re.search('[:：]', pairs[-1][1]):
            pairs[-1][1] += cell
        else:
            pairs.append(["", cell])

    # A pending head left at the end becomes a row on its own.
    if pending_head:
        pairs.append(["", pending_head])

    if show:
        print('col_key_value_list', pairs)

    return pairs


def divide_2_col_by_center_blank(b_table, center_blank_row, show=0):
    """Split every row of text boxes into a left and a right text.

    A box belongs to the left side when its horizontal centre is not to the
    right of the centre of ``center_blank_row``. The boxes of each side are
    ordered with ``sort_by_read_order`` and their texts concatenated and
    stripped.

    :param b_table: rows of objects exposing ``.bbox`` and ``get_text()``;
        every row is sorted in place by ``bbox[0]``.
    :param center_blank_row: box of the central blank (indices 0 and 2 read).
    :param show: truthy to print debugging information.
    :return: ``(col_list1, col_list2)``; both are empty lists when fewer than
        a third of the texts of either side contain a colon.
    """
    col_list1, col_list2 = [], []
    for lt_text_row in b_table:
        lt_text_row.sort(key=lambda x: x.bbox[0])

        left_col, right_col = [], []
        for lt_text in lt_text_row:
            box_center = (lt_text.bbox[2] + lt_text.bbox[0]) / 2
            blank_center = abs(center_blank_row[0] + center_blank_row[2]) / 2
            side = left_col if box_center <= blank_center else right_col
            side.append(lt_text)

        # Reading order inside each side, then join the texts.
        left_col = sort_by_read_order(left_col)
        left_text = ''.join(x.get_text() for x in left_col)
        right_col = sort_by_read_order(right_col)
        right_text = ''.join(x.get_text() for x in right_col)

        col_list1.append(left_text.strip())
        col_list2.append(right_text.strip())

    if show:
        print('col_list1', col_list1)
        print('col_list2', col_list2)

    # Both sides need enough colon-bearing cells, otherwise this is not
    # regarded as a two-column table.
    colon_cnt1 = sum(1 for col in col_list1 if re.search('[：:]', col))
    colon_cnt2 = sum(1 for col in col_list2 if re.search('[：:]', col))

    too_few_left = colon_cnt1 < len(col_list1) / 3
    if too_few_left or colon_cnt2 < len(col_list2) / 3:
        col_list1 = []
        col_list2 = []
        if show:
            print('col_list1 colon_cnt1 less', colon_cnt1)
            print('col_list2 colon_cnt2 less', colon_cnt2)

    return col_list1, col_list2


def delete_blank_col(b_table_row_list):
    """Drop every column whose cells are all the empty string.

    Columns are identified by position; rows may have different lengths.

    :param b_table_row_list: list of rows, each a list of hashable cells.
    :return: a new list of new rows without the all-empty columns.
    """
    # Gather the cells of each column position.
    cells_by_col = {}
    for row in b_table_row_list:
        for col_i, cell in enumerate(row):
            cells_by_col.setdefault(col_i, []).append(cell)

    # A column is blank when its only distinct value is ''.
    blank_cols = set()
    for col_i, cells in cells_by_col.items():
        distinct = set(cells)
        if len(distinct) == 1 and next(iter(distinct)) == '':
            blank_cols.add(col_i)

    return [
        [cell for col_i, cell in enumerate(row) if col_i not in blank_cols]
        for row in b_table_row_list
    ]


def fix_head_value_match(b_table, show=0):
    """Fold value-only rows into the head-only row that precedes them.

    Only tables whose first row has exactly four cells are processed. A head
    row has colon-bearing cells 0 and 2 and placeholder ('' or '@@:') cells
    1 and 3. The rows directly after it whose cells 0 and 2 are placeholders
    and whose cells 1 and 3 are not are its value rows: their cells are
    concatenated into cells 1 and 3 of the head row and the value rows are
    removed.

    :param b_table: list of rows of strings; matched head rows are modified
        in place.
    :param show: truthy to print debugging information.
    :return: the input itself when nothing applies, otherwise a new list.
    """
    if not b_table or len(b_table[0]) != 4:
        return b_table

    placeholders = ["", '@@:']

    # Pass 1: map each head row index to the indices of its value rows.
    match_head_value_dict = {}
    head_index = None
    for row_i, row in enumerate(b_table):
        if head_index is None:
            if row[1] in placeholders and row[3] in placeholders:
                left_colon = re.search("[:：]", row[0])
                right_colon = re.search("[:：]", row[2])
                if left_colon and right_colon:
                    head_index = row_i
            continue

        heads_blank = row[0] in placeholders and row[2] in placeholders
        if heads_blank and row[1] not in placeholders and row[3] not in placeholders:
            match_head_value_dict.setdefault(head_index, []).append(row_i)
        else:
            # The run of value rows ended; this row is not re-examined.
            head_index = None

    if show:
        print('match_head_value_dict', match_head_value_dict)

    # Pass 2: build the merged head rows.
    merged_rows = {}
    consumed_rows = set()
    for head_i, value_indices in match_head_value_dict.items():
        head_row = b_table[head_i]
        left_value_text = ""
        right_value_text = ""
        for value_i in value_indices:
            value_row = b_table[value_i]
            consumed_rows.add(value_i)
            left_value_text += "".join(value_row[:2])
            right_value_text += "".join(value_row[2:])
        head_row[1] = left_value_text
        head_row[3] = right_value_text
        merged_rows[head_i] = head_row

    # Pass 3: emit merged heads, skip consumed value rows, keep the rest.
    result = []
    for row_i, row in enumerate(b_table):
        if row_i in merged_rows:
            result.append(merged_rows[row_i])
        elif row_i not in consumed_rows:
            result.append(row)
    return result


def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                  table_lt_text_row_list, show=0):
    """Append text rows lying just below a 4-column table to that table.

    A candidate row is accepted when its bottom edge lies strictly between
    ``table_bbox[3]`` and a window that grows by one mean row pitch (plus a
    tolerance of 5) for every accepted row. Accepted rows are assigned to the
    left or right half relative to the central blank and converted into
    ``[head, value, "", ""]`` or ``["", "", head, value]`` rows.

    :param b_table: list of 4-string rows; extended in place.
    :param table_bbox: ``[x1, y1, x2, y2]`` of the table; index 3 is
        overwritten in place with the bottom of the last row that set it.
    :param center_blank_bbox: box of the central blank (indices 0, 2 read).
    :param lt_text_row_list: candidate rows of objects exposing ``.bbox`` and
        ``get_text()``; sorted in place by the first box's ``bbox[1]``.
    :param table_lt_text_row_list: rows already in the table, used only to
        estimate the mean row pitch.
    :param show: truthy to print debugging information.
    :return: ``b_table``.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Mean distance between the bottoms of consecutive table rows.
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        _, _, _, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort()
    blank_h_list = [max_h_list[i] - max_h_list[i - 1]
                    for i in range(1, len(max_h_list))]
    # With fewer than two rows there is no pitch. NaN (what np.mean([]) would
    # yield, but without its RuntimeWarning) makes the window test below
    # always false, so no row is accepted.
    mean_blank_h = np.mean(blank_h_list) if blank_h_list else float('nan')
    if show:
        print('add_last_rows blank_width_list', blank_h_list)
        print('add_last_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    threshold = 5
    add_blank_h = mean_blank_h + threshold
    for lt_text_row in lt_text_row_list:
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3])
        # The bottom edge must fall inside the window below the table.
        if table_bbox[3] < max_h < table_bbox[3] + add_blank_h:
            # Rows spanning the central blank horizontally are skipped.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                if show:
                    print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue

            # Left half: starts between the table's x1 and the blank's x1.
            if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, max_h])
            # Right half: ends between the blank's x2 and the table's x2.
            elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3:
                match_row_list.append([lt_text_row, 1, max_h])
            else:
                if show:
                    print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3')
                break

            # Widen the window by one more row pitch.
            add_blank_h = add_blank_h + mean_blank_h + threshold

    if show:
        print('add_last_rows match_row_list', match_row_list)

    add_b_table = []
    real_max_h = None
    for match_row in match_row_list:
        lt_text_row, is_right, max_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Single box: split at the first run of colons, if any.
        if len(lt_text_row) == 1:
            text = lt_text_row[0].get_text()
            match = re.search('[:：]+', text)
            real_max_h = max_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        # Two boxes that are really one head split by a blank. endswith() is
        # used so that an empty second text cannot raise IndexError.
        elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
                and lt_text_row[1].get_text().endswith((':', "：")):
            text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
            head = text
            value = ''
        # Two boxes: head followed by value; the head must contain a colon.
        elif len(lt_text_row) == 2:
            text1 = lt_text_row[0].get_text()
            match = re.search('[:：]+', text1)
            if not match:
                break
            real_max_h = max_h
            head = text1
            value = lt_text_row[1].get_text()
        else:
            if show:
                print('add_last_rows len(lt_text_row) break', len(lt_text_row))
            break

        # Previous row: the last added row, or the table's last row when
        # nothing has been added yet.
        last_row = add_b_table[-1] if add_b_table else b_table[-1]

        if is_right:
            if last_row[2] and not last_row[3] and not head and value:
                # NOTE(review): the table's last row is patched here and a
                # row repeating the same head/value is appended as well; the
                # left-hand branch does not patch -- confirm this asymmetry.
                b_table[-1][3] = value
                current_row = ["", "", last_row[2], value]
            else:
                current_row = ["", "", head, value]
        else:
            if last_row[0] and not last_row[1] and not head and value:
                current_row = [last_row[0], value, "", ""]
            else:
                current_row = [head, value, "", ""]

        add_b_table.append(current_row)

        if show:
            print('current_row', current_row)

    if show:
        print('add_b_table', add_b_table)

    b_table += add_b_table
    if real_max_h is not None:
        table_bbox[3] = real_max_h
    return b_table


def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                   table_lt_text_row_list, show=0):
    """Merge text rows straddling the top edge of a 4-column table into it.

    A candidate row is accepted when the table's top ``table_bbox[1]`` lies
    inside its vertical extent and it does not span the central blank.
    Accepted single-box rows without a colon are prepended to the value cell
    (index 1 for the left half, 3 for the right half) of the first table row.

    :param b_table: list of 4-string rows; the first row may be modified in
        place.
    :param table_bbox: ``[x1, y1, x2, y2]`` of the table; index 1 is
        overwritten in place with the top of the last processed single-box
        row.
    :param center_blank_bbox: box of the central blank (indices 0, 2 read).
    :param lt_text_row_list: candidate rows of objects exposing ``.bbox`` and
        ``get_text()``; sorted in place by the first box's ``bbox[1]``.
    :param table_lt_text_row_list: rows already in the table; only used for
        the row-pitch figure printed when ``show`` is set.
    :param show: truthy to print debugging information.
    :return: ``b_table``.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Mean distance between the bottoms of consecutive table rows (debug
    # output only in this function).
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        _, _, _, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort()
    blank_h_list = [max_h_list[i] - max_h_list[i - 1]
                    for i in range(1, len(max_h_list))]
    # NaN mirrors np.mean([]) without triggering its RuntimeWarning.
    mean_blank_h = np.mean(blank_h_list) if blank_h_list else float('nan')
    if show:
        print('add_first_rows blank_width_list', blank_h_list)
        print('add_first_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    for lt_text_row in lt_text_row_list:
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3])
        # The table's top edge must cut through the row vertically.
        if min_h <= table_bbox[1] < max_h:
            # Rows spanning the central blank horizontally are skipped.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                if show:
                    print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue

            # Starts left of the blank's x1.
            if min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, min_h])
            # Ends right of the blank's x2.
            elif center_blank_bbox[2] < max_w:
                match_row_list.append([lt_text_row, 1, min_h])
            else:
                break

    if show:
        print('add_first_rows match_row_list', match_row_list)

    real_min_h = None
    for match_row in match_row_list:
        lt_text_row, is_right, min_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Only single-box rows are handled; anything else stops processing.
        if len(lt_text_row) == 1:
            text = lt_text_row[0].get_text()
            match = re.search('[:：]+', text)
            real_min_h = min_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        else:
            if show:
                print('add_first_rows len(lt_text_row) break', len(lt_text_row))
            break

        # A colon-less text is the beginning of the first row's value.
        if not head and value:
            if is_right:
                b_table[0][3] = value + b_table[0][3]
            else:
                b_table[0][1] = value + b_table[0][1]

    if real_min_h is not None:
        table_bbox[1] = real_min_h
    return b_table


def get_row_bbox(row, mode='list'):
    """Return the box enclosing every box of *row*.

    :param row: non-empty iterable of boxes.
    :param mode: ``'list'`` when each item is itself indexable as
        ``[x1, y1, x2, y2]``; ``'.bbox'`` when each item exposes such a
        sequence as its ``bbox`` attribute.
    :return: tuple ``(min_x, min_y, max_x, max_y)``.
    :raises ValueError: if *mode* is not supported, or if *row* is empty
        (raised by ``min``/``max``).
    """
    if mode == 'list':
        boxes = list(row)
    elif mode == '.bbox':
        boxes = [item.bbox for item in row]
    else:
        # Previously an unknown mode fell through to an UnboundLocalError.
        raise ValueError("unsupported mode: %r (expected 'list' or '.bbox')" % (mode,))

    min_x = min(box[0] for box in boxes)
    min_y = min(box[1] for box in boxes)
    max_x = max(box[2] for box in boxes)
    max_y = max(box[3] for box in boxes)
    return min_x, min_y, max_x, max_y


def shrink_bbox(img, bbox_list):
    """Tighten boxes to the pixels that differ from the image's dominant colour.

    The dominant colour is the most frequent colour of an 8x down-sampled
    copy of the image. Within each box, pixels whose squared colour distance
    to it exceeds ``20 ** 2`` count as content, and the box is shrunk to the
    extent of those pixels.

    :param img: numpy image indexed as ``img[y, x, channel]``; it is reshaped
        to ``(-1, 3)``, so three channels are assumed.
    :param bbox_list: iterable of boxes indexed as ``[x1, y1, x2, y2]``.
    :return: list with one entry per input box: a new ``[x1, y1, x2, y2]``
        list, or the original box object when its crop is empty or contains
        no content pixel.
    """
    def _content_index(image_np, match_color):
        # Squared Euclidean colour distance of every pixel to match_color.
        diff = np.sum((image_np - match_color) ** 2, axis=2)
        threshold = 20  # colour distance; compared in squared form below
        return np.where(diff > threshold ** 2)

    # Count the colours of a down-sampled copy of the image.
    time0 = time.time()
    down_sample_factor = 8
    down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :]
    down_sampled_img_color = down_sampled_img.reshape(-1, 3)
    colors, counts = np.unique(down_sampled_img_color, return_counts=True, axis=0)
    log('shrink_bbox 0 ' + str(time.time()-time0))

    # The most frequent colour is taken as the background.
    time0 = time.time()
    most_frequent_color = colors[np.argmax(counts)].astype(np.int32)
    log('shrink_bbox 1 ' + str(time.time()-time0))

    new_bbox_list = []
    # Signed integers so that the subtraction cannot wrap around.
    img_int = img.astype(np.int32)
    time0 = time.time()
    for bbox in bbox_list:
        img_bbox_int = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]

        if 0 in img_bbox_int.shape:
            new_bbox_list.append(bbox)
            continue

        rows, cols = _content_index(img_bbox_int, most_frequent_color)
        if rows.size == 0:
            new_bbox_list.append(bbox)
            continue

        # np.where lists indices in row-major order, so the row indices are
        # already ascending; the column indices are not, hence min()/max().
        # (One mask suffices; the transposed second pass gave the same
        # column extremes.)
        min_h, max_h = rows[0], rows[-1]
        min_w, max_w = cols.min(), cols.max()

        new_bbox_list.append([bbox[0] + min_w, bbox[1] + min_h,
                              bbox[0] + max_w, bbox[1] + max_h])
    log('shrink_bbox 2 ' + str(time.time() - time0))
    return new_bbox_list


def shrink_bbox_by_pixel(lt_text_list):
    """Halve the height of every box, keeping it vertically centred.

    A quarter of the box height is trimmed from the top and from the bottom;
    the new y values are truncated with ``int``. The x values are untouched.

    :param lt_text_list: iterable of objects with a ``bbox`` attribute
        indexed as ``[x1, y1, x2, y2]``; ``bbox`` is reassigned on each item.
    :return: the same ``lt_text_list`` object.
    """
    for lt_text in lt_text_list:
        box = lt_text.bbox
        top, bottom = box[1], box[3]
        # Half of the height is removed in total, split evenly on both sides.
        trim = abs(bottom - top) / 2 / 2
        lt_text.bbox = [box[0], int(top + trim), box[2], int(bottom - trim)]
    return lt_text_list


def get_inter_part(bbox_list, show=0):
    """Return the region common to all boxes of *bbox_list*.

    The result uses the largest x1/y1 and the smallest x2/y2 over all boxes.
    If the two bounds of an axis cross (the boxes do not overlap on it) they
    are swapped, so the result is always ordered.

    :param bbox_list: list of 4-item boxes ``[x1, y1, x2, y2]``; sorted in
        place by ``(x1, x2)``.
    :param show: truthy to print the result.
    :return: ``None`` for an empty list, otherwise ``[x1, y1, x2, y2]``.
    """
    if not bbox_list:
        return None

    bbox_list.sort(key=lambda x: (x[0], x[2]))

    # Seed with the first box, then tighten with every other box.
    low_x, low_y, high_x, high_y = bbox_list[0]
    for box in bbox_list[1:]:
        low_x = max(low_x, box[0])
        low_y = max(low_y, box[1])
        high_x = min(high_x, box[2])
        high_y = min(high_y, box[3])

    # Order the bounds in case they crossed.
    inter = [min(low_x, high_x), min(low_y, high_y),
             max(low_x, high_x), max(low_y, high_y)]
    if show:
        print('get_inter_part', inter)
    return inter


def get_inter_part_250530(bbox_list, show=0):
    """Unfinished variant of ``get_inter_part``; always returns ``None``.

    The coordinates of the boxes are collected per component and the x lists
    are sorted, but nothing is computed from them yet.

    :param bbox_list: list of boxes indexed as ``[x1, y1, x2, y2]``.
    :param show: unused.
    :return: ``None``.
    """
    if not bbox_list:
        return None

    # Collected in the order x1, x2, y1, y2.
    x1_list, x2_list, y1_list, y2_list = (
        [box[i] for box in bbox_list] for i in (0, 2, 1, 3)
    )

    x1_list.sort(reverse=True)
    x2_list.sort()
    # NOTE(review): the function stops here without producing a result --
    # presumably work in progress; confirm before relying on it.
    return None


def get_straight_lines_from_image(image_np, threshold=50):
    """Detect straight line segments in an image and display them (debug aid).

    The image is converted to grey, edges are extracted with Canny and line
    segments are searched with the probabilistic Hough transform. The edge
    map and the image with the segments drawn in red are shown in windows
    and the call blocks in ``cv2.waitKey(0)`` until a key is pressed.

    :param image_np: BGR image as a numpy array, or ``None``; the detected
        segments are drawn onto it in place.
    :param threshold: accumulator threshold passed to ``cv2.HoughLinesP``.
    :return: ``False`` when ``image_np`` is ``None``, otherwise ``None``.
    """
    if image_np is None:
        print("无法读取图像")
        return False

    # Grey-scale conversion followed by Canny edge detection.
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 20, 150)

    cv2.imshow('edges', edges)

    # Probabilistic Hough transform.
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold,
                            minLineLength=50, maxLineGap=2)

    # HoughLinesP yields None, not an empty array, when nothing is found.
    for line in ([] if lines is None else lines):
        line = line[0]
        print('line', line)
        # Plain int tuples are accepted as points by every OpenCV binding.
        x1, y1, x2, y2 = (int(v) for v in line)
        cv2.line(image_np, (x1, y1), (x2, y2), (0, 0, 255))

    cv2.imshow('img', image_np)
    cv2.waitKey(0)

    print('lines', lines)


def get_table_bbox(table):
    """Return the box enclosing every cell of *table*.

    :param table: rows of objects whose ``bbox`` attribute is indexed as
        ``[x1, y1, x2, y2]``; at least one cell is required.
    :return: list ``[x1, y1, x2, y2]``.
    """
    boxes = [cell.bbox for row in table for cell in row]
    return [
        min(box[0] for box in boxes),
        min(box[1] for box in boxes),
        max(box[2] for box in boxes),
        max(box[3] for box in boxes),
    ]


@memory_decorator
def merge_intersecting_lists(lists):
    """Merge lists that share at least one element.

    The lists are processed in order. Each one is united (as a set) with the
    first already-merged group it shares an element with; if there is none it
    starts a new group as a copy of itself. A list is merged into at most one
    group, so two existing groups are never joined with each other.

    :param lists: iterable of lists of hashable items.
    :return: list of groups; a group that received a merge is a list built
        from a set, so its element order is not defined.
    """
    merged_lists = []
    for current_list in lists:
        current_set = set(current_list)
        for idx, group in enumerate(merged_lists):
            group_set = set(group)
            # Shared element found: unite and stop looking.
            if current_set & group_set:
                merged_lists[idx] = list(group_set.union(current_set))
                break
        else:
            # No group shares an element with this list.
            merged_lists.append(current_list.copy())
    return merged_lists


def merge_same_bbox(lt_text_list, avg_char_width, show=0):
    """Join a short colon-less text box with the colon-bearing box right of it.

    Two boxes are joined when the second starts to the right of the first,
    their y-extents have a ``line_iou`` above 0.9, the horizontal gap is
    below five average character widths, the second text contains a colon,
    and the first text has no colon and at most two characters. Both list
    slots are replaced with one new ``TextBox`` covering the pair.

    :param lt_text_list: list of objects exposing ``.bbox`` and
        ``get_text()``; slots are replaced in place.
    :param avg_char_width: average character width used for the gap limit.
    :param show: truthy to print every merged box.
    :return: new de-duplicated list sorted by ``(bbox[0], bbox[1])``.
    """
    from format_convert.convert_tree import TextBox

    def _segment(start, end):
        # 1-D extent in the two-point form used with line_iou here.
        return ((start, 0), (end, 0))

    for i, lt_text1 in enumerate(lt_text_list):
        box1 = lt_text1.bbox
        line1_x = _segment(box1[0], box1[2])
        line1_y = _segment(box1[1], box1[3])

        for j in range(i + 1, len(lt_text_list)):
            lt_text2 = lt_text_list[j]
            box2 = lt_text2.bbox
            # The second box has to start right of the first one's end.
            if box1[2] >= box2[0]:
                continue

            # No overlap on the x axis.
            if line_iou(line1_x, _segment(box2[0], box2[2])) > 0:
                continue

            # Same text line, close together, and looking like a split head.
            line2_y = _segment(box2[1], box2[3])
            if (line_iou(line1_y, line2_y) > 0.9
                    and abs(box1[2] - box2[0]) < avg_char_width * 5
                    and re.search('[:：]', lt_text2.get_text())
                    and not re.search('[:：]', lt_text1.get_text())
                    and len(lt_text1.get_text()) <= 2):
                joined_box = [box1[0], min(box1[1], box2[1]),
                              box2[2], max(box1[3], box2[3])]
                new_lt_text = TextBox(text=lt_text1.get_text() + lt_text2.get_text(),
                                      bbox=joined_box)
                lt_text_list[i] = new_lt_text
                lt_text_list[j] = new_lt_text
                if show:
                    print('new_lt_text', new_lt_text)

    # Both slots of a merged pair hold the same object; drop the duplicates.
    return sorted(set(lt_text_list), key=lambda x: (x.bbox[0], x.bbox[1]))


def sort_by_read_order(lt_text_list, threshold=10):
    """Order boxes top-to-bottom, then left-to-right within a line.

    After sorting by ``bbox[1]``, a box belongs to the current line when its
    ``bbox[1]`` differs from that of the box just before it by less than
    *threshold*; otherwise it opens a new line.

    :param lt_text_list: list of objects with a ``bbox`` attribute; sorted in
        place by ``bbox[1]``.
    :param threshold: maximum ``bbox[1]`` difference between neighbours of
        the same line (exclusive).
    :return: the input itself when it is empty, otherwise a new list.
    """
    if not lt_text_list:
        return lt_text_list

    lt_text_list.sort(key=lambda x: x.bbox[1])

    # Cut the y-sorted sequence into lines by comparing neighbours.
    lines = [[lt_text_list[0]]]
    for previous, current in zip(lt_text_list, lt_text_list[1:]):
        if abs(current.bbox[1] - previous.bbox[1]) < threshold:
            lines[-1].append(current)
        else:
            lines.append([current])

    # Order each line by x1 and flatten.
    ordered = []
    for line in lines:
        line.sort(key=lambda x: x.bbox[0])
        ordered += line
    return ordered


def delete_empty_bbox(lt_text_list, show=0):
    """Drop text boxes that carry no content.

    A box is dropped when its text is exactly one colon or semicolon (ASCII
    or full-width) or consists of whitespace only (including the empty
    string).

    :param lt_text_list: iterable of objects exposing ``get_text()``.
    :param show: unused; kept for a uniform signature.
    :return: new list with the remaining boxes in their original order.
    """
    punctuation_only = (':', "：", ";", "；")
    kept = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        # Raw string: a plain literal with this escape is an invalid escape
        # sequence and warns at compile time.
        if text in punctuation_only or re.sub(r'\s', '', text) == "":
            continue
        kept.append(lt_text)
    return kept


def standard_table(table, show=0):
    """Clean up a table laid out as ``head, value, head, value`` columns.

    Every row is a sequence of cell dicts with a ``'text'`` entry; the first
    four cells of each row are inspected. Cells are modified in place. Steps:

    1. strip the ``@@:`` placeholder from every cell text;
    2. a row shaped ``'', text, text, ''`` gets its second cell moved into the
       first one (a header whose colon was not recognised by OCR);
    3. a header-only row followed by a value-only row is folded into the
       value row and the header row is removed;
    4. a value-only row following a complete row is appended to that row's
       values and removed.

    :param table: list of rows of cell dicts.
    :param show: when truthy, print the rows fixed in steps 2 and 3.
    :return: the cleaned table as a new list (the input itself when empty).
    """
    if not table:
        return table

    def fits(*expected):
        # Each item is (row, column, must_have_text). Evaluated lazily and in
        # order, so checking stops at the first cell that does not fit.
        return all((cells[column].get('text') != '') == filled
                   for cells, column, filled in expected)

    # Step 1: remove placeholders.
    for cells in table:
        for cell in cells:
            if '@@:' in cell.get('text'):
                cell['text'] = re.sub('@@:', '', cell.get('text'))

    # Step 2: header without a recognised colon ended up in the value column.
    for cells in table:
        if fits((cells, 0, False), (cells, 1, True),
                (cells, 2, True), (cells, 3, False)):
            cells[0]['text'] = cells[1].get('text')
            cells[1]['text'] = ''
            if show:
                print('standard_table, add colon head', cells)

    # Step 3: headers and values are vertically staggered.
    # head          head
    #       value           value
    obsolete = set()
    for index in range(1, len(table)):
        above, cells = table[index - 1], table[index]
        if fits((above, 0, True), (above, 1, False),
                (cells, 0, False), (cells, 1, True),
                (above, 2, True), (above, 3, False),
                (cells, 2, False), (cells, 3, True)):
            # Copy the headers down; the header-only row is dropped afterwards.
            cells[0]['text'] = above[0].get('text')
            cells[2]['text'] = above[2].get('text')
            obsolete.add(index - 1)
            if show:
                print('standard_table, fix head value 1', cells)
    table = [cells for index, cells in enumerate(table) if index not in obsolete]

    # Step 4: a value continuation was not merged into the previous row.
    # head  value   head    value
    #       value           value
    obsolete = set()
    for index in range(1, len(table)):
        above, cells = table[index - 1], table[index]
        if fits((above, 0, True), (above, 1, True),
                (cells, 0, False), (cells, 1, True),
                (above, 2, True), (above, 3, True),
                (cells, 2, False), (cells, 3, True)):
            # Append the continuation to the values of the row above.
            above[1]['text'] += cells[1]['text']
            above[3]['text'] += cells[3]['text']
            obsolete.add(index)
    return [cells for index, cells in enumerate(table) if index not in obsolete]


@memory_decorator
def find_outline_lt_text(lt_text_list, show=0):
    """Return the text boxes that sit alone on their row.

    The input list is sorted IN PLACE by ``(bbox[1], bbox[0])``. Every box not
    yet assigned starts a row and collects each still-unassigned box whose
    vertical extent (``bbox[1]``..``bbox[3]``) gives ``line_iou(...) > 0``
    against it. Boxes of rows that end up with a single member are returned.

    :param lt_text_list: objects exposing a 4-item ``bbox``.
    :param show: when truthy, print the resulting list.
    :return: list of boxes that share their row with no other box.
    """

    def vertical_span(box):
        # The vertical extent expressed as a segment for line_iou.
        return [(box.bbox[1], 0), (box.bbox[3], 0)]

    lt_text_list.sort(key=lambda box: (box.bbox[1], box.bbox[0]))

    assigned = []
    rows = []
    for anchor in lt_text_list:
        if anchor in assigned:
            continue
        assigned.append(anchor)
        members = [anchor]
        for candidate in lt_text_list:
            if candidate in assigned:
                continue
            # Candidates are compared with the row's first box only.
            if line_iou(vertical_span(anchor), vertical_span(candidate)) > 0:
                members.append(candidate)
                assigned.append(candidate)
        rows.append(members)

    outline_lt_text_list = [box for members in rows if len(members) < 2 for box in members]

    if show:
        print('outline_lt_text_list', outline_lt_text_list)
    return outline_lt_text_list


def get_iou(bbox1, bbox2):
    """Return an overlap score for two ``(x1, y1, x2, y2)`` boxes.

    When one box completely contains the other the score is ``1.0``.
    Otherwise it is intersection area divided by union area, with widths and
    heights measured inclusively (``x2 - x1 + 1``). A zero union yields ``0``.
    """
    ax1, ay1, ax2, ay2 = bbox1
    bx1, by1, bx2, by2 = bbox2

    # Full containment, in either direction, counts as a perfect overlap.
    first_holds_second = ax1 <= bx1 and ay1 <= by1 and ax2 >= bx2 and ay2 >= by2
    second_holds_first = bx1 <= ax1 and by1 <= ay1 and bx2 >= ax2 and by2 >= ay2
    if first_holds_second or second_holds_first:
        return 1.0

    # Intersection rectangle, clamped to zero when the boxes do not overlap.
    overlap_width = max(0, min(ax2, bx2) - max(ax1, bx1) + 1)
    overlap_height = max(0, min(ay2, by2) - max(ay1, by1) + 1)
    overlap_area = overlap_width * overlap_height

    first_area = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    second_area = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    union_area = first_area + second_area - overlap_area

    if union_area == 0:
        return 0
    return overlap_area / union_area


def fix_cross_bbox(lt_text_list, show=0):
    """Pull apart text boxes whose bounding boxes overlap, editing them in place.

    Every ordered pair of non-equal boxes with ``get_iou(...) > 0`` is
    examined. If the second box starts strictly inside the first one on an
    axis, and the overlap on that axis is less than half of the larger of the
    two extents, the first box's far edge and the second box's near edge swap
    values so the boxes no longer overlap on that axis. Both boxes receive a
    fresh ``bbox`` list for every examined pair.

    :param lt_text_list: objects exposing a mutable 4-item ``bbox``.
    :param show: when truthy, print each pair before and after the fix.
    :return: the same list object.
    """
    for first in lt_text_list:
        for second in lt_text_list:
            if first == second:
                continue
            if not get_iou(first.bbox, second.bbox) > 0:
                continue

            if show:
                print('fix_cross_bbox1', first, second)
            left1, top1, right1, bottom1 = first.bbox
            left2, top2, right2, bottom2 = second.bbox

            # Overlap on the right side; a large overlap means the boxes do
            # not really cross along this axis, so it is left alone.
            if left1 < left2 < right1 \
                    and right1 - left2 < max(abs(right1 - left1), abs(left2 - right2)) / 2:
                right1, left2 = min(right1, left2), max(right1, left2)

            # Overlap at the bottom, same size restriction.
            if top1 < top2 < bottom1 \
                    and bottom1 - top2 < max(abs(bottom1 - top1), abs(top2 - bottom2)) / 2:
                bottom1, top2 = min(bottom1, top2), max(bottom1, top2)

            first.bbox = [left1, top1, right1, bottom1]
            second.bbox = [left2, top2, right2, bottom2]
            if show:
                print('fix_cross_bbox2', first, second)
    return lt_text_list


def _swap_lt_texts(lt_text_list, stale, fresh):
    """Remove every box of ``stale`` still in ``lt_text_list`` and append ``fresh`` (in place)."""
    for lt_text in stale:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += fresh


def _trim_edge_spaces(lt_text_list, text_box_cls):
    """Replace boxes whose text starts or ends with spaces by trimmed boxes with a narrowed bbox."""
    leading_spaces = '^[ \u3000]+'
    trailing_spaces = '[ \u3000]+$'

    fresh, stale = [], []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue
        text_width = get_char_unicode_length(text)
        # <= 0 rather than == 0: a negative width (wcswidth's -1 sentinel for
        # non-printable text) would turn the pixel ratio negative.
        if text_width <= 0:
            continue
        # Pixels per display cell of the current text.
        ratio = abs(bbox[2] - bbox[0]) / text_width

        leading = re.search(leading_spaces, text)
        if leading:
            leading_pixels = get_char_unicode_length(leading.group()) * ratio
            text = re.sub(leading_spaces, '', text)
            bbox = [bbox[0] + leading_pixels, bbox[1], bbox[2], bbox[3]]
            if len(text) == 0:
                continue
            text_width = get_char_unicode_length(text)
            if text_width <= 0:
                continue
            # Re-derive the ratio for the shortened text and narrowed box.
            ratio = abs(bbox[2] - bbox[0]) / text_width

        trailing = re.search(trailing_spaces, text)
        if trailing:
            trailing_pixels = get_char_unicode_length(trailing.group()) * ratio
            text = re.sub(trailing_spaces, '', text)
            bbox = [bbox[0], bbox[1], bbox[2] - trailing_pixels, bbox[3]]
            if len(text) == 0 or get_char_unicode_length(text) <= 0:
                continue

        if leading or trailing:
            fresh.append(text_box_cls(text=text, bbox=bbox))
            stale.append(lt_text)

    _swap_lt_texts(lt_text_list, stale, fresh)


def _join_spaced_label_pair(lt_text_list, text_box_cls, show=0):
    """Remove the inner spaces of two spaced-out labels separated by a wide gap.

    Targets texts such as ``X  Y：<gap>X  Y：``: on each side of the longest
    space run there must be one character, spaces, then one character plus a
    colon. The box keeps its bbox; only the text is rewritten.
    """
    any_spaces = '[ \u3000]+'
    colons = (':', '：')

    fresh, stale = [], []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        if len(text) == 0:
            continue

        space_runs = re.findall(any_spaces, text)
        if len(space_runs) < 2:
            continue

        # Stable sort by length: among equally long runs the last one wins.
        space_runs.sort(key=len)
        widest = space_runs[-1]
        gap = re.search(widest, text)
        if show:
            print('max_space', widest)
            print('space_list', space_runs)
        if not gap:
            continue

        left_parts = re.split(any_spaces, text[:gap.start()])
        right_parts = re.split(any_spaces, text[gap.end():])
        if all(len(parts) == 2 and len(parts[0]) == 1 and len(parts[1]) == 2
               and parts[1][-1] in colons
               for parts in (left_parts, right_parts)):
            joined = ''.join(left_parts) + widest + ''.join(right_parts)
            fresh.append(text_box_cls(text=joined, bbox=lt_text.bbox))
            stale.append(lt_text)

    if show:
        print('split_lt_text_by_many_space add_lt_text_list222', fresh)
        print('split_lt_text_by_many_space delete_lt_text_list222', stale)

    _swap_lt_texts(lt_text_list, stale, fresh)


def _split_on_wide_gap(lt_text_list, text_box_cls, show=0):
    """Split a box in two when its text is exactly two parts separated by 4+ spaces."""
    word_char = '[a-zA-Z0-9\u4e00-\u9fff]'

    fresh, stale = [], []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue
        text_width = get_char_unicode_length(text)
        # Same guard as above: never derive a ratio from a non-positive width.
        if text_width <= 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / text_width

        # One run of at least four spaces, and no other space run in the text.
        gap = re.search('[ \u3000]{4,}', text)
        if not gap or len(re.split('[ \u3000]+', text)) != 2:
            continue

        left_text = text[:gap.start()]
        right_text = text[gap.end():]
        # Both sides need at least two letters/digits/CJK characters.
        if len(re.findall(word_char, left_text)) < 2 \
                or len(re.findall(word_char, right_text)) < 2:
            continue

        left_pixels = ratio * get_char_unicode_length(left_text)
        right_pixels = ratio * get_char_unicode_length(right_text)

        # The left part is anchored at the left edge, the right part at the
        # right edge; the gap in between belongs to neither box.
        left_bbox = [bbox[0], bbox[1], bbox[0] + left_pixels, bbox[3]]
        right_bbox = [bbox[2] - right_pixels, bbox[1], bbox[2], bbox[3]]
        fresh += [text_box_cls(text=left_text, bbox=left_bbox),
                  text_box_cls(text=right_text, bbox=right_bbox)]
        stale.append(lt_text)

    _swap_lt_texts(lt_text_list, stale, fresh)

    if show:
        print('split_lt_text_by_many_space add_lt_text_list333', fresh)
        print('split_lt_text_by_many_space delete_lt_text_list333', stale)


def split_lt_text_by_many_space(lt_text_list, show=0):
    """Normalise space-padded text boxes; the list is modified in place.

    "Space" means the ASCII space or the ideographic space (U+3000). Widths
    are measured with ``get_char_unicode_length`` and converted to pixels
    using the box width. Three passes run in order:

    1. leading/trailing spaces are stripped and the bbox narrowed to match;
    2. two spaced-out labels separated by a wide gap lose their inner spaces;
    3. a text made of two parts separated by four or more spaces is split
       into two boxes.

    Replaced boxes are removed and their replacements (new ``TextBox``
    objects) are appended to the end of the list. Boxes whose measured width
    is not positive are left untouched.

    :param lt_text_list: objects exposing ``get_text()`` and a 4-item ``bbox``.
    :param show: when truthy, print debugging details of passes 2 and 3.
    :return: the same list object.
    """
    from format_convert.convert_tree import TextBox

    _trim_edge_spaces(lt_text_list, TextBox)
    _join_spaced_label_pair(lt_text_list, TextBox, show)
    _split_on_wide_gap(lt_text_list, TextBox, show)
    return lt_text_list


def get_char_unicode_length(text, show=0):
    """Return the display width of ``text`` in character cells.

    The width comes from ``wcwidth.wcswidth``. That function reports ``-1``
    as soon as the text contains a non-printable character; in that case the
    width is rebuilt character by character, counting such characters as
    zero, so the result is never negative and stays usable as a divisor
    guard (``== 0``) and as a length.

    :param text: the string to measure.
    :param show: when truthy, print the text and its width.
    :return: non-negative width as an int.
    """
    width = wcwidth.wcswidth(text)
    if width < 0:
        # wcwidth() yields -1 for a non-printable character; treat it as 0.
        width = sum(max(wcwidth.wcwidth(char), 0) for char in text)
    if show:
        print('text unicode_length', text, width)
    return width


def fix_final_row(table, show=0):
    """Fold a dangling last row into the row above it.

    A cell counts as blank when it equals ``''`` or the ``'@@:'`` placeholder.
    If the first four cells of the last row are blank except for exactly the
    4th (index 3) or exactly the 2nd (index 1), that cell overwrites the same
    column of the previous row (in place) and the last row is dropped.

    :param table: list of rows; the first four cells of the last row are
        compared directly with the blank markers.
    :param show: when truthy, print the rows being inspected and the fix.
    :return: ``table`` itself when nothing is removed, otherwise a new list
        without the last row.
    """
    if len(table) < 2:
        return table

    last_row = table[-2]
    final_row = table[-1]
    # Debug output is gated on ``show`` like everywhere else in this module.
    if show:
        print('final_row', final_row)
        print('last_row', last_row)

    blank = ('', '@@:')
    delete_final_flag = 0

    # Only the right-hand value is present.
    if final_row[0] in blank and final_row[1] in blank \
            and final_row[2] in blank and final_row[3] not in blank:
        last_row[3] = final_row[3]
        delete_final_flag = 1
        if show:
            print('fix_final_row right', last_row)

    # Only the left-hand value is present.
    if final_row[0] in blank and final_row[1] not in blank \
            and final_row[2] in blank and final_row[3] in blank:
        last_row[1] = final_row[1]
        delete_final_flag = 1
        if show:
            print('fix_final_row left', last_row)

    if delete_final_flag:
        table = table[:-1]

    return table


if __name__ == '__main__':
    # Manual check: read one saved image from a hard-coded local path,
    # resize it and run straight-line extraction on it.
    #
    # Disabled batch check over a folder of saved images:
    # from format_convert.convert_pdf import PDFConvert
    # pdf_c = PDFConvert(None, None, None)
    # from format_convert.convert_image import ImageProcess
    # img_p = ImageProcess(None, None)
    #
    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*')
    # image_np_list = [[x, cv2.imread(x)] for x in ps]
    # for p, image_np in image_np_list:
    #     # limit the overall resolution
    #     image_np = img_p.resize_process(image_np)
    #     # text recognition (OCR)
    #     text_list, box_list = img_p.ocr_process(image_np)
    #     # convert to lt_text_box
    #     _lt_text_list = text_bbox_to_lt(text_list, box_list)
    # pre-check from the bboxes whether a borderless table may be present
    # (translation of the original note; verify the intended meaning)
    # _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0)
    # print('path', p, 'has b table', _flag)

    sample_image_path = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png'
    sample_image = pil_resize(cv2.imread(sample_image_path), 1024, 768)
    get_straight_lines_from_image(sample_image)