12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079 |
- import os
- import sys
- sys.path.append(os.path.dirname(__file__) + "/../")
- import difflib
- import logging
- import mimetypes
- import platform
- import re
- import traceback
- import filetype
- from bs4 import BeautifulSoup
- from pdfminer.layout import *
def judge_error_code(_list, code=(0, -1, -2, -3, -4, -5, -6, -7, -8)):
    """Return True when ``_list`` equals ``[c]`` for one of the codes in ``code``.

    Meaning of the codes:
        [0]  : continue
        [-1] : logic/processing error
        [-2] : interface (API) call error
        [-3] : bad file format, the file cannot be opened
        [-4] : timeout while a third-party package was reading a file
        [-5] : the whole conversion timed out
        [-6] : Aliyun UDF queue timeout
        [-7] : the file needs a password and cannot be opened
        [-8] : error raised while calling a ready-made interface

    :param _list: value to test, typically the return value of a converter.
    :param code: iterable of codes to match against; an immutable tuple by
        default so the default can never be altered between calls.
    :return: True if ``_list`` is exactly a one-element list holding one of
        the codes, otherwise False.
    """
    return any(_list == [c] for c in code)
def add_div(text):
    """Wrap each line of ``text`` in its own ``<div>`` element.

    Empty strings and None are returned untouched, and so is any text that
    already contains a ``<div>`` tag.

    :param text: plain text whose lines are separated by newlines.
    :return: the text with every line wrapped as ``<div>line</div>``.
    """
    if text == "" or text is None:
        return text

    if get_platform() == "Windows":
        print("add_div", text)

    # Text that already carries div markup is left alone.
    if re.search("<div>", text) is not None:
        return text

    # A trailing newline is appended so the last line gets closed as well;
    # the substitution then leaves one dangling "<div>" at the very end.
    wrapped = re.sub("\n", "</div>\n<div>", "<div>" + text + "\n")
    if wrapped.endswith("<div>"):
        print("add_div has cut", wrapped[-30:])
        wrapped = wrapped[:-5]
    return wrapped
def get_platform():
    """Return the operating-system name reported by ``platform.system()``."""
    return platform.system()
def get_html_p(html_path):
    """Collect the text of every ``<p>`` element of an HTML file.

    :param html_path: path of the HTML file to read.
    :return: the stripped text of each paragraph, one per line (each followed
        by a newline), or ``[-1]`` if anything goes wrong (unreadable file,
        parser failure, ...).
    """
    logging.info("into get_html_p")
    try:
        with open(html_path, "r") as ff:
            html_str = ff.read()

        soup = BeautifulSoup(html_str, 'lxml')
        lines = []
        for p in soup.find_all("p"):
            p_text = p.text.strip()
            # NOTE(review): the check is on p.string, not on the stripped
            # text, so a paragraph whose stripped text is empty still appears
            # to contribute a blank line -- confirm this is intended.
            if p.string != "":
                lines.append(p_text + "\n")
        return "".join(lines)
    except Exception:
        logging.info("get_html_p error!")
        # traceback.print_exc() returns None, so printing its return value
        # only ever showed "None"; format_exc() yields the traceback text.
        print("get_html_p", traceback.format_exc())
        return [-1]
def string_similarity(str1, str2):
    """Score how alike two texts are, ignoring div tags and line breaks.

    :param str1: first text.
    :param str2: second text.
    :return: ``difflib.SequenceMatcher`` ratio of the cleaned texts
        (a float between 0 and 1).
    """

    def _strip_markup(value):
        # Remove <div>, </div> and newlines before comparing.
        for marker in ("<div>", "</div>", "\n"):
            value = re.sub(marker, "", value)
        return value

    matcher = difflib.SequenceMatcher(None, _strip_markup(str1), _strip_markup(str2))
    score = matcher.ratio()
    print("string_similarity", score)
    return score
def get_sequential_data(text_list, bbox_list, html=False):
    """Join text fragments in reading order based on their bounding boxes.

    Fragments are sorted top-to-bottom then left-to-right, grouped into rows
    whose vertical centres lie within 5 units of each other, and emitted one
    row per line (fragments of a multi-fragment row are each followed by a
    space).

    :param text_list: text fragments.
    :param bbox_list: one box per fragment, indexed like ``text_list``. Each
        box is a sequence of points; x is read from points 0 and 1, y from
        points 0 and -1.
        NOTE(review): presumably a 4-corner quadrilateral -- confirm.
    :param html: when True every line is wrapped in ``<div>...</div>``.
    :return: the assembled text, ``""`` when there are no fragments, or
        ``[-1]`` if an exception occurs.
    """
    logging.info("into get_sequential_data")
    try:
        text = ""
        order_list = []
        for i, fragment in enumerate(text_list):
            bbox = bbox_list[i]
            length_start = bbox[0][0]
            length_end = bbox[1][0]
            height_start = bbox[0][1]
            height_end = bbox[-1][1]
            order_list.append([fragment, length_start, length_end, height_start, height_end])

        if get_platform() == "Windows":
            print("get_sequential_data", order_list)
        if not order_list:
            if get_platform() == "Windows":
                print("get_sequential_data", "no order list")
            return ""

        # Sort the fragments by bbox position: top edge first, then left edge.
        order_list.sort(key=lambda x: (x[3], x[1]))

        # Group fragments into rows by comparing vertical centres.
        row_list = []
        used_box = []
        threshold = 5
        for box in order_list:
            if box in used_box:
                continue
            height_center = (box[4] + box[3]) / 2
            row = []
            for box2 in order_list:
                if box2 in used_box:
                    continue
                height_center2 = (box2[4] + box2[3]) / 2
                if height_center - threshold <= height_center2 <= height_center + threshold:
                    if box2 not in row:
                        row.append(box2)
                        used_box.append(box2)
            # Sorting by text first fixes the order of fragments that share
            # the same left edge when the row is re-sorted by x below.
            row.sort(key=lambda x: x[0])
            row_list.append(row)

        for row in row_list:
            if not row:
                continue
            if len(row) <= 1:
                text = text + row[0][0] + "\n"
            else:
                row.sort(key=lambda x: x[1])
                text += "".join(col[0] + " " for col in row) + "\n"

        if html:
            # Every newline closes one div and opens the next; because the
            # text ends with a newline the output ends with an empty div.
            text = "<div>" + text
            text = re.sub("\n", "</div>\n<div>", text)
            text += "</div>"
        return text
    except Exception:
        logging.info("get_sequential_data error!")
        # traceback.print_exc() returns None, so printing its return value
        # only ever showed "None"; format_exc() yields the traceback text.
        print("get_sequential_data", traceback.format_exc())
        return [-1]
- # def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
- # logging.info("into get_formatted_table")
- # try:
- # # 重新定义text_bbox_list,[point, point, text]
- # text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
- # range(len(text_bbox_list))]
- # # 按纵坐标排序
- # text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
- # table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
- #
- # # print("text_bbox_list", text_bbox_list)
- # # print("table_bbox_list", table_bbox_list)
- #
- # # bbox位置 threshold
- # threshold = 5
- #
- # # 根据split_line分区,可能有个区多个表格 [(), ()]
- # area_text_bbox_list = []
- # area_table_bbox_list = []
- # # print("get_formatted_table, split_line", split_line)
- # for j in range(1, len(split_line)):
- # last_y = split_line[j - 1][0][1]
- # current_y = split_line[j][0][1]
- # temp_text_bbox_list = []
- # temp_table_bbox_list = []
- #
- # # 找出该区域下text bbox
- # for text_bbox in text_bbox_list:
- # # 计算 text bbox 中心点
- # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
- # (text_bbox[1][1] + text_bbox[0][1]) / 2)
- # if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
- # temp_text_bbox_list.append(text_bbox)
- # area_text_bbox_list.append(temp_text_bbox_list)
- #
- # # 找出该区域下table bbox
- # for table_bbox in table_bbox_list:
- # # 计算 table bbox 中心点
- # table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
- # (table_bbox[1][1] + table_bbox[0][1]) / 2)
- # if last_y < table_bbox_center[1] < current_y:
- # temp_table_bbox_list.append(table_bbox)
- # area_table_bbox_list.append(temp_table_bbox_list)
- #
- # # for j in range(len(area_text_bbox_list)):
- # # print("area_text_bbox_list", j, area_text_bbox_list[j])
- #
- # # 对每个区域分别进行两个bbox匹配,生成表格
- # area_text_list = []
- # area_column_list = []
- # for j in range(len(area_text_bbox_list)):
- # # 每个区域的table bbox 和text bbox
- # temp_table_bbox_list = area_table_bbox_list[j]
- # temp_text_bbox_list = area_text_bbox_list[j]
- #
- # # 判断该区域有无表格bbox
- # # 若无表格,将该区域文字连接
- # if not temp_table_bbox_list:
- # # 找出该区域的所有text bbox
- # only_text_list = []
- # only_bbox_list = []
- # for text_bbox in temp_text_bbox_list:
- # only_text_list.append(text_bbox[2])
- # only_bbox_list.append([text_bbox[0], text_bbox[1]])
- # only_text = get_sequential_data(only_text_list, only_bbox_list, True)
- # if only_text == [-1]:
- # return [-1], [-1]
- # area_text_list.append(only_text)
- # area_column_list.append(0)
- # continue
- #
- # # 有表格
- # # 文本对应的表格格子
- # text_in_table = {}
- # for i in range(len(temp_text_bbox_list)):
- # text_bbox = temp_text_bbox_list[i]
- #
- # # 计算 text bbox 中心点
- # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
- # (text_bbox[1][1] + text_bbox[0][1]) / 2)
- #
- # # 判断中心点在哪个table bbox中
- # for table_bbox in temp_table_bbox_list:
- # # 中心点在table bbox中,将text写入字典
- # if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
- # table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
- # if str(table_bbox) in text_in_table.keys():
- # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
- # else:
- # text_in_table[str(table_bbox)] = text_bbox[2]
- # break
- #
- # # 如果未找到text bbox匹配的table bbox,加大threshold匹配
- # # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
- # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
- # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
- # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
- # # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
- # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
- # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
- # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
- # # if str(table_bbox) in text_in_table.keys():
- # # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
- # # else:
- # # text_in_table[str(table_bbox)] = text_bbox[2]
- # # break
- #
- # # 对表格格子进行分行分列,并计算总计多少小列
- # # 放入坐标
- # all_col_list = []
- # all_row_list = []
- # for i in range(len(temp_table_bbox_list)):
- # table_bbox = temp_table_bbox_list[i]
- #
- # # 放入所有坐标x
- # if table_bbox[0][0] not in all_col_list:
- # all_col_list.append(table_bbox[0][0])
- # if table_bbox[1][0] not in all_col_list:
- # all_col_list.append(table_bbox[1][0])
- #
- # # 放入所有坐标y
- # if table_bbox[0][1] not in all_row_list:
- # all_row_list.append(table_bbox[0][1])
- # if table_bbox[1][1] not in all_row_list:
- # all_row_list.append(table_bbox[1][1])
- # all_col_list.sort(key=lambda x: x)
- # all_row_list.sort(key=lambda x: x)
- #
- # # 分行
- # row_list = []
- # rows = []
- # temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
- # y_row = temp_table_bbox_list[0][0][1]
- # for i in range(len(temp_table_bbox_list)):
- # table_bbox = temp_table_bbox_list[i]
- #
- # if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
- # rows.append(table_bbox)
- # else:
- # y_row = table_bbox[0][1]
- # if rows:
- # rows.sort(key=lambda x: x[0][0])
- # row_list.append(rows)
- # rows = []
- # rows.append(table_bbox)
- # # print("*" * 30)
- # # print(row_list)
- #
- # if i == len(temp_table_bbox_list) - 1:
- # if rows:
- # rows.sort(key=lambda x: x[0][0])
- # row_list.append(rows)
- #
- # # 生成表格,包括文字和格子宽度
- # area_column = []
- # text = '<table border="1">' + "\n"
- # for row in row_list:
- # text += "<tr>" + "\n"
- # for col in row:
- # # 计算bbox y坐标之间有多少其他点,+1即为所占行数
- # row_span = 1
- # for y in all_row_list:
- # if col[0][1] < y < col[1][1]:
- # if y - col[0][1] >= 2 and col[1][1] - y >= 2:
- # row_span += 1
- #
- # # 计算bbox x坐标之间有多少其他点,+1即为所占列数
- # col_span = 1
- # for x in all_col_list:
- # if col[0][0] < x < col[1][0]:
- # if x - col[0][0] >= 2 and col[1][0] - x >= 2:
- # col_span += 1
- #
- # text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
- #
- # if str(col) in text_in_table.keys():
- # text += text_in_table.get(str(col))
- # else:
- # text += ""
- # text += "</td>" + "\n"
- # text += "</tr>" + "\n"
- # text += "</table>" + "\n"
- #
- # # 计算最大column
- # max_col_num = 0
- # for row in row_list:
- # col_num = 0
- # for col in row:
- # col_num += 1
- # if max_col_num < col_num:
- # max_col_num = col_num
- #
- # area_text_list.append(text)
- # area_column_list.append(max_col_num)
- #
- # text = ""
- # if get_platform() == "Windows":
- # print("get_formatted_table area_text_list", area_text_list)
- # for area_text in area_text_list:
- # text += area_text
- # return text, area_column_list
- # except Exception as e:
- # logging.info("get_formatted_table error!")
- # print("get_formatted_table", traceback.print_exc())
- # return [-1], [-1]
def rename_inner_files(root_path):
    """Rename every file and directory below ``root_path`` to a hash name.

    The last component of each entry is replaced by ``str(hash(name))``;
    files keep the extension of their own name. Entries are processed from
    the longest relative path to the shortest, so children are renamed
    before the directories that contain them.

    NOTE(review): ``hash()`` of a str is normally salted per interpreter
    process, so the generated names are presumably not stable across runs.

    :param root_path: directory to process. It is concatenated directly with
        relative paths, so it is expected to end with a path separator.
    :return: list of all paths (directories end with ``os.sep``) found below
        ``root_path`` after renaming, or ``[-1]`` if an exception occurs.
    """
    try:
        logging.info("into rename_inner_files")
        is_windows = get_platform() == "Windows"
        if is_windows:
            # os.walk joins with backslashes on Windows; normalise the root
            # the same way so the prefix comparison below is reliable.
            root_path = slash_replace(root_path, True)

        def _relative(full_path):
            # Strip the root as a literal prefix. Using the root as a regex
            # pattern breaks on paths containing regex metacharacters and
            # removes repeated occurrences inside the path as well.
            if is_windows:
                full_path = slash_replace(full_path, True)
            if full_path.startswith(root_path):
                return full_path[len(root_path):]
            return full_path

        # Collect every file and directory below the root, relative to it.
        path_list = []
        for root, dirs, files in os.walk(root_path, topdown=False):
            for name in dirs:
                path_list.append(_relative(os.path.join(root, name) + os.sep))
            for name in files:
                path_list.append(_relative(os.path.join(root, name)))

        # Longest paths first, so nested entries are renamed before parents.
        path_list.sort(key=len, reverse=True)

        for old_path in path_list:
            # Split on the path separator.
            ss = old_path.split(os.sep)
            file_type = ""
            if os.path.isdir(root_path + old_path):
                # Directories end with a separator: drop the empty tail.
                ss = ss[:-1]
            else:
                # Take the extension from the file name only; a dot in a
                # parent directory name must not leak into the new name.
                base_name = ss[-1]
                if "." in base_name:
                    file_type = "." + base_name.split(".")[-1]

            # Only the last level is replaced by its hash.
            new_path = ""
            last_level = len(ss) - 1
            for level, s in enumerate(ss):
                if level < last_level:
                    new_path += s + os.sep
                else:
                    new_path += str(hash(s)) + file_type

            os.rename(root_path + old_path, root_path + new_path)

        # Walk again to report the renamed tree.
        new_path_list = []
        for root, dirs, files in os.walk(root_path, topdown=False):
            for name in dirs:
                new_path_list.append(os.path.join(root, name) + os.sep)
            for name in files:
                new_path_list.append(os.path.join(root, name))
        return new_path_list
    except Exception:
        traceback.print_exc()
        return [-1]
def judge_format(path):
    """Guess the short format name of the file at ``path``.

    The MIME type is first guessed by ``mimetypes.guess_type``; only when
    that yields nothing is ``filetype.guess`` consulted.

    :param path: file path.
    :return: one of "pdf", "docx", "zip", "rar", "xlsx", "doc", "png",
        "jpg", or None when the type cannot be guessed or is not supported.
    """
    mime_to_format = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/x-zip-compressed": "zip",
        "application/zip": "zip",
        "application/x-rar-compressed": "rar",
        "application/rar": "rar",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
        "application/msword": "doc",
        "image/png": "png",
        "image/jpeg": "jpg",
    }

    mime = mimetypes.guess_type(path)[0]
    if not mime:
        mime = None
        sniffed = filetype.guess(path)
        if sniffed:
            mime = sniffed.mime

    # Unknown or unsupported types fall through to None.
    return mime_to_format.get(mime)
def slash_replace(_str, reverse=False):
    """Convert between backslash and forward-slash path separators.

    The conversion is a plain replace; the earlier ``eval(repr(...))`` round
    trip produced the same result for strings but evaluated generated source
    code for no benefit.

    :param _str: path as ``str`` (or ``bytes``). Values of any other type
        are returned unchanged.
    :param reverse: False turns every backslash into ``/``; True turns every
        ``/`` into a backslash.
    :return: the converted path.
    """
    if isinstance(_str, str):
        if reverse:
            return _str.replace("/", "\\")
        return _str.replace("\\", "/")
    if isinstance(_str, bytes):
        if reverse:
            return _str.replace(b"/", b"\\")
        return _str.replace(b"\\", b"/")
    return _str
class LineTable:
    """Build table structures from ruling lines or rectangles plus text boxes.

    Lines and rectangles are read through a 4-tuple ``bbox`` interpreted as
    ``(x0, y0, x1, y1)``; text boxes additionally provide ``get_text()``.
    NOTE(review): these look like pdfminer layout objects -- confirm with
    the callers.

    A table is returned as ``{"bbox": (x0, y0, x1, y1), "table": rows}``
    where ``rows`` is a list of rows and every cell is a dict with the keys
    ``bbox``, ``rect``, ``rowspan``, ``columnspan`` and ``text``.
    """

    def recognize_table(self, list_textbox, list_line):
        """Detect tables formed by ``list_line`` and fill them with text.

        :param list_textbox: text boxes to assign to cells (sorted in place).
        :param list_line: line objects whose instance ``bbox`` is used.
        :return: ``(list_tables, in_objs, list_l_rect)`` -- the tables, the
            set of text boxes that were placed into a cell, and the list of
            rectangle lists (one per group of connected lines).
        """
        self.list_line = list_line
        self.list_crosspoints = self.recognize_crosspoints(list_line)

        cluster_crosspoints = self._cluster_crosspoints(self.list_crosspoints)

        list_l_rect = []
        for table_crosspoint in cluster_crosspoints:
            list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
            list_l_rect.append(list_rect)

        in_objs = set()
        list_tables = []
        for l_rect in list_l_rect:
            _ta = self.rect2table(list_textbox, l_rect, in_objs)
            if _ta:
                list_tables.append(_ta)

        # NOTE(review): this opens a matplotlib window on every call and
        # looks like leftover debugging -- confirm before relying on it.
        self._plot(list_line, list_textbox)
        return list_tables, in_objs, list_l_rect

    def _cluster_crosspoints(self, list_crosspoints):
        """Group crossing points that are connected through shared lines.

        Every point starts as its own cluster; clusters whose ``lines`` sets
        intersect are merged repeatedly until a pass merges nothing.

        :param list_crosspoints: point dicts as produced by ``cross_point``.
        :return: list of ``{"lines": set, "points": list}`` clusters.
        """
        cluster_crosspoints = []
        for _point in list_crosspoints:
            cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})

        while 1:
            _find = False
            new_cluster_crosspoints = []
            for l_point in cluster_crosspoints:
                _flag = False
                # No break here: a cluster is merged into every existing
                # cluster it shares a line with.
                for l_n_point in new_cluster_crosspoints:
                    line1 = l_point.get("lines")
                    line2 = l_n_point.get("lines")
                    if len(line1 & line2) > 0:
                        _find = True
                        _flag = True
                        l_n_point["lines"] = line1.union(line2)
                        l_n_point["points"].extend(l_point["points"])
                if not _flag:
                    new_cluster_crosspoints.append({"lines": l_point.get("lines"),
                                                    "points": l_point.get("points")})
            cluster_crosspoints = new_cluster_crosspoints
            if not _find:
                break
        return cluster_crosspoints

    def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
        """Detect tables from rectangles that touch each other.

        :param list_textbox: text boxes to assign to cells (sorted in place).
        :param list_rect: rectangle objects exposing ``bbox``.
        :param margin: tolerance used when testing whether two rectangles
            are adjacent along the x or the y axis.
        :return: ``(list_tables, in_objs, list_l_rect)`` as in
            ``recognize_table``.
        """
        dump_margin = 5
        list_rect_tmp = []
        # De-duplicate: skip small rectangles and rectangles whose four
        # coordinates are all within dump_margin of one already kept.
        # NOTE(review): list_rect_tmp is never used afterwards -- the
        # clustering below iterates the original list_rect. Confirm whether
        # the de-duplicated list was meant to be used.
        for _rect in list_rect:
            if (_rect.bbox[3] - _rect.bbox[1] < 10) or (abs(_rect.bbox[2] - _rect.bbox[0]) < 5):
                continue
            _find = any(
                all(abs(_rect.bbox[i] - _tmp.bbox[i]) < dump_margin for i in range(4))
                for _tmp in list_rect_tmp
            )
            if not _find:
                list_rect_tmp.append(_rect)

        def _adjacent(box_a, box_b, low, high):
            # Two extents touch when the sum of their lengths equals the
            # length of the span covering both (within margin).
            total = box_a[high] - box_a[low] + box_b[high] - box_b[low]
            span = max(box_a[high], box_b[high]) - min(box_a[low], box_b[low])
            return abs(total - span) < margin

        cluster_rect = []
        for _rect in list_rect:
            _find = False
            for cr in cluster_rect:
                for cr_rect in cr:
                    if _adjacent(cr_rect.bbox, _rect.bbox, 0, 2) or _adjacent(cr_rect.bbox, _rect.bbox, 1, 3):
                        _find = True
                        cr.append(_rect)
                        break
                if _find:
                    break
            if not _find:
                cluster_rect.append([_rect])

        list_l_rect = cluster_rect
        in_objs = set()
        list_tables = []
        for l_rect in list_l_rect:
            _ta = self.rect2table(list_textbox, l_rect, in_objs)
            if _ta:
                list_tables.append(_ta)
        return list_tables, in_objs, list_l_rect

    def recognize_crosspoints(self, list_line):
        """Return the crossing points of every ordered pair of lines.

        :param list_line: objects whose instance ``__dict__`` holds ``bbox``.
        :return: list of point dicts (second element of ``cross_point``) for
            the pairs that cross.
        """
        list_crosspoints = []
        # Every ordered pair is tried, including (j, i) after (i, j);
        # cross_point only reports some orientations for one of the orders.
        for _i in range(len(list_line)):
            for _j in range(len(list_line)):
                line1 = list_line[_i].__dict__.get("bbox")
                line2 = list_line[_j].__dict__.get("bbox")
                exists, point = self.cross_point(line1, line2)
                if exists:
                    list_crosspoints.append(point)
        return list_crosspoints

    def recognize_rect(self, _page):
        """Build rectangle lists from the ``LTLine`` objects of a page.

        :param _page: object whose ``_objs`` attribute is iterated.
        :return: list of rectangle lists, one per group of connected lines.
        """
        list_line = []
        for _obj in _page._objs:
            if isinstance(_obj, (LTLine)):
                list_line.append(_obj)
        list_crosspoints = self.recognize_crosspoints(list_line)

        cluster_crosspoints = self._cluster_crosspoints(list_crosspoints)

        list_l_rect = []
        for table_crosspoint in cluster_crosspoints:
            list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
            list_l_rect.append(list_rect)
        return list_l_rect

    def crosspoint2rect(self, list_crosspoint, margin=4):
        """Turn the crossing points of one cluster into ``LTRect`` cells.

        For every point that has room below and to the right (``buttom`` and
        ``right`` >= margin) the next point along its row line and then the
        next point along that point's column line are looked up; the first
        and the last point become two corners of a rectangle.

        :param list_crosspoint: point dicts of one cluster.
        :param margin: minimum ``buttom``/``right``/``left`` distance for a
            point to be usable as a corner.
        :return: list of ``LTRect`` objects.
        """
        dict_line_points = {}
        for _point in list_crosspoint:
            lines = list(_point.get("lines"))
            for _line in lines:
                if _line not in dict_line_points:
                    dict_line_points[_line] = {"direct": None, "points": []}
                dict_line_points[_line]["points"].append(_point)

        # Sort the points of each line along its dominant direction.
        for v in dict_line_points.values():
            list_x = []
            list_y = []
            for _p in v["points"]:
                list_x.append(_p.get("point")[0])
                list_y.append(_p.get("point")[1])
            if max(list_x) - min(list_x) > max(list_y) - min(list_y):
                v.get("points").sort(key=lambda x: x.get("point")[0])
                v["direct"] = "row"
            else:
                v.get("points").sort(key=lambda x: x.get("point")[1])
                v["direct"] = "column"

        list_rect = []
        for _point in list_crosspoint:
            if _point["buttom"] >= margin and _point["right"] >= margin:
                # Pick the row line through this point.
                lines = list(_point.get("lines"))
                _line = lines[0]
                if dict_line_points[_line]["direct"] == "column":
                    _line = lines[1]
                next_point = None
                for p1 in dict_line_points[_line]["points"]:
                    if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]:
                        next_point = p1
                        break
                if not next_point:
                    continue

                # Pick the column line through the neighbouring point.
                lines = list(next_point.get("lines"))
                _line = lines[0]
                if dict_line_points[_line]["direct"] == "row":
                    _line = lines[1]
                final_point = None
                for p1 in dict_line_points[_line]["points"]:
                    if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]:
                        final_point = p1
                        break
                if not final_point:
                    continue

                _r = LTRect(1, (_point["point"][0], _point["point"][1],
                                final_point["point"][0], final_point["point"][1]))
                list_rect.append(_r)
        return list_rect

    def cross_point(self, line1, line2, segment=True, margin=2):
        """Compute the crossing point of two lines given as ``(x1, y1, x2, y2)``.

        :param line1: first line.
        :param line2: second line.
        :param segment: when True the point must lie within both segments'
            bounding ranges (extended by ``margin``) to count.
        :param margin: tolerance for the segment range test.
        :return: ``(exists, info)`` where ``info`` holds ``point`` ([x, y]),
            ``left``/``right``/``top``/``buttom`` (distances from the point
            to min(x1, x3), max(x2, x4), min(y1, y3) and max(y2, y4); only
            filled in when ``segment`` is True and the point qualifies) and
            ``lines`` (a set with one string key per line).
        """
        point_is_exist = False
        x = y = 0
        x1, y1, x2, y2 = line1
        x3, y3, x4, y4 = line2

        # Slope/intercept of each line; k is None for a vertical line.
        if (x2 - x1) == 0:
            k1 = None
            b1 = 0
        else:
            k1 = (y2 - y1) * 1.0 / (x2 - x1)
            b1 = y1 * 1.0 - x1 * k1 * 1.0

        if (x4 - x3) == 0:
            k2 = None
            b2 = 0
        else:
            k2 = (y4 - y3) * 1.0 / (x4 - x3)
            b2 = y3 * 1.0 - x3 * k2 * 1.0

        if k1 is None:
            if k2 is not None:
                x = x1
                y = k2 * x1 + b2
                point_is_exist = True
        elif k2 is None:
            # NOTE(review): the coordinates are computed but the point is
            # not flagged as existing. recognize_crosspoints tries both
            # argument orders, so such a crossing is still reported once by
            # the call where the vertical line comes first. Flagging it here
            # too would report it twice.
            x = x3
            y = k1 * x3 + b1
        elif not k2 == k1:
            x = (b2 - b1) * 1.0 / (k1 - k2)
            y = k1 * x * 1.0 + b1 * 1.0
            point_is_exist = True

        left = 0
        right = 0
        top = 0
        buttom = 0
        if point_is_exist:
            if segment:
                on_line1 = (min(x1, x2) - margin) <= x <= (max(x1, x2) + margin) and \
                           (min(y1, y2) - margin) <= y <= (max(y1, y2) + margin)
                on_line2 = on_line1 and \
                    (min(x3, x4) - margin) <= x <= (max(x3, x4) + margin) and \
                    (min(y3, y4) - margin) <= y <= (max(y3, y4) + margin)
                if on_line2:
                    point_is_exist = True
                    left = abs(min(x1, x3) - x)
                    right = abs(max(x2, x4) - x)
                    top = abs(min(y1, y3) - y)
                    buttom = abs(max(y2, y4) - y)
                else:
                    point_is_exist = False

        line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
        line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
        return point_is_exist, {"point": [x, y], "left": left, "right": right,
                                "top": top, "buttom": buttom,
                                "lines": set([line1_key, line2_key])}

    def unionTable(self, list_table, fixspan=True, margin=2):
        """Merge the cells of several tables into one table.

        :param list_table: iterable of tables, each an iterable of rows of
            cell dicts (cells are de-duplicated by object identity).
        :param fixspan: when True, cells spanning several columns/rows are
            repeated so that every cell ends up with span 1.
        :param margin: tolerance passed to ``getspan``.
        :return: ``{"bbox": ..., "table": rows}`` or None when there are no
            cells.
        """
        set_x = set()
        set_y = set()

        list_cell = []
        for _t in list_table:
            for _line in _t:
                list_cell.extend(_line)

        # Keep each cell object once.
        set_id = set()
        list_cell_dump = []
        for _cell in list_cell:
            _id = id(_cell)
            if _id in set_id:
                continue
            set_id.add(_id)
            list_cell_dump.append(_cell)
        list_cell = list_cell_dump

        # Cluster the cells into rows by bbox[3].
        clusters_rects = []
        list_cell.sort(key=lambda x: x.get("bbox")[3])
        for _rect in list_cell:
            _y0 = _rect.get("bbox")[3]
            _find = False
            for l_cr in clusters_rects:
                if abs(l_cr[0].get("bbox")[3] - _y0) < 2:
                    _find = True
                    l_cr.append(_rect)
                    break
            if not _find:
                clusters_rects.append([_rect])

        clusters_rects.sort(key=lambda x: x[0].get("bbox")[3], reverse=True)
        for l_cr in clusters_rects:
            l_cr.sort(key=lambda x: x.get("bbox")[0])

        print("=============:")
        for l_r in clusters_rects:
            print(len(l_r))

        for _line in clusters_rects:
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.get("bbox")
                set_x.add(x0)
                set_x.add(x1)
                set_y.add(y0)
                set_y.add(y1)
        if len(set_x) == 0 or len(set_y) == 0:
            return

        list_x = sorted(set_x)
        list_y = sorted(set_y, reverse=True)

        _table = []
        for _line in clusters_rects:
            table_line = []
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.get("bbox")
                _cell = {"bbox": (x0, y0, x1, y1),
                         "rect": _rect.get("rect"),
                         "rowspan": self.getspan(list_y, y0, y1, margin),
                         "columnspan": self.getspan(list_x, x0, x1, margin),
                         "text": _rect.get("text", "")}
                table_line.append(_cell)
            _table.append(table_line)

        if fixspan:
            # Repeat column-spanning cells within their row. The index range
            # is fixed before the insertions start.
            for _line in _table:
                for c_i in range(len(_line)):
                    _cell = _line[c_i]
                    if _cell.get("columnspan") > 1:
                        _cospan = _cell.get("columnspan")
                        _cell["columnspan"] = 1
                        for i in range(1, _cospan):
                            _line.insert(c_i, _cell)
            # Repeat row-spanning cells into the following rows.
            # NOTE(review): unlike rect2table there is no bounds check on
            # l_i + i here -- confirm that spans never exceed the table.
            for l_i in range(len(_table)):
                _line = _table[l_i]
                for c_i in range(len(_line)):
                    _cell = _line[c_i]
                    if _cell.get("rowspan") > 1:
                        _rospan = _cell.get("rowspan")
                        _cell["rowspan"] = 1
                        for i in range(1, _rospan):
                            _table[l_i + i].insert(c_i, _cell)

        table_bbox = (_table[0][0].get("bbox")[0],
                      _table[0][0].get("bbox")[1],
                      _table[-1][-1].get("bbox")[2],
                      _table[-1][-1].get("bbox")[3])
        ta = {"bbox": table_bbox, "table": _table}
        return ta

    @staticmethod
    def _drop_close_coords(values):
        """Return ``values`` without entries within 2 of their predecessor."""
        return [v for idx, v in enumerate(values)
                if idx == 0 or not abs(v - values[idx - 1]) < 2]

    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True):
        """Arrange rectangles into a table and assign text boxes to cells.

        :param list_textbox: text boxes exposing ``bbox`` and ``get_text()``
            (sorted in place).
        :param list_rect: rectangles exposing ``bbox`` (sorted in place).
        :param in_objs: set that receives every text box placed into a cell.
        :param margin: tolerance passed to ``getspan``.
        :param fixspan: when True, cells spanning several columns/rows are
            repeated so that every cell ends up with span 1.
        :return: ``{"bbox": ..., "table": rows}`` or None when ``list_rect``
            is empty.
        """
        _table = []
        set_x = set()
        set_y = set()

        # Cluster the rectangles into rows by bbox[3].
        clusters_rects = []
        list_rect.sort(key=lambda x: x.bbox[3])
        for _rect in list_rect:
            _y0 = _rect.bbox[3]
            _find = False
            for l_cr in clusters_rects:
                if abs(l_cr[0].bbox[3] - _y0) < 2:
                    _find = True
                    l_cr.append(_rect)
                    break
            if not _find:
                clusters_rects.append([_rect])

        print("clusters_rects", len(clusters_rects))

        clusters_rects.sort(key=lambda x: x[0].bbox[3], reverse=True)
        for l_cr in clusters_rects:
            l_cr.sort(key=lambda x: x.bbox[0])

        # Collect the grid coordinates used to calculate the spans.
        for _line in clusters_rects:
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                set_x.add(x0)
                set_x.add(x1)
                set_y.add(y0)
                set_y.add(y1)
        if len(set_x) == 0 or len(set_y) == 0:
            return

        # Near-identical coordinates would otherwise inflate the spans.
        list_x = self._drop_close_coords(sorted(set_x))
        list_y = self._drop_close_coords(sorted(set_y, reverse=True))

        for _line in clusters_rects:
            table_line = []
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                _cell = {"bbox": (x0, y0, x1, y1),
                         "rect": _rect,
                         "rowspan": self.getspan(list_y, y0, y1, margin),
                         "columnspan": self.getspan(list_x, x0, x1, margin),
                         "text": ""}
                table_line.append(_cell)
            _table.append(table_line)

        # Two stable sorts: primary key bbox[3] descending, then bbox[0].
        list_textbox.sort(key=lambda x: x.bbox[0])
        list_textbox.sort(key=lambda x: x.bbox[3], reverse=True)
        for textbox in list_textbox:
            _text = textbox.get_text()
            print("textbox", _text, textbox.bbox)
            _find = False
            # A text box goes into the first cell it overlaps enough.
            for table_line in _table:
                for _cell in table_line:
                    if self.inbox(textbox.bbox, _cell["bbox"]):
                        _cell["text"] += _text
                        in_objs.add(textbox)
                        _find = True
                        break
                if _find:
                    break

        if fixspan:
            # Repeat column-spanning cells within their row. The index range
            # is fixed before the insertions start.
            for _line in _table:
                for c_i in range(len(_line)):
                    _cell = _line[c_i]
                    if _cell.get("columnspan") > 1:
                        _cospan = _cell.get("columnspan")
                        _cell["columnspan"] = 1
                        for i in range(1, _cospan):
                            _line.insert(c_i, _cell)
            # Repeat row-spanning cells into the following rows.
            for l_i in range(len(_table)):
                _line = _table[l_i]
                for c_i in range(len(_line)):
                    _cell = _line[c_i]
                    if _cell.get("rowspan") > 1:
                        _rospan = _cell.get("rowspan")
                        _cell["rowspan"] = 1
                        for i in range(1, _rospan):
                            # NOTE(review): "< len(_table) - 1" also excludes
                            # the last row -- confirm this is intended.
                            if l_i + i < len(_table) - 1:
                                print(len(_table), l_i + i)
                                _table[l_i + i].insert(c_i, _cell)

        table_bbox = (_table[0][0].get("bbox")[0],
                      _table[0][0].get("bbox")[1],
                      _table[-1][-1].get("bbox")[2],
                      _table[-1][-1].get("bbox")[3])
        ta = {"bbox": table_bbox, "table": _table}
        return ta

    def inbox(self, bbox0, bbox_g):
        """Return 1 when ``getIOU(bbox0, bbox_g)`` exceeds 0.5, else 0."""
        if self.getIOU(bbox0, bbox_g) > 0.5:
            return 1
        return 0

    def getIOU(self, bbox0, bbox1):
        """Return the overlap of two boxes relative to the smaller box.

        Despite the name this is not intersection-over-union: the overlap
        area is divided by the smaller of the two box areas.

        :param bbox0: ``(x0, y0, x1, y1)``.
        :param bbox1: ``(x0, y0, x1, y1)``.
        :return: the ratio, or 0 when the boxes do not overlap on both axes.
        """
        # Covering span minus the sum of the extents: negative on overlap.
        width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
        height = max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1]) - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1])
        print("getIOU", width, height)
        if width < 0 and height < 0:
            iou = abs(width * height / min(abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1])),
                                           abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))))
            print("getIOU", iou)
            return iou
        return 0

    def getspan(self, _list, x0, x1, margin):
        """Count how many grid intervals the range ``[x0, x1]`` covers.

        :param _list: grid coordinates.
        :param x0: one end of the range.
        :param x1: other end of the range (order does not matter).
        :param margin: tolerance added on both ends of the range.
        :return: number of coordinates inside the range, minus one.
        """
        (x0, x1) = (min(x0, x1), max(x0, x1))
        _count = sum(1 for _x in _list if (x0 - margin) <= _x <= (x1 + margin))
        return _count - 1

    def _plot(self, list_line, list_textbox):
        """Show lines and text-box diagonals in a matplotlib window."""
        from matplotlib import pyplot as plt
        plt.figure()
        for _line in list_line:
            x0, y0, x1, y1 = _line.__dict__.get("bbox")
            plt.plot([x0, x1], [y0, y1])
        # The lines are drawn a second time through the bbox attribute.
        for _line in list_line:
            x0, y0, x1, y1 = _line.bbox
            plt.plot([x0, x1], [y0, y1])
        for textbox in list_textbox:
            x0, y0, x1, y1 = textbox.bbox
            plt.plot([x0, x1], [y0, y1])
        plt.show()
def get_table_html(table):
    """Render a table (rows of cell dicts) as an HTML ``<table>`` string.

    :param table: iterable of rows; every cell is a dict providing
        ``rowspan``, ``columnspan`` and ``text``.
    :return: the HTML markup, one tag per line.
    """
    pieces = ['<table border="1">' + "\n"]
    for row in table:
        pieces.append("<tr>" + "\n")
        for cell in row:
            n_rows = cell.get("rowspan")
            n_cols = cell.get("columnspan")
            content = cell.get("text")
            pieces.append("<td colspan=" + str(n_cols) + " rowspan=" + str(n_rows) + ">")
            pieces.append(content + "</td>" + "\n")
        pieces.append("</tr>" + "\n")
    pieces.append("</table>" + "\n")
    return "".join(pieces)
def sort_object(obj_list):
    """Sort a homogeneous list of conversion-tree objects in place.

    The type of the first element decides the order: tables, sentences and
    images are ordered by ``y`` descending, pages by ``page_no`` ascending.
    Lists of any other type, and empty lists, are returned as they are.

    :param obj_list: list of objects to sort.
    :return: the same list object.
    """
    from format_convert.convert_tree import _Table, _Image, _Sentence, _Page

    if len(obj_list) == 0:
        return obj_list

    first = obj_list[0]
    if isinstance(first, (_Table, _Sentence, _Image)):
        obj_list.sort(key=lambda item: item.y, reverse=True)
    elif isinstance(first, _Page):
        obj_list.sort(key=lambda item: item.page_no)
    return obj_list
if __name__ == "__main__":
    # Manual check: print a Windows path converted to forward slashes.
    sample_path = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
    converted = slash_replace(sample_path)
    print(converted)
|