", text): return text text = "

" + text + "\n" text = re.sub("\n", "

", text) # text += "

" if text[-5:] == "

": print("add_div has cut", text[-30:]) text = text[:-5] return text def get_platform(): sys = platform.system() return sys def get_html_p(html_path): logging.info("into get_html_p") try: with open(html_path, "r") as ff: html_str = ff.read() soup = BeautifulSoup(html_str, 'lxml') text = "" for p in soup.find_all("p"): p_text = p.text p_text = p_text.strip() if p.string != "": text += p_text text += "\n" return text except Exception as e: logging.info("get_html_p error!") print("get_html_p", traceback.print_exc()) return [-1] def string_similarity(str1, str2): # 去掉

和回车 str1 = re.sub("

", "", str1) str1 = re.sub("

", "", str1) str1 = re.sub("\n", "", str1) str2 = re.sub("

", "", str2) str2 = re.sub("

", "", str2) str2 = re.sub("\n", "", str2) # print("********************************") # print("str1", str1) # print("********************************") # print("str2", str2) # print("********************************") score = difflib.SequenceMatcher(None, str1, str2).ratio() print("string_similarity", score) return score def get_sequential_data(text_list, bbox_list, html=False): logging.info("into get_sequential_data") try: text = "" order_list = [] for i in range(len(text_list)): length_start = bbox_list[i][0][0] length_end = bbox_list[i][1][0] height_start = bbox_list[i][0][1] height_end = bbox_list[i][-1][1] # print([length_start, length_end, height_start, height_end]) order_list.append([text_list[i], length_start, length_end, height_start, height_end]) # text = text + infomation['text'] + "\n" if get_platform() == "Windows": print("get_sequential_data", order_list) if not order_list: if get_platform() == "Windows": print("get_sequential_data", "no order list") return "" # 根据bbox的坐标对输出排序 order_list.sort(key=lambda x: (x[3], x[1])) # 根据bbox分行分列 # col_list = [] # height_end = int((order_list[0][4] + order_list[0][3]) / 2) # for i in range(len(order_list)): # if height_end - threshold <= order_list[i][3] <= height_end + threshold: # col_list.append(order_list[i]) # else: # row_list.append(col_list) # col_list = [] # height_end = int((order_list[i][4] + order_list[i][3]) / 2) # col_list.append(order_list[i]) # if i == len(order_list) - 1: # row_list.append(col_list) row_list = [] used_box = [] threshold = 5 for box in order_list: if box in used_box: continue height_center = (box[4] + box[3]) / 2 row = [] for box2 in order_list: if box2 in used_box: continue height_center2 = (box2[4] + box2[3]) / 2 if height_center - threshold <= height_center2 <= height_center + threshold: if box2 not in row: row.append(box2) used_box.append(box2) row.sort(key=lambda x: x[0]) row_list.append(row) for row in row_list: if not row: continue if len(row) <= 1: text = text + row[0][0] + "\n" else: sub_text = "" row.sort(key=lambda x: x[1]) for col in row: sub_text = sub_text + col[0] + " " sub_text = sub_text + "\n" text += sub_text if html: text = "

" + text text = re.sub("\n", "

", text) text += "

" # if text[-5:] == "

": # text = text[:-5] return text except Exception as e: logging.info("get_sequential_data error!") print("get_sequential_data", traceback.print_exc()) return [-1] def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line): logging.info("into get_formatted_table") try: # 重新定义text_bbox_list，[point, point, text] text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in range(len(text_bbox_list))] # 按纵坐标排序 text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0])) table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0])) # print("text_bbox_list", text_bbox_list) # print("table_bbox_list", table_bbox_list) # bbox位置 threshold threshold = 5 # 根据split_line分区，可能有个区多个表格 [(), ()] area_text_bbox_list = [] area_table_bbox_list = [] # print("get_formatted_table, split_line", split_line) for j in range(1, len(split_line)): last_y = split_line[j - 1][0][1] current_y = split_line[j][0][1] temp_text_bbox_list = [] temp_table_bbox_list = [] # 找出该区域下text bbox for text_bbox in text_bbox_list: # 计算 text bbox 中心点 text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2, (text_bbox[1][1] + text_bbox[0][1]) / 2) if last_y - threshold <= text_bbox_center[1] <= current_y + threshold: temp_text_bbox_list.append(text_bbox) area_text_bbox_list.append(temp_text_bbox_list) # 找出该区域下table bbox for table_bbox in table_bbox_list: # 计算 table bbox 中心点 table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2, (table_bbox[1][1] + table_bbox[0][1]) / 2) if last_y < table_bbox_center[1] < current_y: temp_table_bbox_list.append(table_bbox) area_table_bbox_list.append(temp_table_bbox_list) # for j in range(len(area_text_bbox_list)): # print("area_text_bbox_list", j, area_text_bbox_list[j]) # 对每个区域分别进行两个bbox匹配，生成表格 area_text_list = [] area_column_list = [] for j in range(len(area_text_bbox_list)): # 每个区域的table bbox 和text bbox temp_table_bbox_list = area_table_bbox_list[j] temp_text_bbox_list = area_text_bbox_list[j] # 判断该区域有无表格bbox # 若无表格，将该区域文字连接 if not temp_table_bbox_list: # 找出该区域的所有text bbox only_text_list = [] only_bbox_list = [] for text_bbox in temp_text_bbox_list: only_text_list.append(text_bbox[2]) only_bbox_list.append([text_bbox[0], text_bbox[1]]) only_text = get_sequential_data(only_text_list, only_bbox_list, True) if only_text == [-1]: return [-1], [-1] area_text_list.append(only_text) area_column_list.append(0) continue # 有表格 # 文本对应的表格格子 text_in_table = {} for i in range(len(temp_text_bbox_list)): text_bbox = temp_text_bbox_list[i] # 计算 text bbox 中心点 text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2, (text_bbox[1][1] + text_bbox[0][1]) / 2) # 判断中心点在哪个table bbox中 for table_bbox in temp_table_bbox_list: # 中心点在table bbox中，将text写入字典 if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \ table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]: if str(table_bbox) in text_in_table.keys(): text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2] else: text_in_table[str(table_bbox)] = text_bbox[2] break # 如果未找到text bbox匹配的table bbox，加大threshold匹配 # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \ # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \ # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \ # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]): # if str(table_bbox) in text_in_table.keys(): # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2] # else: # text_in_table[str(table_bbox)] = text_bbox[2] # break # 对表格格子进行分行分列，并计算总计多少小列 # 放入坐标 all_col_list = [] all_row_list = [] for i in range(len(temp_table_bbox_list)): table_bbox = temp_table_bbox_list[i] # 放入所有坐标x if table_bbox[0][0] not in all_col_list: all_col_list.append(table_bbox[0][0]) if table_bbox[1][0] not in all_col_list: all_col_list.append(table_bbox[1][0]) # 放入所有坐标y if table_bbox[0][1] not in all_row_list: all_row_list.append(table_bbox[0][1]) if table_bbox[1][1] not in all_row_list: all_row_list.append(table_bbox[1][1]) all_col_list.sort(key=lambda x: x) all_row_list.sort(key=lambda x: x) # 分行 row_list = [] rows = [] temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0])) y_row = temp_table_bbox_list[0][0][1] for i in range(len(temp_table_bbox_list)): table_bbox = temp_table_bbox_list[i] if y_row - threshold <= table_bbox[0][1] <= y_row + threshold: rows.append(table_bbox) else: y_row = table_bbox[0][1] if rows: rows.sort(key=lambda x: x[0][0]) row_list.append(rows) rows = [] rows.append(table_bbox) # print("*" * 30) # print(row_list) if i == len(temp_table_bbox_list) - 1: if rows: rows.sort(key=lambda x: x[0][0]) row_list.append(rows) # 生成表格，包括文字和格子宽度 area_column = [] text = '' + "\n" for row in row_list: text += "" + "\n" for col in row: # 计算bbox y坐标之间有多少其他点，+1即为所占行数 row_span = 1 for y in all_row_list: if col[0][1] < y < col[1][1]: if y - col[0][1] >= 2 and col[1][1] - y >= 2: row_span += 1 # 计算bbox x坐标之间有多少其他点，+1即为所占列数 col_span = 1 for x in all_col_list: if col[0][0] < x < col[1][0]: if x - col[0][0] >= 2 and col[1][0] - x >= 2: col_span += 1 text += "" + "\n" text += "" + "\n" text += "

" if str(col) in text_in_table.keys(): text += text_in_table.get(str(col)) else: text += "" text += "

" + "\n" # 计算最大column max_col_num = 0 for row in row_list: col_num = 0 for col in row: col_num += 1 if max_col_num < col_num: max_col_num = col_num area_text_list.append(text) area_column_list.append(max_col_num) text = "" if get_platform() == "Windows": print("get_formatted_table area_text_list", area_text_list) for area_text in area_text_list: text += area_text return text, area_column_list except Exception as e: logging.info("get_formatted_table error!") print("get_formatted_table", traceback.print_exc()) return [-1], [-1] def rename_inner_files(root_path): try: logging.info("into rename_inner_files") # 获取解压文件夹下所有文件+文件夹，不带根路径 path_list = [] for root, dirs, files in os.walk(root_path, topdown=False): for name in dirs: p = os.path.join(root, name) + os.sep if get_platform() == "Windows": root_path = slash_replace(root_path) p = slash_replace(p) p = re.sub(root_path, "", p) root_path = slash_replace(root_path, True) p = slash_replace(p, True) else: p = re.sub(root_path, "", p) path_list.append(p) for name in files: p = os.path.join(root, name) if get_platform() == "Windows": root_path = slash_replace(root_path) p = slash_replace(p) p = re.sub(root_path, "", p) root_path = slash_replace(root_path, True) p = slash_replace(p, True) else: p = re.sub(root_path, "", p) path_list.append(p) # 按路径长度排序 path_list.sort(key=lambda x: len(x), reverse=True) # 循环改名 for old_path in path_list: # 按路径分隔符分割 ss = old_path.split(os.sep) # 判断是否文件夹 is_dir = 0 file_type = "" if os.path.isdir(root_path + old_path): ss = ss[:-1] is_dir = 1 else: if "." in old_path: file_type = "." + old_path.split(".")[-1] else: file_type = "" # 最后一级需要用hash改名 new_path = "" # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type current_level = 0 for s in ss: # 路径拼接 if current_level < len(ss) - 1: new_path += s + os.sep else: new_path += str(hash(s)) + file_type current_level += 1 new_ab_path = root_path + new_path old_ab_path = root_path + old_path os.rename(old_ab_path, new_ab_path) # 重新获取解压文件夹下所有文件+文件夹 new_path_list = [] for root, dirs, files in os.walk(root_path, topdown=False): for name in dirs: new_path_list.append(os.path.join(root, name) + os.sep) for name in files: new_path_list.append(os.path.join(root, name)) return new_path_list except: traceback.print_exc() return [-1] def judge_format(path): guess1 = mimetypes.guess_type(path) _type = None if guess1[0]: _type = guess1[0] else: guess2 = filetype.guess(path) if guess2: _type = guess2.mime if _type == "application/pdf": return "pdf" if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document": return "docx" if _type == "application/x-zip-compressed" or _type == "application/zip": return "zip" if _type == "application/x-rar-compressed" or _type == "application/rar": return "rar" if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": return "xlsx" if _type == "application/msword": return "doc" if _type == "image/png": return "png" if _type == "image/jpeg": return "jpg" # 猜不到，返回None return None def slash_replace(_str, reverse=False): if reverse: _str = eval(repr(_str).replace('/', '\\\\')) else: _str = eval(repr(_str).replace('\\\\', '/')) return _str if __name__ == "__main__": strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg" print(slash_replace(strs))