2 years ago · 7741734a8c
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -364,6 +364,12 @@ def _convert():
 
															     {[-5], 0}: 整个转换过程超时
														
 
															     {[-6], 0}: 阿里云UDF队列超时
														
 
															     {[-7], 1}: 文件需密码，无法打开
														
 
															+    {[-8], 0}: 调用现成接口报错
														
 
															+    {[-9], 0}: 接口接收数据为空
														
 
															+    {[-10], 0}: 长图分割报错
														
 
															+    {[-11], 0}: 新接口idc、isr、atc报错
														
 
															+    {[-12], 0}: 表格跨页连接报错
														
 
															+    {[-13], 0}: pdf表格线处理报错
														
 
															     :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
														
 
															     """
														
--- a/format_convert/convert_doc.py
+++ b/format_convert/convert_doc.py
@@ -1,8 +1,9 @@
 
															 import inspect
														
 
															 import os
														
 
															 import sys
														
 
															+from bs4 import BeautifulSoup
														
 
															 sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															-from format_convert.convert_tree import _Document
														
 
															+from format_convert.convert_tree import _Document, _Sentence, _Page
														
 
															 import logging
														
 
															 import traceback
														
 
															 from format_convert import get_memory_info
														
@@ -35,14 +36,31 @@ class DocConvert:
 
															         self.unique_type_dir = unique_type_dir
														
 
															     def convert(self):
														
 
															-        # 调用office格式转换
														
 
															-        file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
														
 
															-        if judge_error_code(file_path):
														
 
															-            self._doc.error_code = file_path
														
 
															-            return
														
 
															-        _docx = DocxConvert(file_path, self.unique_type_dir)
														
 
															-        _docx.convert()
														
 
															-        self._doc = _docx._doc
														
 
															+        # 先判断特殊doc文件，可能是html文本
														
 
															+        is_html_doc = False
														
 
															+        try:
														
 
															+            with open(self.path, 'r') as f:
														
 
															+                html_str = f.read()
														
 
															+            soup = BeautifulSoup(html_str, 'lxml')
														
 
															+            text = soup.text
														
 
															+            is_html_doc = True
														
 
															+        except:
														
 
															+            pass
														
 
															+
														
 
															+        if is_html_doc:
														
 
															+            self._page = _Page(None, 0)
														
 
															+            _sen = _Sentence(text, (0, 0, 0, 0))
														
 
															+            self._page.add_child(_sen)
														
 
															+            self._doc.add_child(self._page)
														
 
															+        else:
														
 
															+            # 调用office格式转换
														
 
															+            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
														
 
															+            if judge_error_code(file_path):
														
 
															+                self._doc.error_code = file_path
														
 
															+                return
														
 
															+            _docx = DocxConvert(file_path, self.unique_type_dir)
														
 
															+            _docx.convert()
														
 
															+            self._doc = _docx._doc
														
 
															     def get_html(self):
														
 
															         try:
														
@@ -52,5 +70,10 @@ class DocConvert:
 
															             self._doc.error_code = [-1]
														
 
															         if self._doc.error_code is not None:
														
 
															             return self._doc.error_code
														
 
															-        print(self._doc.children)
														
 
															+        # print(self._doc.children)
														
 
															         return self._doc.get_html()
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															     # Ad-hoc manual run: builds a DocConvert for a hard-coded, machine-specific
															     # Windows path and prints whatever get_html() returns.
															+    c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
														
 
															+    print(c.get_html())
														
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -10,7 +10,7 @@ import xml
 
															 import zipfile
														
 
															 import docx
														
 
															 from format_convert.convert_image import picture2text
														
 
															-from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator
														
 
															+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
														
 
															 from format_convert.wrapt_timeout_decorator import timeout
														
@@ -325,6 +325,18 @@ class DocxConvert:
 
															             return
														
 
															         order_list, text_list = order_and_text_list
														
 
															+        self._page = _Page(None, 0)
														
 
															+
														
 
															+        # 乱码返回文件格式错误
														
 
															+        match1 = re.findall(get_garble_code(), ''.join(text_list))
														
 
															+        if len(match1) > 10:
														
 
															+            log("doc/docx garbled code!")
														
 
															+            # self._doc.error_code = [-3]
														
 
															+            _sen = _Sentence('文件乱码！', (0, 0, 0, 0))
														
 
															+            self._page.add_child(_sen)
														
 
															+            self._doc.add_child(self._page)
														
 
															+            return
														
 
															+
														
 
															         # test
														
 
															         # for i in range(len(text_list)):
														
 
															         #     print(order_list[i], text_list[i])
														
@@ -338,7 +350,6 @@ class DocxConvert:
 
															         image_list = self.get_images()
														
 
															-        self._page = _Page(None, 0)
														
 
															         order_y = 0
														
 
															         doc_pr_cnt = 0
														
 
															         for tag in order_list:
														
@@ -427,3 +438,8 @@ class DocxConvert:
 
															         if self._doc.error_code is not None:
														
 
															             return self._doc.error_code
														
 
															         return self._doc.get_html()
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															     # Ad-hoc manual run: builds a DocxConvert for a hard-coded, machine-specific
															     # Windows path and prints whatever get_html() returns.
															+    c = DocxConvert("C:/Users/Administrator/Downloads/1631944542835.docx", "C:/Users/Administrator/Downloads/1/")
														
 
															+    print(c.get_html())
														
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -224,6 +224,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, u
 
															             # for _textbox in list_text_boxes:
														
 
															             #     print("==",_textbox.get_text())
														
 
															             lt = LineTable()
														
 
															+            # print('text_list', text_list)
														
 
															+            # print('bbox_list', bbox_list)
														
 
															+            # print('list_line', list_line)
														
 
															             tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines, False)
														
 
															             # 合并同一行textbox
														
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -5,6 +5,9 @@ import logging
 
															 import os
														
 
															 import re
														
 
															 import sys
														
 
															+
														
 
															+from bs4 import BeautifulSoup
														
 
															+
														
 
															 sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															 from pdfplumber import PDF
														
 
															 from pdfplumber.table import TableFinder
														
@@ -12,6 +15,10 @@ from pdfplumber.page import Page as pdfPage
 
															 from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table
														
 
															 import time
														
 
															 import pdfminer
														
 
															+import math
														
 
															+from scipy.stats import linregress
														
 
															+from matplotlib import pyplot as plt
														
 
															+from shapely.geometry import LineString, Point
														
 
															 from format_convert import timeout_decorator
														
 
															 from PIL import Image
														
 
															 from format_convert.convert_image import image_process
														
@@ -26,9 +33,9 @@ from pdfminer.pdfpage import PDFPage
 
															 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
														
 
															 from pdfminer.converter import PDFPageAggregator
														
 
															 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
														
 
															-    LTTextBoxVertical, LTLine
														
 
															+    LTTextBoxVertical, LTLine, LTTextContainer
														
 
															 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
														
 
															-    get_logger, log, memory_decorator,draw_lines_plt
														
 
															+    get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross
														
 
															 import fitz
														
 
															 from format_convert.wrapt_timeout_decorator import timeout
														
@@ -100,9 +107,9 @@ def pdf2Image(path, save_dir):
 
															 def pdf_analyze(interpreter, page, device, page_no):
														
 
															     log("into pdf_analyze")
														
 
															     pdf_time = time.time()
														
 
															-    print("pdf_analyze interpreter process...")
														
 
															+    # print("pdf_analyze interpreter process...")
														
 
															     interpreter.process_page(page)
														
 
															-    print("pdf_analyze device get_result...")
														
 
															+    # print("pdf_analyze device get_result...")
														
 
															     layout = device.get_result()
														
 
															     log("pdf2text page " + str(page_no) + " read time " + str(time.time() - pdf_time))
														
 
															     return layout
														
@@ -389,7 +396,7 @@ def pdf2text(path, unique_type_dir):
 
															                                 #         image_stream = ff.read()
														
 
															                                 except Exception:
														
 
															                                     log("pdf2text pdfminer read image in page " + str(page_no) +
														
 
															-                                                 "  fail! use pymupdf read image...")
														
 
															+                                        "  fail! use pymupdf read image...")
														
 
															                                     # print(traceback.print_exc())
														
 
															                                     image_text = page_info_dict.get(page_no)[0]
														
 
															                                     if image_text is None:
														
@@ -476,7 +483,7 @@ def pdf2text(path, unique_type_dir):
 
															         return [-3]
														
 
															     except Exception as e:
														
 
															         log("pdf2text error!")
														
 
															-        print("pdf2text", traceback.print_exc())
														
 
															+        traceback.print_exc()
														
 
															         return [-1]
														
@@ -497,7 +504,7 @@ def get_single_pdf(path, page_no):
 
															         raise e
														
 
															     except Exception as e:
														
 
															         log("get_single_pdf error! page " + str(page_no))
														
 
															-        print("get_single_pdf", traceback.print_exc())
														
 
															+        traceback.print_exc()
														
 
															         raise e
														
@@ -578,7 +585,7 @@ def page_table_connect(has_table_dict):
 
															     except Exception as e:
														
 
															         # print("page_table_connect", e)
														
 
															         log("page_table_connect error!")
														
 
															-        print("page_table_connect", traceback.print_exc())
														
 
															+        traceback.print_exc()
														
 
															         return [-1], [-1]
														
@@ -589,7 +596,7 @@ def read_pdf(path, package_name, packages):
 
															                         char_margin=0.3,
														
 
															                         line_margin=0.01,
														
 
															                         word_margin=0.01,
														
 
															-                        boxes_flow=0.1,)
														
 
															+                        boxes_flow=0.1, )
														
 
															     if package_name == packages[0]:
														
 
															         fp = open(path, 'rb')
														
@@ -668,7 +675,7 @@ class PDFConvert:
 
															                                 char_margin=0.3,
														
 
															                                 line_margin=0.01,
														
 
															                                 word_margin=0.01,
														
 
															-                                boxes_flow=0.1,)
														
 
															+                                boxes_flow=0.1, )
														
 
															             if package_name == self.packages[0]:
														
 
															                 # fp = open(self.path, 'rb')
														
 
															                 # parser = PDFParser(fp)
														
@@ -702,7 +709,7 @@ class PDFConvert:
 
															                 self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
														
 
															                 self.has_init_pdf[3] = 0
														
 
															             else:
														
 
															-                print("Only Support Packages", str(self.packages))
														
 
															+                log("Only Support Packages " + str(self.packages))
														
 
															                 raise Exception
														
 
															         except Exception as e:
														
 
															             log(package_name + " cannot open pdf!")
														
@@ -766,29 +773,401 @@ class PDFConvert:
 
															             self._doc.add_child(self._page)
														
 
															             page_no += 1
														
 
															def clean_text(self, _text):
															    """Return ``_text`` with every whitespace character removed.

															    Args:
															        _text: the string to strip of spaces, tabs, newlines, etc.

															    Returns:
															        A copy of ``_text`` without any character matched by ``\\s``.
															    """
															    # Raw string: "\s" in a plain literal is an invalid escape sequence
															    # that newer Python versions warn about.
															    return re.sub(r"\s", "", _text)
														
 
															-    def clean_text(self,_text):
														
 
															-
														
 
															-        return re.sub("\s","",_text)
														
 
															-
														
 
															-
														
 
															-    def get_text_lines(self,page,page_no):
														
 
															+    def get_text_lines(self, page, page_no):
														
 
															         lt_line_list = []
														
 
															         page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
														
 
															         self.doc_top += page_plumber.height
														
 
															         table_finder = TableFinder(page_plumber)
														
 
															+        all_width_zero = True
														
 
															+        for _edge in table_finder.get_edges():
														
 
															+            if _edge.get('linewidth') and _edge.get('linewidth') > 0:
														
 
															+                all_width_zero = False
														
 
															+                break
														
 
															         for _edge in table_finder.get_edges():
														
 
															-            lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
														
 
															-                                       (float(_edge["x1"]), float(_edge["y1"]))))
														
 
															-        log("pdf page %s has %s lines"%(str(page_no),str(len(lt_line_list))))
														
 
															+            # print(_edge)
														
 
															+            if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
														
 
															+                lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
														
 
															+                                           (float(_edge["x1"]), float(_edge["y1"]))))
														
 
															+        log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
														
 
															+        return lt_line_list
														
 
															+
														
 
															+    def get_page_lines(self, layout, page_no):
														
 
															def _plot(_line_list, mode=1):
															    """Debug helper: draw every segment with matplotlib and show the figure.

															    mode 1 reads the coordinates from each item's ``bbox`` attribute,
															    mode 2 expects each item to already be ``(x0, y0, x1, y1)``.
															    """
															    for _segment in _line_list:
															        if mode == 1:
															            x0, y0, x1, y1 = _segment.__dict__.get("bbox")
															        elif mode == 2:
															            x0, y0, x1, y1 = _segment
															        plt.plot([x0, x1], [y0, y1])
															    plt.show()
														
 
															+
														
 
															def is_cross(A, B, C, D):
															    """Return True when segment AB and segment CD touch or cross.

															    Args:
															        A, B: ``(x, y)`` end points of the first segment.
															        C, D: ``(x, y)`` end points of the second segment.

															    Returns:
															        True if the segments lie on the same vertical or horizontal line
															        and their extents overlap, or if they meet in exactly one point;
															        False otherwise.
															    """

															    def _ranges_overlap(a, b, c, d):
															        # Order-independent 1-D overlap test. The end points of a segment
															        # are not guaranteed to be given in ascending order.
															        return max(min(a, b), min(c, d)) <= min(max(a, b), max(c, d))

															    # All four points on one vertical line: compare the y extents.
															    if A[0] == B[0] == C[0] == D[0]:
															        if _ranges_overlap(A[1], B[1], C[1], D[1]):
															            return True
															        # Collinear but disjoint segments can never intersect.
															        return False
															    # All four points on one horizontal line: compare the x extents.
															    if A[1] == B[1] == C[1] == D[1]:
															        if _ranges_overlap(A[0], B[0], C[0], D[0]):
															            return True
															        return False

															    # General case: only a single intersection point counts as a cross.
															    # An empty result or a shared sub-segment does not.
															    int_pt = LineString([A, B]).intersection(LineString([C, D]))
															    return (not int_pt.is_empty) and int_pt.geom_type == 'Point'
														
 
															+
														
 
															def calculate_k(bbox):
															    """Return the slope of the segment described by ``bbox``.

															    Args:
															        bbox: ``(x0, y0, x1, y1)`` end points of the segment.

															    Returns:
															        ``(y1 - y0) / (x1 - x0)``; 0 when the segment is vertical,
															        degenerate, or the slope is not a number.
															    """
															    dx = bbox[2] - bbox[0]
															    # Through exactly two points the regression slope is just dy / dx, so
															    # compute it directly. A vertical segment has no finite slope and is
															    # mapped to 0, as the previous NaN handling did.
															    if dx == 0:
															        return 0
															    slope = (bbox[3] - bbox[1]) / dx
															    if math.isnan(slope):
															        slope = 0
															    return slope
														
 
															+
														
 
															def line_iou(line1, line2, axis=0):
															    """Overlap of two segments along one axis, relative to the shorter one.

															    Each line is ``((x0, y0), (x1, y1))``; ``axis`` picks the coordinate
															    (0 for x, 1 for y). The overlap length (negative when the segments
															    are apart) is divided by the shorter segment's extent on that axis.
															    Returns 0. when that extent is zero.
															    """
															    start1, end1 = line1[0][axis], line1[1][axis]
															    start2, end2 = line2[0][axis], line2[1][axis]
															    overlap = min(end1, end2) - max(start1, start2)
															    shorter = min(abs(start1 - end1), abs(start2 - end2))
															    return overlap / shorter if shorter != 0 else 0.
														
 
															+
														
 
															def get_cross_line(_line_list, threshold=1, cross_times=0):
															    """Keep the segments that touch at least one other segment.

															    Every ``[x0, y0, x1, y1]`` segment is first lengthened by ``threshold``
															    at both ends along its dominant direction (clamped to 0 and to the
															    enclosing scope's ``page_w`` / ``page_h``). A lengthened segment is
															    returned once its number of crossings with the other lengthened
															    segments reaches ``cross_times``.
															    """

															    def _stretch(seg):
															        # Wider than tall: extend along x. Otherwise extend along y.
															        if abs(seg[2] - seg[0]) > abs(seg[3] - seg[1]):
															            return [max(0, seg[0] - threshold), seg[1],
															                    min(seg[2] + threshold, page_w), seg[3]]
															        return [seg[0], max(0, seg[1] - threshold),
															                seg[2], min(seg[3] + threshold, page_h)]

															    stretched = [_stretch(seg) for seg in _line_list]
															    kept = []
															    for raw, seg_a in zip(_line_list, stretched):
															        # The membership test uses the un-stretched segment.
															        if raw in kept:
															            continue
															        hits = 0
															        for seg_b in stretched:
															            if seg_a == seg_b:
															                continue
															            if is_cross(seg_a[0:2], seg_a[2:4], seg_b[0:2], seg_b[2:4]):
															                hits += 1
															                if hits >= cross_times:
															                    kept.append(seg_a)
															                    break
															    return kept
														
 
															+
														
 
															def repair_bias_line(_line_list):
															    """Straighten slightly slanted segments into axis-aligned ones.

															    A segment wider than it is tall becomes horizontal at its smaller y;
															    any other segment becomes vertical at its smaller x. Returns a new
															    list of ``[x0, y0, x1, y1]`` lists.
															    """
															    straightened = []
															    for x0, y0, x1, y1 in _line_list:
															        if abs(x0 - x1) > abs(y0 - y1):
															            low_y = min(y0, y1)
															            straightened.append([x0, low_y, x1, low_y])
															        else:
															            low_x = min(x0, x1)
															            straightened.append([low_x, y0, low_x, y1])
															    return straightened
														
 
															+
														
 
															def repair_col_line(_straight_list, _bias_list, threshold=2, min_width=7):
															    """Snap slanted vertical lines onto existing columns and prune weak columns.

															    Args:
															        _straight_list: axis-aligned segments ``[x0, y0, x1, y1]``. Sorted
															            in place by ``(x0, y0)``.
															        _bias_list: slanted segments in the same format.
															        threshold: maximum x distance for a segment to belong to a column.
															        min_width: columns whose x positions differ by at most this value
															            compete; the one whose total length is less than a third of
															            the other's is removed.

															    Returns:
															        A new list holding the straight segments plus one snapped segment
															        per matching slanted vertical line, de-duplicated and sorted by
															        ``(x0, y0)``, minus the pruned columns. ``[]`` when either input
															        is empty.
															    """
															    if not _straight_list or not _bias_list:
															        print('add_col_bias_line empty', len(_straight_list), len(_bias_list))
															        return []

															    def _is_horizontal(seg):
															        return abs(seg[0] - seg[2]) > abs(seg[1] - seg[3])

															    def _group_columns(segments):
															        # Walk the sorted segments and start a new column whenever a
															        # vertical segment is farther than ``threshold`` from the current
															        # column's anchor x. Horizontal segments are ignored.
															        columns, current = [], []
															        anchor_x = segments[0][0]
															        for seg in segments:
															            if _is_horizontal(seg):
															                continue
															            if min(seg[0], seg[2]) - threshold <= anchor_x <= max(seg[0], seg[2]) + threshold:
															                current.append(seg)
															            else:
															                if current:
															                    columns.append(current)
															                current = [seg]
															                anchor_x = seg[0]
															        if current:
															            columns.append(current)
															        return columns

															    # Group the straight segments into columns.
															    _straight_list.sort(key=lambda x: (x[0], x[1]))
															    cols = _group_columns(_straight_list)

															    # Add a snapped copy of every slanted vertical line that sits on a
															    # column. Uses the ``_bias_list`` argument, not a variable of the
															    # enclosing scope.
															    new_list = []
															    for line in _bias_list:
															        if _is_horizontal(line):
															            continue
															        for col in cols:
															            w = col[0][0]
															            if w - threshold <= line[0] <= w + threshold or w - threshold <= line[2] <= w + threshold:
															                # Pad by 3 at both ends, as the original code did.
															                new_list.append([w, line[1] - 3, w, line[3] + 3])
															    new_list += _straight_list

															    # De-duplicate by value without a str()/eval() round trip, keeping
															    # each segment's container type and returning fresh copies.
															    unique = {}
															    for seg in new_list:
															        unique.setdefault((type(seg), tuple(seg)), seg)
															    new_list = [type(seg)(seg) for seg in unique.values()]

															    # Regroup the combined segments into columns.
															    new_list.sort(key=lambda x: (x[0], x[1]))
															    cols = _group_columns(new_list)
															    col_lengths = [sum(abs(c[1] - c[3]) for c in col) for col in cols]

															    # Of two columns that are close together, drop the one that is less
															    # than a third as long as the other.
															    for i, col1 in enumerate(cols):
															        for j, col2 in enumerate(cols):
															            if col1 == col2 or abs(col1[0][0] - col2[0][0]) > min_width:
															                continue
															            if col_lengths[i] > col_lengths[j] * 3:
															                for c in col2:
															                    if c in new_list:
															                        new_list.remove(c)
															            if col_lengths[j] > col_lengths[i] * 3:
															                for c in col1:
															                    if c in new_list:
															                        new_list.remove(c)
															    return new_list
														
 
															+
														
 
															+        def merge_line(_line_list, threshold=2):
														
 
															+            new_line_list = []
														
 
															+            # 分列
														
 
															+            _line_list.sort(key=lambda x: (x[0], x[1]))
														
 
															+            cols = []
														
 
															+            col = [_line_list[0]]
														
 
															+            current_w = _line_list[0][0]
														
 
															+            for line in _line_list:
														
 
															+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
														
 
															+                    continue
														
 
															+                if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
														
 
															+                        and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
														
 
															+                    col.append(line)
														
 
															+                else:
														
 
															+                    if col:
														
 
															+                        cols.append(col)
														
 
															+                    col = [line]
														
 
															+                    current_w = line[0]
														
 
															+            if col:
														
 
															+                cols.append(col)
														
 
															+
														
 
															+            for col in cols:
														
 
															+                temp_c = col[0]
														
 
															+                col_w = col[0][0]
														
 
															+                for i in range(len(col) - 1):
														
 
															+                    c = col[i]
														
 
															+                    next_c = col[i + 1]
														
 
															+                    if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]):
														
 
															+                        temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
														
 
															+                                  max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
														
 
															+                    else:
														
 
															+                        new_line_list.append(temp_c)
														
 
															+                        temp_c = next_c
														
 
															+                if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
														
 
															+                    new_line_list.append(temp_c)
														
 
															+
														
 
															+            # 分行
														
 
															+            _line_list.sort(key=lambda x: (x[1], x[0]))
														
 
															+            rows = []
														
 
															+            row = []
														
 
															+            current_h = _line_list[0][1]
														
 
															+            for line in _line_list:
														
 
															+                if abs(line[0] - line[2]) < abs(line[1] - line[3]):
														
 
															+                    continue
														
 
															+                if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
														
 
															+                    row.append(line)
														
 
															+                else:
														
 
															+                    if row:
														
 
															+                        rows.append(row)
														
 
															+                    row = [line]
														
 
															+                    current_h = line[1]
														
 
															+            if row:
														
 
															+                rows.append(row)
														
 
															+
														
 
															+            for row in rows:
														
 
															+                temp_r = row[0]
														
 
															+                row_h = row[0][1]
														
 
															+                for i in range(len(row) - 1):
														
 
															+                    r = row[i]
														
 
															+                    next_r = row[i + 1]
														
 
															+                    # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
														
 
															+                    if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0):
														
 
															+                        temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
														
 
															+                                  max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
														
 
															+                    else:
														
 
															+                        new_line_list.append(temp_r)
														
 
															+                        temp_r = next_r
														
 
															+                if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
														
 
															+                    new_line_list.append(temp_r)
														
 
															+            return new_line_list
														
 
															+
														
 
															+        def remove_outline_no_cross(_line_list):
														
 
															+            row_list = []
														
 
															+            col_list = []
														
 
															+            for line in _line_list:
														
 
															+                # 存所有行
														
 
															+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
														
 
															+                    row_list.append(line)
														
 
															+                # 存所有列
														
 
															+                if abs(line[0] - line[2]) < abs(line[1] - line[3]):
														
 
															+                    col_list.append(line)
														
 
															+
														
 
															+            if not col_list:
														
 
															+                return _line_list
														
 
															+
														
 
															+            # 左右两条边框
														
 
															+            col_list.sort(key=lambda x: (x[0], x[1]))
														
 
															+            left_col = col_list[0]
														
 
															+            right_col = col_list[-1]
														
 
															+
														
 
															+            # 判断有交点但中间区域无交点
														
 
															+            compare_list = []
														
 
															+            for col in [left_col, right_col]:
														
 
															+                add_h = abs(col[1]-col[3]) / 8
														
 
															+                center_area = [col[1]+add_h, col[3]-add_h]
														
 
															+                cross_cnt = 0
														
 
															+                center_cross_cnt = 0
														
 
															+                center_row_cnt = 0
														
 
															+                for row in row_list:
														
 
															+                    if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
														
 
															+                        if center_area[0] <= row[1] <= center_area[1]:
														
 
															+                            center_cross_cnt += 1
														
 
															+                        else:
														
 
															+                            cross_cnt += 1
														
 
															+                    else:
														
 
															+                        if center_area[0] <= row[1] <= center_area[1]:
														
 
															+                            center_row_cnt += 1
														
 
															+                compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
														
 
															+
														
 
															+            _flag = True
														
 
															+            for c in compare_list:
														
 
															+                if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
														
 
															+                    continue
														
 
															+                _flag = False
														
 
															+            print('compare_list', compare_list)
														
 
															+            if _flag and compare_list[0][1] == compare_list[1][1] \
														
 
															+                    and compare_list[0][2] == compare_list[1][2]:
														
 
															+                for col in [left_col, right_col]:
														
 
															+                    if col in _line_list:
														
 
															+                        _line_list.remove(col)
														
 
															+            return _line_list
														
 
															+
														
 
															+        log('into get_page_lines')
														
 
															+
														
 
															+        page_h = layout.height
														
 
															+        page_w = layout.width
														
 
															+
														
 
															+        element_list = []
														
 
															+        line_list = []
														
 
															+        bias_line_list = []
														
 
															+        text_bbox_list = []
														
 
															+        for element in layout:
														
 
															+            if isinstance(element, LTTextContainer):
														
 
															+                text_bbox_list.append(element.bbox)
														
 
															+
														
 
															+            # 只取这三种类型的bbox
														
 
															+            if isinstance(element, (LTRect, LTCurve, LTLine)):
														
 
															+                element_list.append(element)
														
 
															+                if element.height > 0.5 and element.width > 0.5:
														
 
															+                    # print('element.height, element.width', element.height, element.width)
														
 
															+                    k = calculate_k(element.bbox)
														
 
															+                    if 1.73 / 3 < abs(k) < 1.73:
														
 
															+                        continue
														
 
															+                    else:
														
 
															+                        bias_line_list.append(element.bbox)
														
 
															+                    continue
														
 
															+                line_list.append(element.bbox)
														
 
															+
														
 
															+        if not line_list and not bias_line_list:
														
 
															+            return []
														
 
															+
														
 
															+        # 是否使用斜线来生成表格
														
 
															+        if len(line_list) < 6 and len(bias_line_list) > len(line_list) * 2:
														
 
															+            # print('use bias line')
														
 
															+            # bias_line_list += add_col_bias_line(line_list, bias_line_list)
														
 
															+            line_list = bias_line_list
														
 
															+
														
 
															+        # 去重
														
 
															+        line_list = [str(x) for x in line_list]
														
 
															+        line_list = list(set(line_list))
														
 
															+        line_list = [eval(x) for x in line_list]
														
 
															+
														
 
															+        # 根据是否有交点判断表格线
														
 
															+        cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
														
 
															+
														
 
															+        if not cross_line_list:
														
 
															+            return []
														
 
															+
														
 
															+        # 斜线校正
														
 
															+        if cross_line_list:
														
 
															+            cross_line_list = repair_bias_line(cross_line_list)
														
 
															+
														
 
															+        # 修复竖线
														
 
															+        if bias_line_list:
														
 
															+            cross_line_list = repair_col_line(cross_line_list, bias_line_list)
														
 
															+
														
 
															+        # 根据是否有交点判断表格线
														
 
															+        cross_line_list = get_cross_line(cross_line_list, threshold=1, cross_times=1)
														
 
															+
														
 
															+        # 合并线条
														
 
															+        cross_line_list = merge_line(cross_line_list)
														
 
															+
														
 
															+        # 删除最外层嵌套边框
														
 
															+        cross_line_list = remove_outline_no_cross(cross_line_list)
														
 
															+        # show
														
 
															+        # print('len(cross_line_list)', len(cross_line_list))
														
 
															+        # _plot(line_list, mode=2)
														
 
															+        # _plot(cross_line_list, mode=2)
														
 
															+
														
 
															+        lt_line_list = []
														
 
															+        for line in cross_line_list:
														
 
															+            lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
														
 
															+                                       (float(line[2]), float(line[3]))))
														
 
															+        log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
														
 
															         return lt_line_list
														
 
															-    def recognize_text(self,layout,page_no,lt_text_list,lt_line_list):
														
 
															+    def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
														
 
															         list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
														
 
															         self._page.in_table_objs = filter_objs
														
 
															-        print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
														
 
															+        # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
														
 
															         for table in list_tables:
														
 
															             _table = _Table(table["table"], table["bbox"])
														
@@ -804,7 +1183,7 @@ class PDFConvert:
 
															         # pdf对象需反向排序
														
 
															         self._page.is_reverse = True
														
 
															-    def is_text_legal(self,lt_text_list,page_no):
														
 
															+    def is_text_legal(self, lt_text_list, page_no):
														
 
															         # 无法识别pdf字符编码，整页用ocr
														
 
															         text_temp = ""
														
 
															         for _t in lt_text_list:
														
@@ -819,6 +1198,19 @@ class PDFConvert:
 
															                 _image = _Image(page_image[1], page_image[0])
														
 
															                 self._page.add_child(_image)
														
 
															             return False
														
 
															+
														
 
															+        match1 = re.findall(get_garble_code(), text_temp)
														
 
															+        # match2 = re.search('[\u4e00-\u9fa5]', text_temp)
														
 
															+        if len(match1) > 3 and len(text_temp) > 10:
														
 
															+            log("pdf garbled code! try pymupdf... " + text_temp[:20])
														
 
															+            page_image = self.get_page_image(page_no)
														
 
															+            if judge_error_code(page_image):
														
 
															+                self._page.error_code = page_image
														
 
															+            else:
														
 
															+                _image = _Image(page_image[1], page_image[0])
														
 
															+                self._page.add_child(_image)
														
 
															+            return False
														
 
															+
														
 
															         return True
														
 
															     def convert_page(self, page, page_no):
														
@@ -852,7 +1244,7 @@ class PDFConvert:
 
															                         lt_image_list.append(y)
														
 
															                         # image_count += 1
														
 
															         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
														
 
															-        print("convert_pdf page", page_no)
														
 
															+        log("convert_pdf page " + str(page_no))
														
 
															         log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
														
 
															         # 若只有文本且图片数为0，直接提取文字及表格
														
@@ -873,14 +1265,18 @@ class PDFConvert:
 
															                     self._page.add_child(_image)
														
 
															                 return
														
 
															-
														
 
															-            if not self.is_text_legal(lt_text_list,page_no):
														
 
															+            if not self.is_text_legal(lt_text_list, page_no):
														
 
															                 return
														
 
															             try:
														
 
															-                lt_line_list = self.get_text_lines(page,page_no)
														
 
															-                self.recognize_text(layout,page_no,lt_text_list,lt_line_list)
														
 
															-
														
 
															+                lt_line_list = self.get_page_lines(layout, page_no)
														
 
															+            except:
														
 
															+                traceback.print_exc()
														
 
															+                lt_line_list = []
														
 
															+                self._page.error_code = [-13]
														
 
															+            try:
														
 
															+                # lt_line_list = self.get_text_lines(page,page_no)
														
 
															+                self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
														
 
															             except:
														
 
															                 traceback.print_exc()
														
 
															                 self._page.error_code = [-8]
														
@@ -902,7 +1298,7 @@ class PDFConvert:
 
															             # 图表对象
														
 
															             for image in lt_image_list:
														
 
															                 try:
														
 
															-                    print("pdf2text LTImage size", page_no, image.width, image.height)
														
 
															+                    # print("pdf2text LTImage size", page_no, image.width, image.height)
														
 
															                     image_stream = image.stream.get_data()
														
 
															                     # 小的图忽略
														
 
															                     if image.width <= 300 and image.height <= 300:
														
@@ -911,7 +1307,7 @@ class PDFConvert:
 
															                     img_test = Image.open(io.BytesIO(image_stream))
														
 
															                     # img_test.show()
														
 
															                     if image.height >= 1000 and image.width >= 1000:
														
 
															-                        print("pdf2text LTImage stream output size", img_test.size)
														
 
															+                        # print("pdf2text LTImage stream output size", img_test.size)
														
 
															                         page_image = self.get_page_image(page_no)
														
 
															                         if judge_error_code(page_image):
														
 
															                             self._page.error_code = page_image
														
@@ -932,19 +1328,25 @@ class PDFConvert:
 
															                 except Exception:
														
 
															                     log("pdf2text pdfminer read image in page " + str(page_no) +
														
 
															                         "  fail! use pymupdf read image...")
														
 
															-                    print(traceback.print_exc())
														
 
															+                    traceback.print_exc()
														
 
															             # pdf对象需反向排序
														
 
															             self._page.is_reverse = True
														
 
															             self.init_package("pdfplumber")
														
 
															-            if not self.is_text_legal(lt_text_list,page_no):
														
 
															+            if not self.is_text_legal(lt_text_list, page_no):
														
 
															                 return
														
 
															-            lt_line_list = self.get_text_lines(page,page_no)
														
 
															-            self.recognize_text(layout,page_no,lt_text_list,lt_line_list)
														
 
															+            # lt_line_list = self.get_text_lines(page, page_no)
														
 
															+            try:
														
 
															+                lt_line_list = self.get_page_lines(layout, page_no)
														
 
															+            except:
														
 
															+                traceback.print_exc()
														
 
															+                lt_line_list = []
														
 
															+                self._page.error_code = [-13]
														
 
															+            self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
														
 
															     def get_layout(self, page, page_no):
														
 
															-        log("")
														
 
															+        log("get_layout")
														
 
															         if self.has_init_pdf[0] == 0:
														
 
															             self.init_package("pdfminer")
														
 
															         if self._doc.error_code is not None:
														
@@ -1032,11 +1434,137 @@ class PDFConvert:
 
															                 continue
														
 
															             self._doc.add_child(self._page)
														
 
															+    def connect_table(self, html_list):
														
 
															+        if not html_list:
														
 
															+            return html_list
														
 
															+
														
 
															+        # 判断条件1：最后一个表格后有无非页码文本/第一个表格前有无文本
														
 
															+        connect_flag_list = []
														
 
															+        soup_list = []
														
 
															+        for i, h in enumerate(html_list):
														
 
															+            soup_list.append(BeautifulSoup(h, 'lxml'))
														
 
															+            # 找最后一个表格
														
 
															+            table_start1, table_end1 = None, None
														
 
															+            # print('h', h)
														
 
															+            match = re.finditer('<table', h)
														
 
															+            for m in match:
														
 
															+                table_start1 = m.span()[0]
														
 
															+            if table_start1 is not None:
														
 
															+                match = re.finditer('</table>', h[table_start1:])
														
 
															+                for m in match:
														
 
															+                    table_end1 = m.span()[1] + table_start1
														
 
															+            # 最后一个表格后有无除了页码外的内容
														
 
															+            connect_flag1 = False
														
 
															+            if table_end1 is not None:
														
 
															+                match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[table_end1:]))
														
 
															+                # print('match1', match.group())
														
 
															+                if not match or match.group() == '':
														
 
															+                    connect_flag1 = True
														
 
															+
														
 
															+            # 找第一个表格
														
 
															+            table_start2, table_end2 = None, None
														
 
															+            match = re.finditer('<table', h)
														
 
															+            for m in match:
														
 
															+                table_start2 = m.span()[0]
														
 
															+                break
														
 
															+            # 第一个表格后有无内容
														
 
															+            connect_flag2 = False
														
 
															+            if table_start2 is not None and table_start2 == 0:
														
 
															+                connect_flag2 = True
														
 
															+            connect_flag_list.append([i, connect_flag2, connect_flag1])
														
 
															+
														
 
															+        # print('connect_flag_list', connect_flag_list)
														
 
															+
														
 
															+        # 根据条件1合并需连接页码，形成组
														
 
															+        connect_pages_list = []
														
 
															+        temp_list = []
														
 
															+        for i, c in enumerate(connect_flag_list):
														
 
															+            if temp_list and c[1]:
														
 
															+                temp_list.append(c)
														
 
															+            elif not temp_list and c[2]:
														
 
															+                temp_list.append(c)
														
 
															+            else:
														
 
															+                if temp_list:
														
 
															+                    connect_pages_list.append(temp_list)
														
 
															+                    temp_list = []
														
 
															+                connect_pages_list.append([c])
														
 
															+        if temp_list:
														
 
															+            connect_pages_list.append(temp_list)
														
 
															+
														
 
															+        # print('connect_pages_list', connect_pages_list)
														
 
															+
														
 
															+        # 判断条件2：判断组内列数是否相同
														
 
															+        connect_pages_list2 = []
														
 
															+        for c_list in connect_pages_list:
														
 
															+            if len(c_list) == 1:
														
 
															+                connect_pages_list2.append(c_list)
														
 
															+            else:
														
 
															+                col_cnt_list = []
														
 
															+                for c in c_list:
														
 
															+                    soup = soup_list[c[0]]
														
 
															+                    table1 = soup.findAll('table')[-1]
														
 
															+                    table2 = soup.findAll('table')[0]
														
 
															+                    tr1 = table1.findAll('tr')
														
 
															+                    tr2 = table2.findAll('tr')
														
 
															+                    td1 = tr1[-1].findAll('td')
														
 
															+                    td2 = tr2[0].findAll('td')
														
 
															+                    col_cnt_list.append([len(td2), len(td1)])
														
 
															+                new_c_list = [c_list[0]]
														
 
															+                # print('col_cnt_list', col_cnt_list)
														
 
															+                for i in range(len(col_cnt_list) - 1):
														
 
															+                    if col_cnt_list[i][1] != col_cnt_list[i + 1][0]:
														
 
															+                        connect_pages_list2.append(new_c_list)
														
 
															+                        new_c_list = [c_list[i + 1]]
														
 
															+                    else:
														
 
															+                        new_c_list.append(c_list[i + 1])
														
 
															+                if new_c_list:
														
 
															+                    connect_pages_list2.append(new_c_list)
														
 
															+
														
 
															+        # print('connect_pages_list2', connect_pages_list2)
														
 
															+
														
 
															+        # 符合连接条件的拼接表格
														
 
															+        new_html_list = []
														
 
															+        for c_list in connect_pages_list2:
														
 
															+            if len(c_list) == 1:
														
 
															+                new_html_list.append(html_list[c_list[0][0]])
														
 
															+                continue
														
 
															+            new_html = ''
														
 
															+            for c in c_list:
														
 
															+                new_html += html_list[c[0]]
														
 
															+            new_html = re.sub('</table>([-/第页0-9]|<div>|</div>)*<table border="1">', '<tr><td>#@#@#</td></tr>',
														
 
															+                              new_html)
														
 
															+
														
 
															+            soup = BeautifulSoup(new_html, 'lxml')
														
 
															+            trs = soup.findAll('tr')
														
 
															+            for i in range(len(trs)):
														
 
															+                if trs[i].get_text() == '#@#@#':
														
 
															+                    td1 = trs[i - 1].findAll('td')
														
 
															+                    td2 = trs[i + 1].findAll('td')
														
 
															+                    if td2[0].get_text() == '':
														
 
															+                        for j in range(len(td1)):
														
 
															+                            td1[j].string = td1[j].get_text() + td2[j].get_text()
														
 
															+                        trs[i + 1].decompose()
														
 
															+                    trs[i].decompose()
														
 
															+            new_html = str(soup)
														
 
															+            new_html_list.append(new_html)
														
 
															+
														
 
															+        html_str = ''
														
 
															+        for h in new_html_list:
														
 
															+            html_str += h
														
 
															+        return [html_str]
														
 
															+
														
 
															     def get_html(self):
														
 
															         self.convert()
														
 
															         if self._doc.error_code is not None:
														
 
															             return self._doc.error_code
														
 
															-        return self._doc.get_html()
														
 
															+        html = self._doc.get_html(return_list=True)
														
 
															+        # 表格连接
														
 
															+        try:
														
 
															+            html = self.connect_table(html)
														
 
															+        except:
														
 
															+            traceback.print_exc()
														
 
															+            return [-12]
														
 
															+        return html
														
 
															     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
														
 
															         # 删除过多重复字句，为水印
														
@@ -1075,7 +1603,7 @@ class PDFConvert:
 
															             ratio = max_size / _img.shape[resize_axis]
														
 
															             new_shape = [0, 0]
														
 
															             new_shape[resize_axis] = max_size
														
 
															-            new_shape[1-resize_axis] = int(_img.shape[1-resize_axis] * ratio)
														
 
															+            new_shape[1 - resize_axis] = int(_img.shape[1 - resize_axis] * ratio)
														
 
															             _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
														
 
															             cv2.imwrite(img_path, _img)
														
@@ -1097,11 +1625,116 @@ class PDFConvert:
 
															             return [-3]
														
 
															+def get_text_font():
														
 
															+    def flags_decomposer(flags):
														
 
															+        """Make font flags human readable."""
														
 
															+        l = []
														
 
															+        if flags & 2 ** 0:
														
 
															+            l.append("superscript")
														
 
															+        if flags & 2 ** 1:
														
 
															+            l.append("italic")
														
 
															+        if flags & 2 ** 2:
														
 
															+            l.append("serifed")
														
 
															+        else:
														
 
															+            l.append("sans")
														
 
															+        if flags & 2 ** 3:
														
 
															+            l.append("monospaced")
														
 
															+        else:
														
 
															+            l.append("proportional")
														
 
															+        if flags & 2 ** 4:
														
 
															+            l.append("bold")
														
 
															+        return ", ".join(l)
														
 
															+
														
 
															+    def get_underlined_textLines(page):
														
 
															+        """
														
 
															+        获取某页pdf上的所有下划线文本信息
														
 
															+        :param page: fitz中的一页
														
 
															+        :return: list of tuples,每个tuple都是一个完整的下划线覆盖的整体：[(下划线句, 所在blk_no, 所在line_no), ...]
														
 
															+        """
														
 
															+        paths = page.get_drawings()  # get drawings on the current page
														
 
															+
														
 
															+        # 获取该页内所有的height很小的bbox。因为下划线其实大多是这种矩形
														
 
															+        # subselect things we may regard as lines
														
 
															+        lines = []
														
 
															+        for p in paths:
														
 
															+            for item in p["items"]:
														
 
															+                if item[0] == "l":  # an actual line
														
 
															+                    p1, p2 = item[1:]
														
 
															+                    if p1.y == p2.y:
														
 
															+                        lines.append((p1, p2))
														
 
															+                elif item[0] == "re":  # a rectangle: check if height is small
														
 
															+                    r = item[1]
														
 
															+                    if r.width > r.height and r.height <= 2:
														
 
															+                        lines.append((r.tl, r.tr))  # take top left / right points
														
 
															+
														
 
															+        # 获取该页的`max_lineheight`，用于下面比较距离使用
														
 
															+        blocks = page.get_text("dict", flags=11)["blocks"]
														
 
															+        max_lineheight = 0
														
 
															+        for b in blocks:
														
 
															+            for l in b["lines"]:
														
 
															+                bbox = fitz.Rect(l["bbox"])
														
 
															+                if bbox.height > max_lineheight:
														
 
															+                    max_lineheight = bbox.height
														
 
															+
														
 
															+        underlined_res = []
														
 
															+        # 开始对下划线内容进行查询
														
 
															+        # make a list of words
														
 
															+        words = page.get_text("words")
														
 
															+        # if underlined, the bottom left / right of a word
														
 
															+        # should not be too far away from left / right end of some line:
														
 
															+        for wdx, w in enumerate(words):  # w[4] is the actual word string
														
 
															+            r = fitz.Rect(w[:4])  # first 4 items are the word bbox
														
 
															+            for p1, p2 in lines:  # check distances for start / end points
														
 
															+                if abs(r.bl - p1) <= max_lineheight:  # 当前word的左下满足下划线左下
														
 
															+                    if abs(r.br - p2) <= max_lineheight:  # 当前word的右下满足下划线右下（单个词，无空格）
														
 
															+                        print(f"Word '{w[4]}' is underlined! Its block-line number is {w[-3], w[-2]}")
														
 
															+                        underlined_res.append((w[4], w[-3], w[-2]))  # 分别是(下划线词，所在blk_no，所在line_no)
														
 
															+                        break  # don't check more lines
														
 
															+                    else:  # 继续寻找同line右侧的有缘人，因为有些下划线覆盖的词包含多个词，多个词之间有空格
														
 
															+                        curr_line_num = w[-2]  # line nunmber
														
 
															+                        for right_wdx in range(wdx + 1, len(words), 1):
														
 
															+                            _next_w = words[right_wdx]
														
 
															+                            if _next_w[-2] != curr_line_num:  # 当前遍历到的右侧word已经不是当前行的了（跨行是不行的）
														
 
															+                                break
														
 
															+                            _r_right = fitz.Rect(_next_w[:4])  # 获取当前同行右侧某word的方框4点
														
 
															+                            if abs(_r_right.br - p2) <= max_lineheight:  # 用此word右下点和p2(目标下划线右上点)算距离，距离要小于max_lineheight
														
 
															+                                print(
														
 
															+                                    f"Word '{' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]])}' is underlined! " +
														
 
															+                                    f"Its block-line number is {w[-3], w[-2]}")
														
 
															+                                underlined_res.append(
														
 
															+                                    (' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]]),
														
 
															+                                     w[-3], w[-2])
														
 
															+                                )  # 分别是(下划线词，所在blk_no，所在line_no)
														
 
															+                                break  # don't check more lines
														
 
															+        return underlined_res
														
 
															+
														
 
															+    _p = r'C:\Users\Administrator\Desktop\test_pdf\error2-2.pdf'
														
 
															+    doc_pymupdf = read_pymupdf(_p)
														
 
															+    page = doc_pymupdf[0]
														
 
															+    blocks = page.get_text("dict", flags=11)["blocks"]
														
 
															+    for b in blocks:  # iterate through the text blocks
														
 
															+        for l in b["lines"]:  # iterate through the text lines
														
 
															+            for s in l["spans"]:  # iterate through the text spans
														
 
															+                print("")
														
 
															+                font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
														
 
															+                    s["font"],  # font name
														
 
															+                    flags_decomposer(s["flags"]),  # readable font flags
														
 
															+                    s["size"],  # font size
														
 
															+                    s["color"],  # font color
														
 
															+                )
														
 
															+                print(s)
														
 
															+                print("Text: '%s'" % s["text"])  # simple print of text
														
 
															+                print(font_properties)
														
 
															+
														
 
															+    get_underlined_textLines(page)
														
 
															+
														
 
															+
														
 
															 # 以下为现成pdf单页解析接口
														
 
															 class ParseSentence:
														
 
															-    def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
														
 
															-        (x0,y0,x1,y1) = bbox
														
 
															+    def __init__(self, bbox, fontname, fontsize, _text, _title, title_text, _pattern, title_degree, is_outline,
														
 
															+                 outline_location, page_no):
														
 
															+        (x0, y0, x1, y1) = bbox
														
 
															         self.x0 = x0
														
 
															         self.y0 = y0
														
 
															         self.x1 = x1
														
@@ -1119,7 +1752,7 @@ class ParseSentence:
 
															         self.page_no = page_no
														
 
															     def __repr__(self):
														
 
															-        return "%s,%s,%s,%d,%s"%(self.text,self.title,self.is_outline,self.outline_location,str(self.bbox))
														
 
															+        return "%s,%s,%s,%d,%s" % (self.text, self.title, self.is_outline, self.outline_location, str(self.bbox))
														
 
															 class ParseUtils:
														
@@ -1127,11 +1760,11 @@ class ParseUtils:
 
															     @staticmethod
														
 
															     def getFontinfo(_page):
														
 
															         for _obj in _page._objs:
														
 
															-            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
														
 
															+            if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
														
 
															                 for textline in _obj._objs:
														
 
															                     done = False
														
 
															                     for lchar in textline._objs:
														
 
															-                        if isinstance(lchar,(LTChar)):
														
 
															+                        if isinstance(lchar, (LTChar)):
														
 
															                             _obj.fontname = lchar.fontname
														
 
															                             _obj.fontsize = lchar.size
														
 
															                         done = True
														
@@ -1153,7 +1786,7 @@ class ParseUtils:
 
															             _find = False
														
 
															             for _ct in cluster_textbox:
														
 
															-                if abs(_ct["y"]-_textbox.bbox[1]) < 5:
														
 
															+                if abs(_ct["y"] - _textbox.bbox[1]) < 5:
														
 
															                     _find = True
														
 
															                     _ct["textbox"].append(_textbox)
														
 
															             if not _find:
														
@@ -1167,14 +1800,14 @@ class ParseUtils:
 
															             _linetext = _textboxs[0].get_text()
														
 
															             for _i in range(1, len(_textboxs)):
														
 
															-                if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[2])>60:
														
 
															+                if abs(_textboxs[_i].bbox[0] - _textboxs[_i - 1].bbox[2]) > 60:
														
 
															                     if _linetext[-1] not in (",", "，", "。", ".", "、", "；"):
														
 
															                         _linetext += "=，="
														
 
															                 _linetext += _textboxs[_i].get_text()
														
 
															             _linetext = re.sub("[\s\r\n]", "", _linetext)
														
 
															             _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
														
 
															-                     _textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
														
 
															+                     _textboxs[-1].bbox[2], _textboxs[-1].bbox[3])
														
 
															             _title = None
														
 
															             _pattern_groups = None
														
@@ -1192,7 +1825,7 @@ class ParseUtils:
 
															                     title_text = _groups[0][1]
														
 
															                     _pattern_groups = _groups
														
 
															             if not _title:
														
 
															-                _title = ParseUtils.rec_incenter(_bbox,page_bbox)
														
 
															+                _title = ParseUtils.rec_incenter(_bbox, page_bbox)
														
 
															             title_degree = 2
														
 
															             if not _title:
														
@@ -1202,7 +1835,7 @@ class ParseUtils:
 
															                 title_degree = int(_title.split("_")[1])
														
 
															             # 页码
														
 
															-            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$", _linetext) is not None:
														
 
															+            if ParseUtils.rec_incenter(_bbox, page_bbox) and re.search("^\d+$", _linetext) is not None:
														
 
															                 continue
														
 
															             if _linetext == "" or re.search("^，+$", _linetext) is not None:
														
@@ -1216,7 +1849,10 @@ class ParseUtils:
 
															                 _linetext = _search.group("text")
														
 
															                 outline_location = int(_search.group("nums"))
														
 
															-            list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))
														
 
															+            list_sentences.append(
														
 
															+                ParseSentence(_bbox, _textboxs[-1].__dict__.get("fontname"), _textboxs[-1].__dict__.get("fontsize"),
														
 
															+                              _linetext, _title, title_text, _pattern_groups, title_degree, is_outline,
														
 
															+                              outline_location, page_no))
														
 
															         # for _sen in list_sentences:
														
 
															         #     print(_sen.__dict__)
														
@@ -1224,133 +1860,136 @@ class ParseUtils:
 
															         return list_sentences
														
 
															     @staticmethod
														
 
															-    def find_title_by_pattern(_text,_pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
														
 
															-                                             "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
														
 
															-                                             "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
														
 
															-                                             "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\.．、\s\-]))|" \
														
 
															-                                             "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\.．、\s\-]))|" \
														
 
															-                                             "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\.．、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\.．、\s\-]))|" \
														
 
															-                                             "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\.．、\s\-]))|" \
														
 
															-                                             "(?P<title_15>^(?P<title_15_index_0_0>（?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>）))|" \
														
 
															-                                             "(?P<title_17>^(?P<title_17_index_0_0>（?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>）))|"
														
 
															-                                             "(?P<title_19>^(?P<title_19_index_0_0>（?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>）))|" \
														
 
															+    def find_title_by_pattern(_text,
														
 
															+                              _pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
														
 
															+                                       "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
														
 
															+                                       "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
														
 
															+                                       "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\.．、\s\-]))|" \
														
 
															+                                       "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\.．、\s\-]))|" \
														
 
															+                                       "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\.．、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\.．、\s\-]))|" \
														
 
															+                                       "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\.．、\s\-]))|" \
														
 
															+                                       "(?P<title_15>^(?P<title_15_index_0_0>（?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>）))|" \
														
 
															+                                       "(?P<title_17>^(?P<title_17_index_0_0>（?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>）))|"
														
 
															+                                       "(?P<title_19>^(?P<title_19_index_0_0>（?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>）))|" \
														
 
															                               ):
														
 
															-        _se = re.search(_pattern,_text)
														
 
															+        _se = re.search(_pattern, _text)
														
 
															         groups = []
														
 
															         if _se is not None:
														
 
															             _gd = _se.groupdict()
														
 
															-            for k,v in _gd.items():
														
 
															+            for k, v in _gd.items():
														
 
															                 if v is not None:
														
 
															-                    groups.append((k,v))
														
 
															+                    groups.append((k, v))
														
 
															         if len(groups):
														
 
															-            groups.sort(key=lambda x:x[0])
														
 
															+            groups.sort(key=lambda x: x[0])
														
 
															             return groups
														
 
															         return None
														
 
															     @staticmethod
														
 
															-    def rec_incenter(o_bbox,p_bbox):
														
 
															-        p_width = p_bbox[2]-p_bbox[0]
														
 
															-        l_space = (o_bbox[0]-p_bbox[0])/p_width
														
 
															-        r_space = (p_bbox[2]-o_bbox[2])/p_width
														
 
															+    def rec_incenter(o_bbox, p_bbox):
														
 
															+        p_width = p_bbox[2] - p_bbox[0]
														
 
															+        l_space = (o_bbox[0] - p_bbox[0]) / p_width
														
 
															+        r_space = (p_bbox[2] - o_bbox[2]) / p_width
														
 
															-        if abs((l_space-r_space))<0.1 and l_space>0.2:
														
 
															+        if abs((l_space - r_space)) < 0.1 and l_space > 0.2:
														
 
															             return "title_2"
														
 
															     @staticmethod
														
 
															     def is_first_title(_title):
														
 
															         if _title is None:
														
 
															             return False
														
 
															-        if re.search("^\d+$",_title) is not None:
														
 
															-            if int(_title)==1:
														
 
															+        if re.search("^\d+$", _title) is not None:
														
 
															+            if int(_title) == 1:
														
 
															                 return True
														
 
															             return False
														
 
															-        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
														
 
															-            if _title=="一":
														
 
															+        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
														
 
															+            if _title == "一":
														
 
															                 return True
														
 
															             return False
														
 
															-        if re.search("^[a-z]+$",_title) is not None:
														
 
															-            if _title=="a":
														
 
															+        if re.search("^[a-z]+$", _title) is not None:
														
 
															+            if _title == "a":
														
 
															                 return True
														
 
															             return False
														
 
															-        if re.search("^[A-Z]+$",_title) is not None:
														
 
															-            if _title=="A":
														
 
															+        if re.search("^[A-Z]+$", _title) is not None:
														
 
															+            if _title == "A":
														
 
															                 return True
														
 
															             return False
														
 
															-        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
														
 
															-            if _title=="Ⅰ":
														
 
															+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
														
 
															+            if _title == "Ⅰ":
														
 
															                 return True
														
 
															             return False
														
 
															         return False
														
 
															     @staticmethod
														
 
															     def get_next_title(_title):
														
 
															-        if re.search("^\d+$",_title) is not None:
														
 
															-            return str(int(_title)+1)
														
 
															-        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
														
 
															-            _next_title = ParseUtils.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
														
 
															+        if re.search("^\d+$", _title) is not None:
														
 
															+            return str(int(_title) + 1)
														
 
															+        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
														
 
															+            _next_title = ParseUtils.make_increase(['一', '二', '三', '四', '五', '六', '七', '八', '九', '十'],
														
 
															+                                                   re.sub("[十百]", '', _title))
														
 
															             _next_title = list(_next_title)
														
 
															             _next_title.reverse()
														
 
															-            if _next_title[-1]!="十":
														
 
															-                if len(_next_title)>=2:
														
 
															-                    _next_title.insert(-1,'十')
														
 
															-            if len(_next_title)>=4:
														
 
															-                _next_title.insert(-3,'百')
														
 
															-            if _title[0]=="十":
														
 
															-                if _next_title=="十":
														
 
															-                    _next_title = ["二","十"]
														
 
															-                _next_title.insert(0,"十")
														
 
															+            if _next_title[-1] != "十":
														
 
															+                if len(_next_title) >= 2:
														
 
															+                    _next_title.insert(-1, '十')
														
 
															+            if len(_next_title) >= 4:
														
 
															+                _next_title.insert(-3, '百')
														
 
															+            if _title[0] == "十":
														
 
															+                if _next_title == "十":
														
 
															+                    _next_title = ["二", "十"]
														
 
															+                _next_title.insert(0, "十")
														
 
															             _next_title = "".join(_next_title)
														
 
															             return _next_title
														
 
															-        if re.search("^[a-z]+$",_title) is not None:
														
 
															-            _next_title = ParseUtils.make_increase([chr(i+ord('a')) for i in range(26)],_title)
														
 
															+        if re.search("^[a-z]+$", _title) is not None:
														
 
															+            _next_title = ParseUtils.make_increase([chr(i + ord('a')) for i in range(26)], _title)
														
 
															             _next_title = list(_next_title)
														
 
															             _next_title.reverse()
														
 
															             return "".join(_next_title)
														
 
															-        if re.search("^[A-Z]+$",_title) is not None:
														
 
															-            _next_title = ParseUtils.make_increase([chr(i+ord('A')) for i in range(26)],_title)
														
 
															+        if re.search("^[A-Z]+$", _title) is not None:
														
 
															+            _next_title = ParseUtils.make_increase([chr(i + ord('A')) for i in range(26)], _title)
														
 
															             _next_title = list(_next_title)
														
 
															             _next_title.reverse()
														
 
															             return "".join(_next_title)
														
 
															-        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
														
 
															-            _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
														
 
															+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
														
 
															+            _sort = ["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ"]
														
 
															             _index = _sort.index(_title)
														
 
															-            if _index<len(_sort)-1:
														
 
															-                return _sort[_index+1]
														
 
															+            if _index < len(_sort) - 1:
														
 
															+                return _sort[_index + 1]
														
 
															             return None
														
 
															-
														
 
															     @staticmethod
														
 
															-    def make_increase(_sort,_title,_add=1):
														
 
															-        if len(_title)==0 and _add==0:
														
 
															+    def make_increase(_sort, _title, _add=1):
														
 
															+        if len(_title) == 0 and _add == 0:
														
 
															             return ""
														
 
															-        if len(_title)==0 and _add==1:
														
 
															+        if len(_title) == 0 and _add == 1:
														
 
															             return _sort[0]
														
 
															         _index = _sort.index(_title[-1])
														
 
															-        next_index = (_index+_add)%len(_sort)
														
 
															+        next_index = (_index + _add) % len(_sort)
														
 
															         next_chr = _sort[next_index]
														
 
															-        if _index==len(_sort)-1:
														
 
															+        if _index == len(_sort) - 1:
														
 
															             _add = 1
														
 
															         else:
														
 
															             _add = 0
														
 
															-        return next_chr+ParseUtils.make_increase(_sort,_title[:-1],_add)
														
 
															-
														
 
															-
														
 
															-
														
 
															+        return next_chr + ParseUtils.make_increase(_sort, _title[:-1], _add)
														
 
															     @staticmethod
														
 
															-    def rec_serial(_text,o_bbox,p_bbox,fontname,_pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
														
 
															-                                                         "(?P<title_2>^\d+[\.、\s])|" \
														
 
															-                                                         "(?P<title_3>^\d+\.\d+[\.、\s])|" \
														
 
															-                                                         "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
														
 
															-                                                         "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
														
 
															-        #todo :recog the serial of the sentence
														
 
															-
														
 
															-
														
 
															-
														
 
															-        _se = re.search(_pattern,_text)
														
 
															+    def rec_serial(_text, o_bbox, p_bbox, fontname, _pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
														
 
															+                                                             "(?P<title_2>^\d+[\.、\s])|" \
														
 
															+                                                             "(?P<title_3>^\d+\.\d+[\.、\s])|" \
														
 
															+                                                             "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
														
 
															+                                                             "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
														
 
															+        # todo :recog the serial of the sentence
														
 
															+
														
 
															+        _se = re.search(_pattern, _text)
														
 
															         if _se is not None:
														
 
															             _gd = _se.groupdict()
														
 
															-            for k,v in _gd.items():
														
 
															+            for k, v in _gd.items():
														
 
															                 if v is not None:
														
 
															                     return k
														
 
															         return None
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    # get_text_font()
														
 
															+    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
														
 
															+
														
 
															+    # print(b'\x10')
														
--- a/format_convert/convert_test.py
+++ b/format_convert/convert_test.py
@@ -6,6 +6,9 @@ import sys
 
															 import time
														
 
															 from glob import glob
														
 
															 from multiprocessing import Process
														
 
															+
														
 
															+from bs4 import BeautifulSoup
														
 
															+
														
 
															 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
														
 
															 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
														
 
															 from format_convert.convert import to_html
														
@@ -21,10 +24,10 @@ def test_one(p, from_remote=False):
 
															     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
														
 
															     if from_remote:
														
 
															-        # _url = 'http://121.46.18.113:15010/convert'
														
 
															+        _url = 'http://121.46.18.113:15010/convert'
														
 
															         # _url = 'http://192.168.2.103:15010/convert'
														
 
															         # _url = 'http://172.16.160.65:15010/convert'
														
 
															-        _url = 'http://127.0.0.1:15010/convert'
														
 
															+        # _url = 'http://127.0.0.1:15010/convert'
														
 
															         result = json.loads(request_post(_url, data, time_out=10000))
														
 
															         text_str = ""
														
 
															         for t in result.get("result_html"):
														
@@ -58,9 +61,10 @@ if __name__ == '__main__':
 
															         # file_path = "C:/Users/Administrator/Desktop/test_xls/merge_cell.xlsx"
														
 
															         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
														
 
															         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
														
 
															-        # file_path = "C:/Users/Administrator/Downloads/神仙居旅游汽车租赁竞争性磋商文件(1).doc"
														
 
															+        # file_path = "C:/Users/Administrator/Downloads/QQ图片20230616105216.jpg"
														
 
															         # file_path = "C:/Users/Administrator/Desktop/test_xls/error2.xlsx"
														
 
															-        file_path = "C:/Users/Administrator/Desktop/test_doc/error5.docx"
														
 
															+        # file_path = "C:/Users/Administrator/Desktop/test_image/error9-2.png"
														
 
															+        file_path = "C:/Users/Administrator/Desktop/test_pdf/直接读表格线error/error51.pdf"
														
 
															     else:
														
 
															         file_path = "1660296734009.pdf"
														
 
															     test_one(file_path, from_remote=True)
														
@@ -87,4 +91,9 @@ if __name__ == '__main__':
 
															     #     p_list.append(p)
														
 
															     # for p in p_list:
														
 
															     #     p.join()
														
 
															-    # print("finish", time.time() - start_time)
														
 
															+    # print("finish", time.time() - start_time)
														
 
															+
														
 
															+    # with open(file_path, 'r') as f:
														
 
															+    #     t = f.read()
														
 
															+    # soup = BeautifulSoup(t, 'lxml')
														
 
															+    # print(soup.text)
														
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -20,11 +20,15 @@ class _Document:
 
															         else:
														
 
															             self.error_code = child.error_code
														
 
															-    def get_html(self):
														
 
															+    def get_html(self, return_list=False):
														
 
															         if self.error_code is not None:
														
 
															             return self.error_code
														
 
															-        html_text = ""
														
 
															+        if return_list:
														
 
															+            html_text = []
														
 
															+        else:
														
 
															+            html_text = ""
														
 
															+
														
 
															         for child in self.children:
														
 
															             # 先调用get_html才能更新error_code
														
 
															             child_html_text = child.get_html()
														
@@ -32,8 +36,13 @@ class _Document:
 
															                 self.error_code = child.error_code
														
 
															                 return self.error_code
														
 
															             else:
														
 
															-                html_text += child_html_text
														
 
															-        return [html_text]
														
 
															+                if return_list:
														
 
															+                    html_text += [child_html_text]
														
 
															+                else:
														
 
															+                    html_text += child_html_text
														
 
															+        if not return_list:
														
 
															+            html_text = [html_text]
														
 
															+        return html_text
														
 
															 class _Page:
														
--- a/format_convert/convert_xls.py
+++ b/format_convert/convert_xls.py
@@ -1,8 +1,9 @@
 
															 import inspect
														
 
															 import os
														
 
															 import sys
														
 
															+from bs4 import BeautifulSoup
														
 
															 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
														
 
															-from format_convert.convert_tree import _Document
														
 
															+from format_convert.convert_tree import _Document, _Page, _Sentence
														
 
															 import logging
														
 
															 import traceback
														
 
															 from format_convert import get_memory_info
														
@@ -38,14 +39,31 @@ class XlsConvert:
 
															         self.unique_type_dir = unique_type_dir
														
 
															     def convert(self):
														
 
															-        # 调用office格式转换
														
 
															-        file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
														
 
															-        if judge_error_code(file_path):
														
 
															-            self._doc.error_code = file_path
														
 
															-            return
														
 
															-        _xlsx = XlsxConvert(file_path, self.unique_type_dir)
														
 
															-        _xlsx.convert()
														
 
															-        self._doc = _xlsx._doc
														
 
															+        # 先判断特殊xls文件，可能是html文本
														
 
															+        is_html_xls = False
														
 
															+        try:
														
 
															+            with open(self.path, 'r') as f:
														
 
															+                html_str = f.read()
														
 
															+            soup = BeautifulSoup(html_str, 'lxml')
														
 
															+            text = soup.text
														
 
															+            is_html_xls = True
														
 
															+        except:
														
 
															+            pass
														
 
															+
														
 
															+        if is_html_xls:
														
 
															+            self._page = _Page(None, 0)
														
 
															+            _sen = _Sentence(text, (0, 0, 0, 0))
														
 
															+            self._page.add_child(_sen)
														
 
															+            self._doc.add_child(self._page)
														
 
															+        else:
														
 
															+            # 调用office格式转换
														
 
															+            file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
														
 
															+            if judge_error_code(file_path):
														
 
															+                self._doc.error_code = file_path
														
 
															+                return
														
 
															+            _xlsx = XlsxConvert(file_path, self.unique_type_dir)
														
 
															+            _xlsx.convert()
														
 
															+            self._doc = _xlsx._doc
														
 
															     def get_html(self):
														
 
															         try:
														
@@ -53,8 +71,13 @@ class XlsConvert:
 
															         except:
														
 
															             traceback.print_exc()
														
 
															             self._doc.error_code = [-1]
														
 
															-        print("xls ", self._doc)
														
 
															+        # print("xls ", self._doc)
														
 
															         if self._doc.error_code is not None:
														
 
															             return self._doc.error_code
														
 
															-        print(self._doc.children)
														
 
															-        return self._doc.get_html()
														
 
															+        # print(self._doc.children)
														
 
															+        return self._doc.get_html()
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    c = XlsConvert("C:/Users/Administrator/Downloads/1683641686556.xls", "C:/Users/Administrator/Downloads/1")
														
 
															+    print(c.get_html())
														
--- a/format_convert/convert_zip.py
+++ b/format_convert/convert_zip.py
@@ -1,11 +1,13 @@
 
															 import inspect
														
 
															 import os
														
 
															 import sys
														
 
															+import uuid
														
 
															+
														
 
															 sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															 from format_convert.convert_tree import _Document, _Page, _Sentence
														
 
															 import logging
														
 
															 import traceback
														
 
															-import zipfile
														
 
															+import my_zipfile as zipfile
														
 
															 from format_convert import get_memory_info
														
 
															 from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, get_logger, log, \
														
 
															     memory_decorator
														
@@ -126,14 +128,19 @@ class ZipConvert:
 
															                 # 中文乱码，会导致zip解压失败，直接修改对象
														
 
															                 try:
														
 
															                     new_f = f.encode('cp437').decode('gbk')
														
 
															+                    # print('1', new_f)
														
 
															                 except:
														
 
															                     new_f = f.encode('utf-8').decode('utf-8')
														
 
															+                    # print('2', new_f)
														
 
															                 if f != new_f:
														
 
															+                    new_f = str(uuid.uuid1().hex) + '.' + f.split('.')[-1]
														
 
															                     zip_file.NameToInfo[new_f] = zip_file.NameToInfo[f]
														
 
															                     zip_file.NameToInfo[new_f].filename = new_f
														
 
															                     zip_file.NameToInfo.pop(f)
														
 
															+                    zip_file.NameToInfo[new_f].orig_filename = new_f
														
 
															+                    # zip_file.NameToInfo[new_f].flag_bits = 2048
														
 
															+                    zip_file.NameToInfo[new_f].has_changed_name = True
														
 
															                 new_zip_list.append(new_f)
														
 
															-
														
 
															             new_zip_list.sort(key=lambda x: len(x))
														
 
															             for f in new_zip_list:
														
 
															                 file_list.append(zip_file.extract(f, path=self.zip_path))
														
@@ -198,4 +205,9 @@ class ZipConvert:
 
															             self._doc.error_code = [-1]
														
 
															         if self._doc.error_code is not None:
														
 
															             return self._doc.error_code
														
 
															-        return self._doc.get_html()
														
 
															+        return self._doc.get_html()
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    c = ZipConvert("C:/Users/Administrator/Downloads/3775865878373065499.zip", "C:/Users/Administrator/Downloads/1")
														
 
															+    c.get_html()
														
--- a/format_convert/interface.yml
+++ b/format_convert/interface.yml
@@ -5,7 +5,7 @@ MASTER:
 
															 #  local-102: 'http://192.168.2.102'
														
 
															 #  local-103: 'http://192.168.2.103'
														
 
															 #  local 'http://127.0.0.1'
														
 
															-  ip: ['http://192.168.0.115']
														
 
															+  ip: ['http://127.0.0.1']
														
 
															   PATH:
														
 
															     python: ['/data/anaconda3/envs/convert3/bin/python']
														
--- a/format_convert/kill_all.sh
+++ b/format_convert/kill_all.sh
@@ -0,0 +1,6 @@
 
															+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
														
 
															+kill -9 $(lsof -i:17000|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
														
 
															+kill -9 $(lsof -i:18000|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
														
 
															+kill -9 $(lsof -i:18020|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
														
 
															+kill -9 $(lsof -i:18040|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
														
 
															+kill -9 $(lsof -i:18060|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
														
--- a/format_convert/kill_main.sh
+++ b/format_convert/kill_main.sh
@@ -0,0 +1 @@
 
															+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
														
--- a/format_convert/test_walk.py
+++ b/format_convert/test_walk.py
@@ -1,17 +1,107 @@
 
															+import copy
														
 
															 import os
														
 
															-file_list = []
														
 
															-for root, dirs, files in os.walk("./", topdown=False):
														
 
															-    for name in dirs:
														
 
															-        file_list.append(os.path.join(root, name) + os.sep)
														
 
															-    for name in files:
														
 
															-        file_list.append(os.path.join(root, name))
														
 
															-print(file_list)
														
 
															+import random
														
 
															+import re
														
 
															+import sys
														
 
															+import time
														
 
															+from bs4 import BeautifulSoup
														
 
															+from datetime import datetime
														
 
															+from multiprocessing import Process
														
 
															+import datetime as dt
														
 
															+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
														
 
															+from format_convert.utils import file_lock
														
 
															-s = set()
														
 
															-s.update("1231asdb我深大")
														
 
															-s.update("g6712")
														
 
															+def run():
														
 
															+
														
 
															+    f = file_lock(os.path.abspath(os.path.dirname(__file__)) + '/19022.lock')
														
 
															+    print("acquire file_lock! process " + str(os.getpid()))
														
 
															+    for i in range(10):
														
 
															+        print("process " + str(os.getpid()) + " " + str(i))
														
 
															+        time.sleep(random.randint(0, 1))
														
 
															+    f.close()
														
 
															+
														
 
															+
														
 
															+def merge_table():
														
 
															+    with open(r'C:\Users\Administrator\Desktop\2.html', 'r') as f:
														
 
															+        html_str = f.read()
														
 
															+    html_str_origin = copy.deepcopy(html_str)
														
 
															+
														
 
															+    try:
														
 
															+        match1 = re.finditer('<table', html_str)
														
 
															+        match2 = re.finditer('</table>', html_str)
														
 
															+        table_index_list = []
														
 
															+        for m1, m2 in zip(match1, match2):
														
 
															+            table_index_list.append([m1.span()[0], m1.span()[1], m2.span()[0], m2.span()[1]])
														
 
															+        print(table_index_list)
														
 
															+
														
 
															+        soup = BeautifulSoup(html_str)
														
 
															+        tables = soup.find_all('table')
														
 
															+        table_td_cnt_list = []
														
 
															+        for table in tables:
														
 
															+            tds = table.tr.find_all('td')
														
 
															+            table_td_cnt_list.append(len(list(tds)))
														
 
															+        print(table_td_cnt_list)
														
 
															+
														
 
															+        if len(table_index_list) == len(table_td_cnt_list):
														
 
															+            merge_index_list = []
														
 
															+            temp_index = []
														
 
															+            for i in range(1, len(table_index_list)):
														
 
															+                last_index = table_index_list[i-1]
														
 
															+                index = table_index_list[i]
														
 
															+                last_tds = table_td_cnt_list[i-1]
														
 
															+                tds = table_td_cnt_list[i]
														
 
															+                if index[0] - last_index[-1] == 0 and last_tds == tds:
														
 
															+                    temp_index += [i-1, i]
														
 
															+                    temp_index = list(set(temp_index))
														
 
															+                else:
														
 
															+                    if temp_index:
														
 
															+                        merge_index_list.append(temp_index)
														
 
															+                    temp_index = []
														
 
															+            if temp_index:
														
 
															+                merge_index_list.append(temp_index)
														
 
															+            print(merge_index_list)
														
 
															+
														
 
															+            print('before len(html_str)', len(html_str))
														
 
															+            for merge in merge_index_list:
														
 
															+                start_index = table_index_list[merge[0]][0]
														
 
															+                end_index = table_index_list[merge[-1]][-1]
														
 
															+                table_replace = re.sub('<table[^>]*>|</table>', '', html_str[start_index:end_index])
														
 
															+                table_replace = '<table border="1">' + table_replace + '</table>'
														
 
															+                table_replace += ' '*(end_index-start_index-len(table_replace))
														
 
															+                html_str = html_str[:start_index] + table_replace + html_str[end_index:]
														
 
															+            print('after len(html_str)', len(html_str))
														
 
															+
														
 
															+            if len(html_str_origin) == len(html_str):
														
 
															+                with open(r'C:\Users\Administrator\Desktop\3.html', 'w') as f:
														
 
															+                    f.write(html_str)
														
 
															+                return html_str
														
 
															+            else:
														
 
															+                return html_str_origin
														
 
															+        else:
														
 
															+            return html_str_origin
														
 
															+    except:
														
 
															+        return html_str_origin
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    # process_list = []
														
 
															+    # for j in range(10):
														
 
															+    #     p1 = Process(target=run,)
														
 
															+    #     p1.start()
														
 
															+    #     process_list.append(p1)
														
 
															+    #
														
 
															+    # for p in process_list:
														
 
															+    #     p.join()
														
 
															+
														
 
															+    print('|'.join(['a', 'n']))
														
 
															+    _t = datetime.strptime('2023-04-26', '%Y-%m-%d')
														
 
															+    _t2 = datetime.strptime('2023-04-02', '%Y-%m-%d')
														
 
															+    print(abs((_t2-_t).days))
														
 
															+    print(datetime.strftime(_t + dt.timedelta(days=10), '%Y-%m-%d'))
														
 
															+
														
 
															+    # merge_table()
														
 
															+
														
 
															+    print(datetime.now())
														
 
															-print(len(s))
														
 
															-print(len("".join(["sdas", "我是觉得", "111"])))
														
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
--- a/isr/pre_process.py
+++ b/isr/pre_process.py
@@ -19,11 +19,11 @@ def count_red_pixel(image_np, cnt=1000):
 
															     labels = measure.label(red_mask, connectivity=2)  # 8连通区域标记
														
 
															     regions = measure.regionprops(labels)
														
 
															     red_cnt = np.sum(red_mask != 0)
														
 
															-    print("red_cnt regions", len(regions),red_cnt, time.time()-start_time)
														
 
															+    # print("red_cnt regions", len(regions),red_cnt, time.time()-start_time)
														
 
															     if regions and len(regions)>0:
														
 
															         _max_area = max([r.bbox_area for r in regions])
														
 
															         if _max_area>100:
														
 
															-            print("red_cnt max_area", _max_area, time.time()-start_time)
														
 
															+            # print("red_cnt max_area", _max_area, time.time()-start_time)
														
 
															             return True
														
 
															     return False
														
--- a/my_zipfile.py
+++ b/my_zipfile.py
@@ -0,0 +1,2183 @@
 
															+"""
														
 
															+Read and write ZIP files.
														
 
															+
														
 
															+XXX references to utf-8 need further investigation.
														
 
															+"""
														
 
															+import io
														
 
															+import os
														
 
															+import importlib.util
														
 
															+import sys
														
 
															+import time
														
 
															+import stat
														
 
															+import shutil
														
 
															+import struct
														
 
															+import binascii
														
 
															+import threading
														
 
															+
														
 
															+try:
														
 
															+    import zlib # We may need its compression method
														
 
															+    crc32 = zlib.crc32
														
 
															+except ImportError:
														
 
															+    zlib = None
														
 
															+    crc32 = binascii.crc32
														
 
															+
														
 
															+try:
														
 
															+    import bz2 # We may need its compression method
														
 
															+except ImportError:
														
 
															+    bz2 = None
														
 
															+
														
 
															+try:
														
 
															+    import lzma # We may need its compression method
														
 
															+except ImportError:
														
 
															+    lzma = None
														
 
															+
														
 
															+__all__ = ["BadZipFile", "BadZipfile", "error",
														
 
															+           "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA",
														
 
															+           "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"]
														
 
															+
														
 
															+class BadZipFile(Exception):
														
 
															+    pass
														
 
															+
														
 
															+
														
 
															+class LargeZipFile(Exception):
														
 
															+    """
														
 
															+    Raised when writing a zipfile, the zipfile requires ZIP64 extensions
														
 
															+    and those extensions are disabled.
														
 
															+    """
														
 
															+
														
 
															+error = BadZipfile = BadZipFile      # Pre-3.2 compatibility names
														
 
															+
														
 
															+
														
 
															+ZIP64_LIMIT = (1 << 31) - 1
														
 
															+ZIP_FILECOUNT_LIMIT = (1 << 16) - 1
														
 
															+ZIP_MAX_COMMENT = (1 << 16) - 1
														
 
															+
														
 
															+# constants for Zip file compression methods
														
 
															+ZIP_STORED = 0
														
 
															+ZIP_DEFLATED = 8
														
 
															+ZIP_BZIP2 = 12
														
 
															+ZIP_LZMA = 14
														
 
															+# Other ZIP compression methods not supported
														
 
															+
														
 
															+DEFAULT_VERSION = 20
														
 
															+ZIP64_VERSION = 45
														
 
															+BZIP2_VERSION = 46
														
 
															+LZMA_VERSION = 63
														
 
															+# we recognize (but not necessarily support) all features up to that version
														
 
															+MAX_EXTRACT_VERSION = 63
														
 
															+
														
 
															+# Below are some formats and associated data for reading/writing headers using
														
 
															+# the struct module.  The names and structures of headers/records are those used
														
 
															+# in the PKWARE description of the ZIP file format:
														
 
															+#     http://www.pkware.com/documents/casestudies/APPNOTE.TXT
														
 
															+# (URL valid as of January 2008)
														
 
															+
														
 
															+# The "end of central directory" structure, magic number, size, and indices
														
 
															+# (section V.I in the format document)
														
 
															+structEndArchive = b"<4s4H2LH"
														
 
															+stringEndArchive = b"PK\005\006"
														
 
															+sizeEndCentDir = struct.calcsize(structEndArchive)
														
 
															+
														
 
															+_ECD_SIGNATURE = 0
														
 
															+_ECD_DISK_NUMBER = 1
														
 
															+_ECD_DISK_START = 2
														
 
															+_ECD_ENTRIES_THIS_DISK = 3
														
 
															+_ECD_ENTRIES_TOTAL = 4
														
 
															+_ECD_SIZE = 5
														
 
															+_ECD_OFFSET = 6
														
 
															+_ECD_COMMENT_SIZE = 7
														
 
															+# These last two indices are not part of the structure as defined in the
														
 
															+# spec, but they are used internally by this module as a convenience
														
 
															+_ECD_COMMENT = 8
														
 
															+_ECD_LOCATION = 9
														
 
															+
														
 
															+# The "central directory" structure, magic number, size, and indices
														
 
															+# of entries in the structure (section V.F in the format document)
														
 
															+structCentralDir = "<4s4B4HL2L5H2L"
														
 
															+stringCentralDir = b"PK\001\002"
														
 
															+sizeCentralDir = struct.calcsize(structCentralDir)
														
 
															+
														
 
															+# indexes of entries in the central directory structure
														
 
															+_CD_SIGNATURE = 0
														
 
															+_CD_CREATE_VERSION = 1
														
 
															+_CD_CREATE_SYSTEM = 2
														
 
															+_CD_EXTRACT_VERSION = 3
														
 
															+_CD_EXTRACT_SYSTEM = 4
														
 
															+_CD_FLAG_BITS = 5
														
 
# Remaining "_CD_*" field positions. The first entries of this series and the
# struct format they index are defined earlier in the file.
# NOTE(review): presumably these index the unpacked central-directory file
# header -- confirm against the struct definition above this point.
_CD_COMPRESS_TYPE = 6
_CD_TIME = 7
_CD_DATE = 8
_CD_CRC = 9
_CD_COMPRESSED_SIZE = 10
_CD_UNCOMPRESSED_SIZE = 11
_CD_FILENAME_LENGTH = 12
_CD_EXTRA_FIELD_LENGTH = 13
_CD_COMMENT_LENGTH = 14
_CD_DISK_NUMBER_START = 15
_CD_INTERNAL_FILE_ATTRIBUTES = 16
_CD_EXTERNAL_FILE_ATTRIBUTES = 17
_CD_LOCAL_HEADER_OFFSET = 18

# The "local file header" structure, magic number, size, and indices
# (section V.A in the format document)
# The format string describes 12 little-endian fields; the _FH_* constants
# below name the position of each one in the unpacked tuple.
structFileHeader = "<4s2B4HL2L2H"
stringFileHeader = b"PK\003\004"
sizeFileHeader = struct.calcsize(structFileHeader)

_FH_SIGNATURE = 0
_FH_EXTRACT_VERSION = 1
_FH_EXTRACT_SYSTEM = 2
_FH_GENERAL_PURPOSE_FLAG_BITS = 3
_FH_COMPRESSION_METHOD = 4
_FH_LAST_MOD_TIME = 5
_FH_LAST_MOD_DATE = 6
_FH_CRC = 7
_FH_COMPRESSED_SIZE = 8
_FH_UNCOMPRESSED_SIZE = 9
_FH_FILENAME_LENGTH = 10
_FH_EXTRA_FIELD_LENGTH = 11

# The "Zip64 end of central directory locator" structure, magic number, and size
# (four little-endian fields: signature plus three integers)
structEndArchive64Locator = "<4sLQL"
stringEndArchive64Locator = b"PK\x06\x07"
sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator)

# The "Zip64 end of central directory" record, magic number, size, and indices
# (section V.G in the format document)
# The format string describes 10 little-endian fields; the _CD64_* constants
# below name the position of each one in the unpacked tuple.
structEndArchive64 = "<4sQ2H2L4Q"
stringEndArchive64 = b"PK\x06\x06"
sizeEndCentDir64 = struct.calcsize(structEndArchive64)

_CD64_SIGNATURE = 0
_CD64_DIRECTORY_RECSIZE = 1
_CD64_CREATE_VERSION = 2
_CD64_EXTRACT_VERSION = 3
_CD64_DISK_NUMBER = 4
_CD64_DISK_NUMBER_START = 5
_CD64_NUMBER_ENTRIES_THIS_DISK = 6
_CD64_NUMBER_ENTRIES_TOTAL = 7
_CD64_DIRECTORY_SIZE = 8
_CD64_OFFSET_START_CENTDIR = 9

# Packed little-endian this value is the byte string b"PK\x07\x08".
# NOTE(review): presumably the data-descriptor signature (judging by the
# name); its users are outside this excerpt -- confirm.
_DD_SIGNATURE = 0x08074b50
														
 
															+
														
 
															+_EXTRA_FIELD_STRUCT = struct.Struct('<HH')
														
 
															+
														
 
															+def _strip_extra(extra, xids):
														
 
															+    # Remove Extra Fields with specified IDs.
														
 
															+    unpack = _EXTRA_FIELD_STRUCT.unpack
														
 
															+    modified = False
														
 
															+    buffer = []
														
 
															+    start = i = 0
														
 
															+    while i + 4 <= len(extra):
														
 
															+        xid, xlen = unpack(extra[i : i + 4])
														
 
															+        j = i + 4 + xlen
														
 
															+        if xid in xids:
														
 
															+            if i != start:
														
 
															+                buffer.append(extra[start : i])
														
 
															+            start = j
														
 
															+            modified = True
														
 
															+        i = j
														
 
															+    if not modified:
														
 
															+        return extra
														
 
															+    return b''.join(buffer)
														
 
															+
														
 
															+def _check_zipfile(fp):
														
 
															+    try:
														
 
															+        if _EndRecData(fp):
														
 
															+            return True         # file has correct magic number
														
 
															+    except OSError:
														
 
															+        pass
														
 
															+    return False
														
 
															+
														
 
															+def is_zipfile(filename):
														
 
															+    """Quickly see if a file is a ZIP file by checking the magic number.
														
 
															+
														
 
															+    The filename argument may be a file or file-like object too.
														
 
															+    """
														
 
															+    result = False
														
 
															+    try:
														
 
															+        if hasattr(filename, "read"):
														
 
															+            result = _check_zipfile(fp=filename)
														
 
															+        else:
														
 
															+            with open(filename, "rb") as fp:
														
 
															+                result = _check_zipfile(fp)
														
 
															+    except OSError:
														
 
															+        pass
														
 
															+    return result
														
 
															+
														
 
															+def _EndRecData64(fpin, offset, endrec):
														
 
															+    """
														
 
															+    Read the ZIP64 end-of-archive records and use that to update endrec
														
 
															+    """
														
 
															+    try:
														
 
															+        fpin.seek(offset - sizeEndCentDir64Locator, 2)
														
 
															+    except OSError:
														
 
															+        # If the seek fails, the file is not large enough to contain a ZIP64
														
 
															+        # end-of-archive record, so just return the end record we were given.
														
 
															+        return endrec
														
 
															+
														
 
															+    data = fpin.read(sizeEndCentDir64Locator)
														
 
															+    if len(data) != sizeEndCentDir64Locator:
														
 
															+        return endrec
														
 
															+    sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
														
 
															+    if sig != stringEndArchive64Locator:
														
 
															+        return endrec
														
 
															+
														
 
															+    if diskno != 0 or disks > 1:
														
 
															+        raise BadZipFile("zipfiles that span multiple disks are not supported")
														
 
															+
														
 
															+    # Assume no 'zip64 extensible data'
														
 
															+    fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2)
														
 
															+    data = fpin.read(sizeEndCentDir64)
														
 
															+    if len(data) != sizeEndCentDir64:
														
 
															+        return endrec
														
 
															+    sig, sz, create_version, read_version, disk_num, disk_dir, \
														
 
															+        dircount, dircount2, dirsize, diroffset = \
														
 
															+        struct.unpack(structEndArchive64, data)
														
 
															+    if sig != stringEndArchive64:
														
 
															+        return endrec
														
 
															+
														
 
															+    # Update the original endrec using data from the ZIP64 record
														
 
															+    endrec[_ECD_SIGNATURE] = sig
														
 
															+    endrec[_ECD_DISK_NUMBER] = disk_num
														
 
															+    endrec[_ECD_DISK_START] = disk_dir
														
 
															+    endrec[_ECD_ENTRIES_THIS_DISK] = dircount
														
 
															+    endrec[_ECD_ENTRIES_TOTAL] = dircount2
														
 
															+    endrec[_ECD_SIZE] = dirsize
														
 
															+    endrec[_ECD_OFFSET] = diroffset
														
 
															+    return endrec
														
 
															+
														
 
															+
														
 
															+def _EndRecData(fpin):
														
 
															+    """Return data from the "End of Central Directory" record, or None.
														
 
															+
														
 
															+    The data is a list of the nine items in the ZIP "End of central dir"
														
 
															+    record followed by a tenth item, the file seek offset of this record."""
														
 
															+
														
 
															+    # Determine file size
														
 
															+    fpin.seek(0, 2)
														
 
															+    filesize = fpin.tell()
														
 
															+
														
 
															+    # Check to see if this is ZIP file with no archive comment (the
														
 
															+    # "end of central directory" structure should be the last item in the
														
 
															+    # file if this is the case).
														
 
															+    try:
														
 
															+        fpin.seek(-sizeEndCentDir, 2)
														
 
															+    except OSError:
														
 
															+        return None
														
 
															+    data = fpin.read()
														
 
															+    if (len(data) == sizeEndCentDir and
														
 
															+        data[0:4] == stringEndArchive and
														
 
															+        data[-2:] == b"\000\000"):
														
 
															+        # the signature is correct and there's no comment, unpack structure
														
 
															+        endrec = struct.unpack(structEndArchive, data)
														
 
															+        endrec=list(endrec)
														
 
															+
														
 
															+        # Append a blank comment and record start offset
														
 
															+        endrec.append(b"")
														
 
															+        endrec.append(filesize - sizeEndCentDir)
														
 
															+
														
 
															+        # Try to read the "Zip64 end of central directory" structure
														
 
															+        return _EndRecData64(fpin, -sizeEndCentDir, endrec)
														
 
															+
														
 
															+    # Either this is not a ZIP file, or it is a ZIP file with an archive
														
 
															+    # comment.  Search the end of the file for the "end of central directory"
														
 
															+    # record signature. The comment is the last item in the ZIP file and may be
														
 
															+    # up to 64K long.  It is assumed that the "end of central directory" magic
														
 
															+    # number does not appear in the comment.
														
 
															+    maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0)
														
 
															+    fpin.seek(maxCommentStart, 0)
														
 
															+    data = fpin.read()
														
 
															+    start = data.rfind(stringEndArchive)
														
 
															+    if start >= 0:
														
 
															+        # found the magic number; attempt to unpack and interpret
														
 
															+        recData = data[start:start+sizeEndCentDir]
														
 
															+        if len(recData) != sizeEndCentDir:
														
 
															+            # Zip file is corrupted.
														
 
															+            return None
														
 
															+        endrec = list(struct.unpack(structEndArchive, recData))
														
 
															+        commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file
														
 
															+        comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize]
														
 
															+        endrec.append(comment)
														
 
															+        endrec.append(maxCommentStart + start)
														
 
															+
														
 
															+        # Try to read the "Zip64 end of central directory" structure
														
 
															+        return _EndRecData64(fpin, maxCommentStart + start - filesize,
														
 
															+                             endrec)
														
 
															+
														
 
															+    # Unable to find a valid end of central directory structure
														
 
															+    return None
														
 
															+
														
 
															+
														
 
															+class ZipInfo (object):
														
 
															+    """Class with attributes describing each file in the ZIP archive."""
														
 
															+
														
 
															+    __slots__ = (
														
 
															+        'orig_filename',
														
 
															+        'filename',
														
 
															+        'date_time',
														
 
															+        'compress_type',
														
 
															+        '_compresslevel',
														
 
															+        'comment',
														
 
															+        'extra',
														
 
															+        'create_system',
														
 
															+        'create_version',
														
 
															+        'extract_version',
														
 
															+        'reserved',
														
 
															+        'flag_bits',
														
 
															+        'volume',
														
 
															+        'internal_attr',
														
 
															+        'external_attr',
														
 
															+        'header_offset',
														
 
															+        'CRC',
														
 
															+        'compress_size',
														
 
															+        'file_size',
														
 
															+        '_raw_time',
														
 
															+        'has_changed_name',
														
 
															+    )
														
 
															+
														
 
															+    def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0), has_changed_name=False):
														
 
															+        self.orig_filename = filename   # Original file name in archive
														
 
															+
														
 
															+        # Terminate the file name at the first null byte.  Null bytes in file
														
 
															+        # names are used as tricks by viruses in archives.
														
 
															+        null_byte = filename.find(chr(0))
														
 
															+        if null_byte >= 0:
														
 
															+            filename = filename[0:null_byte]
														
 
															+        # This is used to ensure paths in generated ZIP files always use
														
 
															+        # forward slashes as the directory separator, as required by the
														
 
															+        # ZIP format specification.
														
 
															+        if os.sep != "/" and os.sep in filename:
														
 
															+            filename = filename.replace(os.sep, "/")
														
 
															+
														
 
															+        self.filename = filename        # Normalized file name
														
 
															+        self.date_time = date_time      # year, month, day, hour, min, sec
														
 
															+
														
 
															+        if date_time[0] < 1980:
														
 
															+            raise ValueError('ZIP does not support timestamps before 1980')
														
 
															+
														
 
															+        # Standard values:
														
 
															+        self.compress_type = ZIP_STORED # Type of compression for the file
														
 
															+        self._compresslevel = None      # Level for the compressor
														
 
															+        self.comment = b""              # Comment for each file
														
 
															+        self.extra = b""                # ZIP extra data
														
 
															+        if sys.platform == 'win32':
														
 
															+            self.create_system = 0          # System which created ZIP archive
														
 
															+        else:
														
 
															+            # Assume everything else is unix-y
														
 
															+            self.create_system = 3          # System which created ZIP archive
														
 
															+        self.create_version = DEFAULT_VERSION  # Version which created ZIP archive
														
 
															+        self.extract_version = DEFAULT_VERSION # Version needed to extract archive
														
 
															+        self.reserved = 0               # Must be zero
														
 
															+        self.flag_bits = 0              # ZIP flag bits
														
 
															+        self.volume = 0                 # Volume number of file header
														
 
															+        self.internal_attr = 0          # Internal attributes
														
 
															+        self.external_attr = 0          # External file attributes
														
 
															+        # Other attributes are set by class ZipFile:
														
 
															+        # header_offset         Byte offset to the file header
														
 
															+        # CRC                   CRC-32 of the uncompressed file
														
 
															+        # compress_size         Size of the compressed file
														
 
															+        # file_size             Size of the uncompressed file
														
 
															+        self.has_changed_name = has_changed_name
														
 
															+
														
 
															+    def __repr__(self):
														
 
															+        result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)]
														
 
															+        if self.compress_type != ZIP_STORED:
														
 
															+            result.append(' compress_type=%s' %
														
 
															+                          compressor_names.get(self.compress_type,
														
 
															+                                               self.compress_type))
														
 
															+        hi = self.external_attr >> 16
														
 
															+        lo = self.external_attr & 0xFFFF
														
 
															+        if hi:
														
 
															+            result.append(' filemode=%r' % stat.filemode(hi))
														
 
															+        if lo:
														
 
															+            result.append(' external_attr=%#x' % lo)
														
 
															+        isdir = self.is_dir()
														
 
															+        if not isdir or self.file_size:
														
 
															+            result.append(' file_size=%r' % self.file_size)
														
 
															+        if ((not isdir or self.compress_size) and
														
 
															+            (self.compress_type != ZIP_STORED or
														
 
															+             self.file_size != self.compress_size)):
														
 
															+            result.append(' compress_size=%r' % self.compress_size)
														
 
															+        result.append('>')
														
 
															+        return ''.join(result)
														
 
															+
														
 
															+    def FileHeader(self, zip64=None):
														
 
															+        """Return the per-file header as a bytes object."""
														
 
															+        dt = self.date_time
														
 
															+        dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
														
 
															+        dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
														
 
															+        if self.flag_bits & 0x08:
														
 
															+            # Set these to zero because we write them after the file data
														
 
															+            CRC = compress_size = file_size = 0
														
 
															+        else:
														
 
															+            CRC = self.CRC
														
 
															+            compress_size = self.compress_size
														
 
															+            file_size = self.file_size
														
 
															+
														
 
															+        extra = self.extra
														
 
															+
														
 
															+        min_version = 0
														
 
															+        if zip64 is None:
														
 
															+            zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT
														
 
															+        if zip64:
														
 
															+            fmt = '<HHQQ'
														
 
															+            extra = extra + struct.pack(fmt,
														
 
															+                                        1, struct.calcsize(fmt)-4, file_size, compress_size)
														
 
															+        if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:
														
 
															+            if not zip64:
														
 
															+                raise LargeZipFile("Filesize would require ZIP64 extensions")
														
 
															+            # File is larger than what fits into a 4 byte integer,
														
 
															+            # fall back to the ZIP64 extension
														
 
															+            file_size = 0xffffffff
														
 
															+            compress_size = 0xffffffff
														
 
															+            min_version = ZIP64_VERSION
														
 
															+
														
 
															+        if self.compress_type == ZIP_BZIP2:
														
 
															+            min_version = max(BZIP2_VERSION, min_version)
														
 
															+        elif self.compress_type == ZIP_LZMA:
														
 
															+            min_version = max(LZMA_VERSION, min_version)
														
 
															+
														
 
															+        self.extract_version = max(min_version, self.extract_version)
														
 
															+        self.create_version = max(min_version, self.create_version)
														
 
															+        filename, flag_bits = self._encodeFilenameFlags()
														
 
															+        header = struct.pack(structFileHeader, stringFileHeader,
														
 
															+                             self.extract_version, self.reserved, flag_bits,
														
 
															+                             self.compress_type, dostime, dosdate, CRC,
														
 
															+                             compress_size, file_size,
														
 
															+                             len(filename), len(extra))
														
 
															+        return header + filename + extra
														
 
															+
														
 
															+    def _encodeFilenameFlags(self):
														
 
															+        try:
														
 
															+            return self.filename.encode('ascii'), self.flag_bits
														
 
															+        except UnicodeEncodeError:
														
 
															+            return self.filename.encode('utf-8'), self.flag_bits | 0x800
														
 
															+
														
 
															+    def _decodeExtra(self):
														
 
															+        # Try to decode the extra field.
														
 
															+        extra = self.extra
														
 
															+        unpack = struct.unpack
														
 
															+        while len(extra) >= 4:
														
 
															+            tp, ln = unpack('<HH', extra[:4])
														
 
															+            if ln+4 > len(extra):
														
 
															+                raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln))
														
 
															+            if tp == 0x0001:
														
 
															+                if ln >= 24:
														
 
															+                    counts = unpack('<QQQ', extra[4:28])
														
 
															+                elif ln == 16:
														
 
															+                    counts = unpack('<QQ', extra[4:20])
														
 
															+                elif ln == 8:
														
 
															+                    counts = unpack('<Q', extra[4:12])
														
 
															+                elif ln == 0:
														
 
															+                    counts = ()
														
 
															+                else:
														
 
															+                    raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln))
														
 
															+
														
 
															+                idx = 0
														
 
															+
														
 
															+                # ZIP64 extension (large files and/or large archives)
														
 
															+                if self.file_size in (0xffffffffffffffff, 0xffffffff):
														
 
															+                    if len(counts) <= idx:
														
 
															+                        raise BadZipFile(
														
 
															+                            "Corrupt zip64 extra field. File size not found."
														
 
															+                        )
														
 
															+                    self.file_size = counts[idx]
														
 
															+                    idx += 1
														
 
															+
														
 
															+                if self.compress_size == 0xFFFFFFFF:
														
 
															+                    if len(counts) <= idx:
														
 
															+                        raise BadZipFile(
														
 
															+                            "Corrupt zip64 extra field. Compress size not found."
														
 
															+                        )
														
 
															+                    self.compress_size = counts[idx]
														
 
															+                    idx += 1
														
 
															+
														
 
															+                if self.header_offset == 0xffffffff:
														
 
															+                    if len(counts) <= idx:
														
 
															+                        raise BadZipFile(
														
 
															+                            "Corrupt zip64 extra field. Header offset not found."
														
 
															+                        )
														
 
															+                    old = self.header_offset
														
 
															+                    self.header_offset = counts[idx]
														
 
															+                    idx+=1
														
 
															+
														
 
															+            extra = extra[ln+4:]
														
 
															+
														
 
															+    @classmethod
														
 
															+    def from_file(cls, filename, arcname=None):
														
 
															+        """Construct an appropriate ZipInfo for a file on the filesystem.
														
 
															+
														
 
															+        filename should be the path to a file or directory on the filesystem.
														
 
															+
														
 
															+        arcname is the name which it will have within the archive (by default,
														
 
															+        this will be the same as filename, but without a drive letter and with
														
 
															+        leading path separators removed).
														
 
															+        """
														
 
															+        if isinstance(filename, os.PathLike):
														
 
															+            filename = os.fspath(filename)
														
 
															+        st = os.stat(filename)
														
 
															+        isdir = stat.S_ISDIR(st.st_mode)
														
 
															+        mtime = time.localtime(st.st_mtime)
														
 
															+        date_time = mtime[0:6]
														
 
															+        # Create ZipInfo instance to store file information
														
 
															+        if arcname is None:
														
 
															+            arcname = filename
														
 
															+        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
														
 
															+        while arcname[0] in (os.sep, os.altsep):
														
 
															+            arcname = arcname[1:]
														
 
															+        if isdir:
														
 
															+            arcname += '/'
														
 
															+        zinfo = cls(arcname, date_time)
														
 
															+        zinfo.external_attr = (st.st_mode & 0xFFFF) << 16  # Unix attributes
														
 
															+        if isdir:
														
 
															+            zinfo.file_size = 0
														
 
															+            zinfo.external_attr |= 0x10  # MS-DOS directory flag
														
 
															+        else:
														
 
															+            zinfo.file_size = st.st_size
														
 
															+
														
 
															+        return zinfo
														
 
															+
														
 
															+    def is_dir(self):
														
 
															+        """Return True if this archive member is a directory."""
														
 
															+        return self.filename[-1] == '/'
														
 
															+
														
 
															+
														
 
															+# ZIP encryption uses the CRC32 one-byte primitive for scrambling some
														
 
															+# internal keys. We noticed that a direct implementation is faster than
														
 
															+# relying on binascii.crc32().
														
 
															+
														
 
															+_crctable = None
														
 
															+def _gen_crc(crc):
														
 
															+    for j in range(8):
														
 
															+        if crc & 1:
														
 
															+            crc = (crc >> 1) ^ 0xEDB88320
														
 
															+        else:
														
 
															+            crc >>= 1
														
 
															+    return crc
														
 
															+
														
 
															+# ZIP supports a password-based form of encryption. Even though known
														
 
															+# plaintext attacks have been found against it, it is still useful
														
 
															+# to be able to get data out of such a file.
														
 
															+#
														
 
															+# Usage:
														
 
															+#     zd = _ZipDecrypter(mypwd)
														
 
															+#     plain_bytes = zd(cypher_bytes)
														
 
															+
														
 
															+def _ZipDecrypter(pwd):
														
 
															+    key0 = 305419896
														
 
															+    key1 = 591751049
														
 
															+    key2 = 878082192
														
 
															+
														
 
															+    global _crctable
														
 
															+    if _crctable is None:
														
 
															+        _crctable = list(map(_gen_crc, range(256)))
														
 
															+    crctable = _crctable
														
 
															+
														
 
															+    def crc32(ch, crc):
														
 
															+        """Compute the CRC32 primitive on one byte."""
														
 
															+        return (crc >> 8) ^ crctable[(crc ^ ch) & 0xFF]
														
 
															+
														
 
															+    def update_keys(c):
														
 
															+        nonlocal key0, key1, key2
														
 
															+        key0 = crc32(c, key0)
														
 
															+        key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF
														
 
															+        key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF
														
 
															+        key2 = crc32(key1 >> 24, key2)
														
 
															+
														
 
															+    for p in pwd:
														
 
															+        update_keys(p)
														
 
															+
														
 
															+    def decrypter(data):
														
 
															+        """Decrypt a bytes object."""
														
 
															+        result = bytearray()
														
 
															+        append = result.append
														
 
															+        for c in data:
														
 
															+            k = key2 | 2
														
 
															+            c ^= ((k * (k^1)) >> 8) & 0xFF
														
 
															+            update_keys(c)
														
 
															+            append(c)
														
 
															+        return bytes(result)
														
 
															+
														
 
															+    return decrypter
														
 
															+
														
 
															+
														
 
															+class LZMACompressor:
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        self._comp = None
														
 
															+
														
 
															+    def _init(self):
														
 
															+        props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1})
														
 
															+        self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[
														
 
															+            lzma._decode_filter_properties(lzma.FILTER_LZMA1, props)
														
 
															+        ])
														
 
															+        return struct.pack('<BBH', 9, 4, len(props)) + props
														
 
															+
														
 
															+    def compress(self, data):
														
 
															+        if self._comp is None:
														
 
															+            return self._init() + self._comp.compress(data)
														
 
															+        return self._comp.compress(data)
														
 
															+
														
 
															+    def flush(self):
														
 
															+        if self._comp is None:
														
 
															+            return self._init() + self._comp.flush()
														
 
															+        return self._comp.flush()
														
 
															+
														
 
															+
														
 
															+class LZMADecompressor:
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        self._decomp = None
														
 
															+        self._unconsumed = b''
														
 
															+        self.eof = False
														
 
															+
														
 
															+    def decompress(self, data):
														
 
															+        if self._decomp is None:
														
 
															+            self._unconsumed += data
														
 
															+            if len(self._unconsumed) <= 4:
														
 
															+                return b''
														
 
															+            psize, = struct.unpack('<H', self._unconsumed[2:4])
														
 
															+            if len(self._unconsumed) <= 4 + psize:
														
 
															+                return b''
														
 
															+
														
 
															+            self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[
														
 
															+                lzma._decode_filter_properties(lzma.FILTER_LZMA1,
														
 
															+                                               self._unconsumed[4:4 + psize])
														
 
															+            ])
														
 
															+            data = self._unconsumed[4 + psize:]
														
 
															+            del self._unconsumed
														
 
															+
														
 
															+        result = self._decomp.decompress(data)
														
 
															+        self.eof = self._decomp.eof
														
 
															+        return result
														
 
															+
														
 
															+
														
 
# Short names for ZIP compression method ids, keyed by the numeric id.
# Looked up when building messages about a member's compression type
# (unsupported-method errors and ZipExtFile's repr); ids missing from this
# table simply have no name.
compressor_names = {
    0: 'store',
    1: 'shrink',
    2: 'reduce',
    3: 'reduce',
    4: 'reduce',
    5: 'reduce',
    6: 'implode',
    7: 'tokenize',
    8: 'deflate',
    9: 'deflate64',
    10: 'implode',
    12: 'bzip2',
    14: 'lzma',
    18: 'terse',
    19: 'lz77',
    97: 'wavpack',
    98: 'ppmd',
}
														
 
															+
														
 
															+def _check_compression(compression):
														
 
															+    if compression == ZIP_STORED:
														
 
															+        pass
														
 
															+    elif compression == ZIP_DEFLATED:
														
 
															+        if not zlib:
														
 
															+            raise RuntimeError(
														
 
															+                "Compression requires the (missing) zlib module")
														
 
															+    elif compression == ZIP_BZIP2:
														
 
															+        if not bz2:
														
 
															+            raise RuntimeError(
														
 
															+                "Compression requires the (missing) bz2 module")
														
 
															+    elif compression == ZIP_LZMA:
														
 
															+        if not lzma:
														
 
															+            raise RuntimeError(
														
 
															+                "Compression requires the (missing) lzma module")
														
 
															+    else:
														
 
															+        raise NotImplementedError("That compression method is not supported")
														
 
															+
														
 
															+
														
 
															+def _get_compressor(compress_type, compresslevel=None):
														
 
															+    if compress_type == ZIP_DEFLATED:
														
 
															+        if compresslevel is not None:
														
 
															+            return zlib.compressobj(compresslevel, zlib.DEFLATED, -15)
														
 
															+        return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15)
														
 
															+    elif compress_type == ZIP_BZIP2:
														
 
															+        if compresslevel is not None:
														
 
															+            return bz2.BZ2Compressor(compresslevel)
														
 
															+        return bz2.BZ2Compressor()
														
 
															+    # compresslevel is ignored for ZIP_LZMA
														
 
															+    elif compress_type == ZIP_LZMA:
														
 
															+        return LZMACompressor()
														
 
															+    else:
														
 
															+        return None
														
 
															+
														
 
															+
														
 
															+def _get_decompressor(compress_type):
														
 
															+    if compress_type == ZIP_STORED:
														
 
															+        return None
														
 
															+    elif compress_type == ZIP_DEFLATED:
														
 
															+        return zlib.decompressobj(-15)
														
 
															+    elif compress_type == ZIP_BZIP2:
														
 
															+        return bz2.BZ2Decompressor()
														
 
															+    elif compress_type == ZIP_LZMA:
														
 
															+        return LZMADecompressor()
														
 
															+    else:
														
 
															+        descr = compressor_names.get(compress_type)
														
 
															+        if descr:
														
 
															+            raise NotImplementedError("compression type %d (%s)" % (compress_type, descr))
														
 
															+        else:
														
 
															+            raise NotImplementedError("compression type %d" % (compress_type,))
														
 
															+
														
 
															+
														
 
															+class _SharedFile:
														
 
															+    def __init__(self, file, pos, close, lock, writing):
														
 
															+        self._file = file
														
 
															+        self._pos = pos
														
 
															+        self._close = close
														
 
															+        self._lock = lock
														
 
															+        self._writing = writing
														
 
															+        self.seekable = file.seekable
														
 
															+        self.tell = file.tell
														
 
															+
														
 
															+    def seek(self, offset, whence=0):
														
 
															+        with self._lock:
														
 
															+            if self._writing():
														
 
															+                raise ValueError("Can't reposition in the ZIP file while "
														
 
															+                        "there is an open writing handle on it. "
														
 
															+                        "Close the writing handle before trying to read.")
														
 
															+            self._file.seek(offset, whence)
														
 
															+            self._pos = self._file.tell()
														
 
															+            return self._pos
														
 
															+
														
 
															+    def read(self, n=-1):
														
 
															+        with self._lock:
														
 
															+            if self._writing():
														
 
															+                raise ValueError("Can't read from the ZIP file while there "
														
 
															+                        "is an open writing handle on it. "
														
 
															+                        "Close the writing handle before trying to read.")
														
 
															+            self._file.seek(self._pos)
														
 
															+            data = self._file.read(n)
														
 
															+            self._pos = self._file.tell()
														
 
															+            return data
														
 
															+
														
 
															+    def close(self):
														
 
															+        if self._file is not None:
														
 
															+            fileobj = self._file
														
 
															+            self._file = None
														
 
															+            self._close(fileobj)
														
 
															+
														
 
															+# Provide the tell method for unseekable stream
														
 
															+class _Tellable:
														
 
															+    def __init__(self, fp):
														
 
															+        self.fp = fp
														
 
															+        self.offset = 0
														
 
															+
														
 
															+    def write(self, data):
														
 
															+        n = self.fp.write(data)
														
 
															+        self.offset += n
														
 
															+        return n
														
 
															+
														
 
															+    def tell(self):
														
 
															+        return self.offset
														
 
															+
														
 
															+    def flush(self):
														
 
															+        self.fp.flush()
														
 
															+
														
 
															+    def close(self):
														
 
															+        self.fp.close()
														
 
															+
														
 
															+
														
 
															+class ZipExtFile(io.BufferedIOBase):
														
 
															+    """File-like object for reading an archive member.
														
 
															+       Is returned by ZipFile.open().
														
 
															+    """
														
 
															+
														
 
															+    # Max size supported by decompressor.
														
 
															+    MAX_N = 1 << 31 - 1
														
 
															+
														
 
															+    # Read from compressed files in 4k blocks.
														
 
															+    MIN_READ_SIZE = 4096
														
 
															+
														
 
															+    # Chunk size to read during seek
														
 
															+    MAX_SEEK_READ = 1 << 24
														
 
															+
														
 
															+    def __init__(self, fileobj, mode, zipinfo, pwd=None,
														
 
															+                 close_fileobj=False):
														
 
															+        self._fileobj = fileobj
														
 
															+        self._pwd = pwd
														
 
															+        self._close_fileobj = close_fileobj
														
 
															+
														
 
															+        self._compress_type = zipinfo.compress_type
														
 
															+        self._compress_left = zipinfo.compress_size
														
 
															+        self._left = zipinfo.file_size
														
 
															+
														
 
															+        self._decompressor = _get_decompressor(self._compress_type)
														
 
															+
														
 
															+        self._eof = False
														
 
															+        self._readbuffer = b''
														
 
															+        self._offset = 0
														
 
															+
														
 
															+        self.newlines = None
														
 
															+
														
 
															+        self.mode = mode
														
 
															+        self.name = zipinfo.filename
														
 
															+
														
 
															+        if hasattr(zipinfo, 'CRC'):
														
 
															+            self._expected_crc = zipinfo.CRC
														
 
															+            self._running_crc = crc32(b'')
														
 
															+        else:
														
 
															+            self._expected_crc = None
														
 
															+
														
 
															+        self._seekable = False
														
 
															+        try:
														
 
															+            if fileobj.seekable():
														
 
															+                self._orig_compress_start = fileobj.tell()
														
 
															+                self._orig_compress_size = zipinfo.compress_size
														
 
															+                self._orig_file_size = zipinfo.file_size
														
 
															+                self._orig_start_crc = self._running_crc
														
 
															+                self._seekable = True
														
 
															+        except AttributeError:
														
 
															+            pass
														
 
															+
														
 
															+        self._decrypter = None
														
 
															+        if pwd:
														
 
															+            if zipinfo.flag_bits & 0x8:
														
 
															+                # compare against the file type from extended local headers
														
 
															+                check_byte = (zipinfo._raw_time >> 8) & 0xff
														
 
															+            else:
														
 
															+                # compare against the CRC otherwise
														
 
															+                check_byte = (zipinfo.CRC >> 24) & 0xff
														
 
															+            h = self._init_decrypter()
														
 
															+            if h != check_byte:
														
 
															+                raise RuntimeError("Bad password for file %r" % zipinfo.orig_filename)
														
 
															+
														
 
															+
														
 
															+    def _init_decrypter(self):
														
 
															+        self._decrypter = _ZipDecrypter(self._pwd)
														
 
															+        # The first 12 bytes in the cypher stream is an encryption header
														
 
															+        #  used to strengthen the algorithm. The first 11 bytes are
														
 
															+        #  completely random, while the 12th contains the MSB of the CRC,
														
 
															+        #  or the MSB of the file time depending on the header type
														
 
															+        #  and is used to check the correctness of the password.
														
 
															+        header = self._fileobj.read(12)
														
 
															+        self._compress_left -= 12
														
 
															+        return self._decrypter(header)[11]
														
 
															+
														
 
															+    def __repr__(self):
														
 
															+        result = ['<%s.%s' % (self.__class__.__module__,
														
 
															+                              self.__class__.__qualname__)]
														
 
															+        if not self.closed:
														
 
															+            result.append(' name=%r mode=%r' % (self.name, self.mode))
														
 
															+            if self._compress_type != ZIP_STORED:
														
 
															+                result.append(' compress_type=%s' %
														
 
															+                              compressor_names.get(self._compress_type,
														
 
															+                                                   self._compress_type))
														
 
															+        else:
														
 
															+            result.append(' [closed]')
														
 
															+        result.append('>')
														
 
															+        return ''.join(result)
														
 
															+
														
 
															+    def readline(self, limit=-1):
														
 
															+        """Read and return a line from the stream.
														
 
															+
														
 
															+        If limit is specified, at most limit bytes will be read.
														
 
															+        """
														
 
															+
														
 
															+        if limit < 0:
														
 
															+            # Shortcut common case - newline found in buffer.
														
 
															+            i = self._readbuffer.find(b'\n', self._offset) + 1
														
 
															+            if i > 0:
														
 
															+                line = self._readbuffer[self._offset: i]
														
 
															+                self._offset = i
														
 
															+                return line
														
 
															+
														
 
															+        return io.BufferedIOBase.readline(self, limit)
														
 
															+
														
 
															+    def peek(self, n=1):
														
 
															+        """Returns buffered bytes without advancing the position."""
														
 
															+        if n > len(self._readbuffer) - self._offset:
														
 
															+            chunk = self.read(n)
														
 
															+            if len(chunk) > self._offset:
														
 
															+                self._readbuffer = chunk + self._readbuffer[self._offset:]
														
 
															+                self._offset = 0
														
 
															+            else:
														
 
															+                self._offset -= len(chunk)
														
 
															+
														
 
															+        # Return up to 512 bytes to reduce allocation overhead for tight loops.
														
 
															+        return self._readbuffer[self._offset: self._offset + 512]
														
 
															+
														
 
															+    def readable(self):
														
 
															+        return True
														
 
															+
														
 
															+    def read(self, n=-1):
														
 
															+        """Read and return up to n bytes.
														
 
															+        If the argument is omitted, None, or negative, data is read and returned until EOF is reached.
														
 
															+        """
														
 
															+        if n is None or n < 0:
														
 
															+            buf = self._readbuffer[self._offset:]
														
 
															+            self._readbuffer = b''
														
 
															+            self._offset = 0
														
 
															+            while not self._eof:
														
 
															+                buf += self._read1(self.MAX_N)
														
 
															+            return buf
														
 
															+
														
 
															+        end = n + self._offset
														
 
															+        if end < len(self._readbuffer):
														
 
															+            buf = self._readbuffer[self._offset:end]
														
 
															+            self._offset = end
														
 
															+            return buf
														
 
															+
														
 
															+        n = end - len(self._readbuffer)
														
 
															+        buf = self._readbuffer[self._offset:]
														
 
															+        self._readbuffer = b''
														
 
															+        self._offset = 0
														
 
															+        while n > 0 and not self._eof:
														
 
															+            data = self._read1(n)
														
 
															+            if n < len(data):
														
 
															+                self._readbuffer = data
														
 
															+                self._offset = n
														
 
															+                buf += data[:n]
														
 
															+                break
														
 
															+            buf += data
														
 
															+            n -= len(data)
														
 
															+        return buf
														
 
															+
														
 
															+    def _update_crc(self, newdata):
														
 
															+        # Update the CRC using the given data.
														
 
															+        if self._expected_crc is None:
														
 
															+            # No need to compute the CRC if we don't have a reference value
														
 
															+            return
														
 
															+        self._running_crc = crc32(newdata, self._running_crc)
														
 
															+        # Check the CRC if we're at the end of the file
														
 
															+        if self._eof and self._running_crc != self._expected_crc:
														
 
															+            raise BadZipFile("Bad CRC-32 for file %r" % self.name)
														
 
															+
														
 
															+    def read1(self, n):
														
 
															+        """Read up to n bytes with at most one read() system call."""
														
 
															+
														
 
															+        if n is None or n < 0:
														
 
															+            buf = self._readbuffer[self._offset:]
														
 
															+            self._readbuffer = b''
														
 
															+            self._offset = 0
														
 
															+            while not self._eof:
														
 
															+                data = self._read1(self.MAX_N)
														
 
															+                if data:
														
 
															+                    buf += data
														
 
															+                    break
														
 
															+            return buf
														
 
															+
														
 
															+        end = n + self._offset
														
 
															+        if end < len(self._readbuffer):
														
 
															+            buf = self._readbuffer[self._offset:end]
														
 
															+            self._offset = end
														
 
															+            return buf
														
 
															+
														
 
															+        n = end - len(self._readbuffer)
														
 
															+        buf = self._readbuffer[self._offset:]
														
 
															+        self._readbuffer = b''
														
 
															+        self._offset = 0
														
 
															+        if n > 0:
														
 
															+            while not self._eof:
														
 
															+                data = self._read1(n)
														
 
															+                if n < len(data):
														
 
															+                    self._readbuffer = data
														
 
															+                    self._offset = n
														
 
															+                    buf += data[:n]
														
 
															+                    break
														
 
															+                if data:
														
 
															+                    buf += data
														
 
															+                    break
														
 
															+        return buf
														
 
															+
														
 
															+    def _read1(self, n):
														
 
															+        # Read up to n compressed bytes with at most one read() system call,
														
 
															+        # decrypt and decompress them.
														
 
															+        if self._eof or n <= 0:
														
 
															+            return b''
														
 
															+
														
 
															+        # Read from file.
														
 
															+        if self._compress_type == ZIP_DEFLATED:
														
 
															+            ## Handle unconsumed data.
														
 
															+            data = self._decompressor.unconsumed_tail
														
 
															+            if n > len(data):
														
 
															+                data += self._read2(n - len(data))
														
 
															+        else:
														
 
															+            data = self._read2(n)
														
 
															+
														
 
															+        if self._compress_type == ZIP_STORED:
														
 
															+            self._eof = self._compress_left <= 0
														
 
															+        elif self._compress_type == ZIP_DEFLATED:
														
 
															+            n = max(n, self.MIN_READ_SIZE)
														
 
															+            data = self._decompressor.decompress(data, n)
														
 
															+            self._eof = (self._decompressor.eof or
														
 
															+                         self._compress_left <= 0 and
														
 
															+                         not self._decompressor.unconsumed_tail)
														
 
															+            if self._eof:
														
 
															+                data += self._decompressor.flush()
														
 
															+        else:
														
 
															+            data = self._decompressor.decompress(data)
														
 
															+            self._eof = self._decompressor.eof or self._compress_left <= 0
														
 
															+
														
 
															+        data = data[:self._left]
														
 
															+        self._left -= len(data)
														
 
															+        if self._left <= 0:
														
 
															+            self._eof = True
														
 
															+        self._update_crc(data)
														
 
															+        return data
														
 
															+
														
 
															+    def _read2(self, n):
														
 
															+        if self._compress_left <= 0:
														
 
															+            return b''
														
 
															+
														
 
															+        n = max(n, self.MIN_READ_SIZE)
														
 
															+        n = min(n, self._compress_left)
														
 
															+
														
 
															+        data = self._fileobj.read(n)
														
 
															+        self._compress_left -= len(data)
														
 
															+        if not data:
														
 
															+            raise EOFError
														
 
															+
														
 
															+        if self._decrypter is not None:
														
 
															+            data = self._decrypter(data)
														
 
															+        return data
														
 
															+
														
 
															+    def close(self):
														
 
															+        try:
														
 
															+            if self._close_fileobj:
														
 
															+                self._fileobj.close()
														
 
															+        finally:
														
 
															+            super().close()
														
 
															+
														
 
															+    def seekable(self):
														
 
															+        return self._seekable
														
 
															+
														
 
															+    def seek(self, offset, whence=0):
														
 
															+        if not self._seekable:
														
 
															+            raise io.UnsupportedOperation("underlying stream is not seekable")
														
 
															+        curr_pos = self.tell()
														
 
															+        if whence == 0: # Seek from start of file
														
 
															+            new_pos = offset
														
 
															+        elif whence == 1: # Seek from current position
														
 
															+            new_pos = curr_pos + offset
														
 
															+        elif whence == 2: # Seek from EOF
														
 
															+            new_pos = self._orig_file_size + offset
														
 
															+        else:
														
 
															+            raise ValueError("whence must be os.SEEK_SET (0), "
														
 
															+                             "os.SEEK_CUR (1), or os.SEEK_END (2)")
														
 
															+
														
 
															+        if new_pos > self._orig_file_size:
														
 
															+            new_pos = self._orig_file_size
														
 
															+
														
 
															+        if new_pos < 0:
														
 
															+            new_pos = 0
														
 
															+
														
 
															+        read_offset = new_pos - curr_pos
														
 
															+        buff_offset = read_offset + self._offset
														
 
															+
														
 
															+        if buff_offset >= 0 and buff_offset < len(self._readbuffer):
														
 
															+            # Just move the _offset index if the new position is in the _readbuffer
														
 
															+            self._offset = buff_offset
														
 
															+            read_offset = 0
														
 
															+        elif read_offset < 0:
														
 
															+            # Position is before the current position. Reset the ZipExtFile
														
 
															+            self._fileobj.seek(self._orig_compress_start)
														
 
															+            self._running_crc = self._orig_start_crc
														
 
															+            self._compress_left = self._orig_compress_size
														
 
															+            self._left = self._orig_file_size
														
 
															+            self._readbuffer = b''
														
 
															+            self._offset = 0
														
 
															+            self._decompressor = _get_decompressor(self._compress_type)
														
 
															+            self._eof = False
														
 
															+            read_offset = new_pos
														
 
															+            if self._decrypter is not None:
														
 
															+                self._init_decrypter()
														
 
															+
														
 
															+        while read_offset > 0:
														
 
															+            read_len = min(self.MAX_SEEK_READ, read_offset)
														
 
															+            self.read(read_len)
														
 
															+            read_offset -= read_len
														
 
															+
														
 
															+        return self.tell()
														
 
															+
														
 
															+    def tell(self):
														
 
															+        if not self._seekable:
														
 
															+            raise io.UnsupportedOperation("underlying stream is not seekable")
														
 
															+        filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset
														
 
															+        return filepos
														
 
															+
														
 
															+
														
 
															+class _ZipWriteFile(io.BufferedIOBase):
														
 
															+    def __init__(self, zf, zinfo, zip64):
														
 
															+        self._zinfo = zinfo
														
 
															+        self._zip64 = zip64
														
 
															+        self._zipfile = zf
														
 
															+        self._compressor = _get_compressor(zinfo.compress_type,
														
 
															+                                           zinfo._compresslevel)
														
 
															+        self._file_size = 0
														
 
															+        self._compress_size = 0
														
 
															+        self._crc = 0
														
 
															+
														
 
															+    @property
														
 
															+    def _fileobj(self):
														
 
															+        return self._zipfile.fp
														
 
															+
														
 
															+    def writable(self):
														
 
															+        return True
														
 
															+
														
 
															+    def write(self, data):
														
 
															+        if self.closed:
														
 
															+            raise ValueError('I/O operation on closed file.')
														
 
															+        nbytes = len(data)
														
 
															+        self._file_size += nbytes
														
 
															+        self._crc = crc32(data, self._crc)
														
 
															+        if self._compressor:
														
 
															+            data = self._compressor.compress(data)
														
 
															+            self._compress_size += len(data)
														
 
															+        self._fileobj.write(data)
														
 
															+        return nbytes
														
 
															+
														
 
															+    def close(self):
														
 
															+        if self.closed:
														
 
															+            return
														
 
															+        try:
														
 
															+            super().close()
														
 
															+            # Flush any data from the compressor, and update header info
														
 
															+            if self._compressor:
														
 
															+                buf = self._compressor.flush()
														
 
															+                self._compress_size += len(buf)
														
 
															+                self._fileobj.write(buf)
														
 
															+                self._zinfo.compress_size = self._compress_size
														
 
															+            else:
														
 
															+                self._zinfo.compress_size = self._file_size
														
 
															+            self._zinfo.CRC = self._crc
														
 
															+            self._zinfo.file_size = self._file_size
														
 
															+
														
 
															+            # Write updated header info
														
 
															+            if self._zinfo.flag_bits & 0x08:
														
 
															+                # Write CRC and file sizes after the file data
														
 
															+                fmt = '<LLQQ' if self._zip64 else '<LLLL'
														
 
															+                self._fileobj.write(struct.pack(fmt, _DD_SIGNATURE, self._zinfo.CRC,
														
 
															+                    self._zinfo.compress_size, self._zinfo.file_size))
														
 
															+                self._zipfile.start_dir = self._fileobj.tell()
														
 
															+            else:
														
 
															+                if not self._zip64:
														
 
															+                    if self._file_size > ZIP64_LIMIT:
														
 
															+                        raise RuntimeError(
														
 
															+                            'File size unexpectedly exceeded ZIP64 limit')
														
 
															+                    if self._compress_size > ZIP64_LIMIT:
														
 
															+                        raise RuntimeError(
														
 
															+                            'Compressed size unexpectedly exceeded ZIP64 limit')
														
 
															+                # Seek backwards and write file header (which will now include
														
 
															+                # correct CRC and file sizes)
														
 
															+
														
 
															+                # Preserve current position in file
														
 
															+                self._zipfile.start_dir = self._fileobj.tell()
														
 
															+                self._fileobj.seek(self._zinfo.header_offset)
														
 
															+                self._fileobj.write(self._zinfo.FileHeader(self._zip64))
														
 
															+                self._fileobj.seek(self._zipfile.start_dir)
														
 
															+
														
 
															+            # Successfully written: Add file to our caches
														
 
															+            self._zipfile.filelist.append(self._zinfo)
														
 
															+            self._zipfile.NameToInfo[self._zinfo.filename] = self._zinfo
														
 
															+        finally:
														
 
															+            self._zipfile._writing = False
														
 
															+
														
 
															+
														
 
															+
														
 
															+class ZipFile:
														
 
															+    """ Class with methods to open, read, write, close, list zip files.
														
 
															+
														
 
															+    z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True,
														
 
															+                compresslevel=None)
														
 
															+
														
 
															+    file: Either the path to the file, or a file-like object.
														
 
															+          If it is a path, the file will be opened and closed by ZipFile.
														
 
															+    mode: The mode can be either read 'r', write 'w', exclusive create 'x',
														
 
															+          or append 'a'.
														
 
															+    compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib),
														
 
															+                 ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma).
														
 
															+    allowZip64: if True ZipFile will create files with ZIP64 extensions when
														
 
															+                needed, otherwise it will raise an exception when this would
														
 
															+                be necessary.
														
 
															+    compresslevel: None (default for the given compression type) or an integer
														
 
															+                   specifying the level to pass to the compressor.
														
 
															+                   When using ZIP_STORED or ZIP_LZMA this keyword has no effect.
														
 
															+                   When using ZIP_DEFLATED integers 0 through 9 are accepted.
														
 
															+                   When using ZIP_BZIP2 integers 1 through 9 are accepted.
														
 
															+
														
 
															+    """
														
 
															+
														
 
															+    fp = None                   # Set here since __del__ checks it
														
 
															+    _windows_illegal_name_trans_table = None
														
 
															+
														
 
															+    def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
														
 
															+                 compresslevel=None, has_changed_name=False):
														
 
															+        """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
														
 
															+        or append 'a'."""
														
 
															+        if mode not in ('r', 'w', 'x', 'a'):
														
 
															+            raise ValueError("ZipFile requires mode 'r', 'w', 'x', or 'a'")
														
 
															+
														
 
															+        _check_compression(compression)
														
 
															+
														
 
															+        self._allowZip64 = allowZip64
														
 
															+        self._didModify = False
														
 
															+        self.debug = 0  # Level of printing: 0 through 3
														
 
															+        self.NameToInfo = {}    # Find file info given name
														
 
															+        self.filelist = []      # List of ZipInfo instances for archive
														
 
															+        self.compression = compression  # Method of compression
														
 
															+        self.compresslevel = compresslevel
														
 
															+        self.mode = mode
														
 
															+        self.pwd = None
														
 
															+        self._comment = b''
														
 
															+        self.has_changed_name = has_changed_name
														
 
															+
														
 
															+        # Check if we were passed a file-like object
														
 
															+        if isinstance(file, os.PathLike):
														
 
															+            file = os.fspath(file)
														
 
															+        if isinstance(file, str):
														
 
															+            # No, it's a filename
														
 
															+            self._filePassed = 0
														
 
															+            self.filename = file
														
 
															+            modeDict = {'r' : 'rb', 'w': 'w+b', 'x': 'x+b', 'a' : 'r+b',
														
 
															+                        'r+b': 'w+b', 'w+b': 'wb', 'x+b': 'xb'}
														
 
															+            filemode = modeDict[mode]
														
 
															+            while True:
														
 
															+                try:
														
 
															+                    self.fp = io.open(file, filemode)
														
 
															+                except OSError:
														
 
															+                    if filemode in modeDict:
														
 
															+                        filemode = modeDict[filemode]
														
 
															+                        continue
														
 
															+                    raise
														
 
															+                break
														
 
															+        else:
														
 
															+            self._filePassed = 1
														
 
															+            self.fp = file
														
 
															+            self.filename = getattr(file, 'name', None)
														
 
															+        self._fileRefCnt = 1
														
 
															+        self._lock = threading.RLock()
														
 
															+        self._seekable = True
														
 
															+        self._writing = False
														
 
															+
														
 
															+        try:
														
 
															+            if mode == 'r':
														
 
															+                self._RealGetContents()
														
 
															+            elif mode in ('w', 'x'):
														
 
															+                # set the modified flag so central directory gets written
														
 
															+                # even if no files are added to the archive
														
 
															+                self._didModify = True
														
 
															+                try:
														
 
															+                    self.start_dir = self.fp.tell()
														
 
															+                except (AttributeError, OSError):
														
 
															+                    self.fp = _Tellable(self.fp)
														
 
															+                    self.start_dir = 0
														
 
															+                    self._seekable = False
														
 
															+                else:
														
 
															+                    # Some file-like objects can provide tell() but not seek()
														
 
															+                    try:
														
 
															+                        self.fp.seek(self.start_dir)
														
 
															+                    except (AttributeError, OSError):
														
 
															+                        self._seekable = False
														
 
															+            elif mode == 'a':
														
 
															+                try:
														
 
															+                    # See if file is a zip file
														
 
															+                    self._RealGetContents()
														
 
															+                    # seek to start of directory and overwrite
														
 
															+                    self.fp.seek(self.start_dir)
														
 
															+                except BadZipFile:
														
 
															+                    # file is not a zip file, just append
														
 
															+                    self.fp.seek(0, 2)
														
 
															+
														
 
															+                    # set the modified flag so central directory gets written
														
 
															+                    # even if no files are added to the archive
														
 
															+                    self._didModify = True
														
 
															+                    self.start_dir = self.fp.tell()
														
 
															+            else:
														
 
															+                raise ValueError("Mode must be 'r', 'w', 'x', or 'a'")
														
 
															+        except:
														
 
															+            fp = self.fp
														
 
															+            self.fp = None
														
 
															+            self._fpclose(fp)
														
 
															+            raise
														
 
															+
														
 
    def __enter__(self):
        """Enter a ``with`` block; the archive object itself is the target."""
        return self
														
 
															+
														
 
    def __exit__(self, type, value, traceback):
        """Close the archive when the ``with`` block ends.

        The exception details are ignored and nothing is returned, so an
        exception raised inside the block keeps propagating.
        """
        self.close()
														
 
															+
														
 
															+    def __repr__(self):
														
 
															+        result = ['<%s.%s' % (self.__class__.__module__,
														
 
															+                              self.__class__.__qualname__)]
														
 
															+        if self.fp is not None:
														
 
															+            if self._filePassed:
														
 
															+                result.append(' file=%r' % self.fp)
														
 
															+            elif self.filename is not None:
														
 
															+                result.append(' filename=%r' % self.filename)
														
 
															+            result.append(' mode=%r' % self.mode)
														
 
															+        else:
														
 
															+            result.append(' [closed]')
														
 
															+        result.append('>')
														
 
															+        return ''.join(result)
														
 
															+
														
 
															+    def _RealGetContents(self):
														
 
															+        """Read in the table of contents for the ZIP file."""
														
 
															+        fp = self.fp
														
 
															+        try:
														
 
															+            endrec = _EndRecData(fp)
														
 
															+        except OSError:
														
 
															+            raise BadZipFile("File is not a zip file")
														
 
															+        if not endrec:
														
 
															+            raise BadZipFile("File is not a zip file")
														
 
															+        if self.debug > 1:
														
 
															+            print(endrec)
														
 
															+        size_cd = endrec[_ECD_SIZE]             # bytes in central directory
														
 
															+        offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
														
 
															+        self._comment = endrec[_ECD_COMMENT]    # archive comment
														
 
															+
														
 
															+        # "concat" is zero, unless zip was concatenated to another file
														
 
															+        concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
														
 
															+        if endrec[_ECD_SIGNATURE] == stringEndArchive64:
														
 
															+            # If Zip64 extension structures are present, account for them
														
 
															+            concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
														
 
															+
														
 
															+        if self.debug > 2:
														
 
															+            inferred = concat + offset_cd
														
 
															+            print("given, inferred, offset", offset_cd, inferred, concat)
														
 
															+        # self.start_dir:  Position of start of central directory
														
 
															+        self.start_dir = offset_cd + concat
														
 
															+        fp.seek(self.start_dir, 0)
														
 
															+        data = fp.read(size_cd)
														
 
															+        fp = io.BytesIO(data)
														
 
															+        total = 0
														
 
															+        while total < size_cd:
														
 
															+            centdir = fp.read(sizeCentralDir)
														
 
															+            if len(centdir) != sizeCentralDir:
														
 
															+                raise BadZipFile("Truncated central directory")
														
 
															+            centdir = struct.unpack(structCentralDir, centdir)
														
 
															+            if centdir[_CD_SIGNATURE] != stringCentralDir:
														
 
															+                raise BadZipFile("Bad magic number for central directory")
														
 
															+            if self.debug > 2:
														
 
															+                print(centdir)
														
 
															+            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
														
 
															+            flags = centdir[5]
														
 
															+            if flags & 0x800:
														
 
															+                # UTF-8 file names extension
														
 
															+                filename = filename.decode('utf-8')
														
 
															+            else:
														
 
															+                # Historical ZIP filename encoding
														
 
															+                filename = filename.decode('cp437')
														
 
															+                # filename = filename.decode('utf-8')
														
 
															+            # Create ZipInfo instance to store file information
														
 
															+            x = ZipInfo(filename, has_changed_name=self.has_changed_name)
														
 
															+            x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
														
 
															+            x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
														
 
															+            x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
														
 
															+            (x.create_version, x.create_system, x.extract_version, x.reserved,
														
 
															+             x.flag_bits, x.compress_type, t, d,
														
 
															+             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
														
 
															+            if x.extract_version > MAX_EXTRACT_VERSION:
														
 
															+                raise NotImplementedError("zip file version %.1f" %
														
 
															+                                          (x.extract_version / 10))
														
 
															+            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
														
 
															+            # Convert date/time code to (year, month, day, hour, min, sec)
														
 
															+            x._raw_time = t
														
 
															+            x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
														
 
															+                            t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
														
 
															+
														
 
															+            x._decodeExtra()
														
 
															+            x.header_offset = x.header_offset + concat
														
 
															+            self.filelist.append(x)
														
 
															+            self.NameToInfo[x.filename] = x
														
 
															+
														
 
															+            # update total bytes read from central directory
														
 
															+            total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
														
 
															+                     + centdir[_CD_EXTRA_FIELD_LENGTH]
														
 
															+                     + centdir[_CD_COMMENT_LENGTH])
														
 
															+
														
 
															+            if self.debug > 2:
														
 
															+                print("total", total)
														
 
															+
														
 
															+
														
 
															+    def namelist(self):
														
 
															+        """Return a list of file names in the archive."""
														
 
															+        return [data.filename for data in self.filelist]
														
 
															+
														
 
    def infolist(self):
        """Return a list of class ZipInfo instances for files in the
        archive.

        The returned object is ``self.filelist`` itself, not a copy.
        """
        return self.filelist
														
 
															+
														
 
															+    def printdir(self, file=None):
														
 
															+        """Print a table of contents for the zip file."""
														
 
															+        print("%-46s %19s %12s" % ("File Name", "Modified    ", "Size"),
														
 
															+              file=file)
														
 
															+        for zinfo in self.filelist:
														
 
															+            date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6]
														
 
															+            print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size),
														
 
															+                  file=file)
														
 
															+
														
 
															+    def testzip(self):
														
 
															+        """Read all the files and check the CRC."""
														
 
															+        chunk_size = 2 ** 20
														
 
															+        for zinfo in self.filelist:
														
 
															+            try:
														
 
															+                # Read by chunks, to avoid an OverflowError or a
														
 
															+                # MemoryError with very large embedded files.
														
 
															+                with self.open(zinfo.filename, "r") as f:
														
 
															+                    while f.read(chunk_size):     # Check CRC-32
														
 
															+                        pass
														
 
															+            except BadZipFile:
														
 
															+                return zinfo.filename
														
 
															+
														
 
															+    def getinfo(self, name):
														
 
															+        """Return the instance of ZipInfo given 'name'."""
														
 
															+        info = self.NameToInfo.get(name)
														
 
															+        if info is None:
														
 
															+            raise KeyError(
														
 
															+                'There is no item named %r in the archive' % name)
														
 
															+
														
 
															+        return info
														
 
															+
														
 
															+    def setpassword(self, pwd):
														
 
															+        """Set default password for encrypted files."""
														
 
															+        if pwd and not isinstance(pwd, bytes):
														
 
															+            raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
														
 
															+        if pwd:
														
 
															+            self.pwd = pwd
														
 
															+        else:
														
 
															+            self.pwd = None
														
 
															+
														
 
    @property
    def comment(self):
        """The comment text associated with the ZIP file.

        Returns the value stored in ``self._comment``.
        """
        return self._comment
														
 
															+
														
 
															+    @comment.setter
														
 
															+    def comment(self, comment):
														
 
															+        if not isinstance(comment, bytes):
														
 
															+            raise TypeError("comment: expected bytes, got %s" % type(comment).__name__)
														
 
															+        # check for valid comment length
														
 
															+        if len(comment) > ZIP_MAX_COMMENT:
														
 
															+            import warnings
														
 
															+            warnings.warn('Archive comment is too long; truncating to %d bytes'
														
 
															+                          % ZIP_MAX_COMMENT, stacklevel=2)
														
 
															+            comment = comment[:ZIP_MAX_COMMENT]
														
 
															+        self._comment = comment
														
 
															+        self._didModify = True
														
 
															+
														
 
															+    def read(self, name, pwd=None):
														
 
															+        """Return file bytes for name."""
														
 
															+        with self.open(name, "r", pwd) as fp:
														
 
															+            return fp.read()
														
 
															+
														
 
															+    def open(self, name, mode="r", pwd=None, *, force_zip64=False):
														
 
															+        """Return file-like object for 'name'.
														
 
															+
														
 
															+        name is a string for the file name within the ZIP file, or a ZipInfo
														
 
															+        object.
														
 
															+
														
 
															+        mode should be 'r' to read a file already in the ZIP file, or 'w' to
														
 
															+        write to a file newly added to the archive.
														
 
															+
														
 
															+        pwd is the password to decrypt files (only used for reading).
														
 
															+
														
 
															+        When writing, if the file size is not known in advance but may exceed
														
 
															+        2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large
														
 
															+        files.  If the size is known in advance, it is best to pass a ZipInfo
														
 
															+        instance for name, with zinfo.file_size set.
														
 
															+        """
														
 
															+        if mode not in {"r", "w"}:
														
 
															+            raise ValueError('open() requires mode "r" or "w"')
														
 
															+        if pwd and not isinstance(pwd, bytes):
														
 
															+            raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
														
 
															+        if pwd and (mode == "w"):
														
 
															+            raise ValueError("pwd is only supported for reading files")
														
 
															+        if not self.fp:
														
 
															+            raise ValueError(
														
 
															+                "Attempt to use ZIP archive that was already closed")
														
 
															+
														
 
															+        # Make sure we have an info object
														
 
															+        if isinstance(name, ZipInfo):
														
 
															+            # 'name' is already an info object
														
 
															+            zinfo = name
														
 
															+        elif mode == 'w':
														
 
															+            zinfo = ZipInfo(name)
														
 
															+            zinfo.compress_type = self.compression
														
 
															+            zinfo._compresslevel = self.compresslevel
														
 
															+        else:
														
 
															+            # Get info object for name
														
 
															+            zinfo = self.getinfo(name)
														
 
															+
														
 
															+        if mode == 'w':
														
 
															+            return self._open_to_write(zinfo, force_zip64=force_zip64)
														
 
															+
														
 
															+        if self._writing:
														
 
															+            raise ValueError("Can't read from the ZIP file while there "
														
 
															+                    "is an open writing handle on it. "
														
 
															+                    "Close the writing handle before trying to read.")
														
 
															+
														
 
															+        # Open for reading:
														
 
															+        self._fileRefCnt += 1
														
 
															+        zef_file = _SharedFile(self.fp, zinfo.header_offset,
														
 
															+                               self._fpclose, self._lock, lambda: self._writing)
														
 
															+        try:
														
 
															+            # Skip the file header:
														
 
															+            fheader = zef_file.read(sizeFileHeader)
														
 
															+            if len(fheader) != sizeFileHeader:
														
 
															+                raise BadZipFile("Truncated file header")
														
 
															+            fheader = struct.unpack(structFileHeader, fheader)
														
 
															+            if fheader[_FH_SIGNATURE] != stringFileHeader:
														
 
															+                raise BadZipFile("Bad magic number for file header")
														
 
															+
														
 
															+            fname = zef_file.read(fheader[_FH_FILENAME_LENGTH])
														
 
															+            if fheader[_FH_EXTRA_FIELD_LENGTH]:
														
 
															+                zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
														
 
															+
														
 
															+            if zinfo.flag_bits & 0x20:
														
 
															+                # Zip 2.7: compressed patched data
														
 
															+                raise NotImplementedError("compressed patched data (flag bit 5)")
														
 
															+
														
 
															+            if zinfo.flag_bits & 0x40:
														
 
															+                # strong encryption
														
 
															+                raise NotImplementedError("strong encryption (flag bit 6)")
														
 
															+
														
 
															+            if zinfo.flag_bits & 0x800:
														
 
															+                # UTF-8 filename
														
 
															+                fname_str = fname.decode("utf-8")
														
 
															+            else:
														
 
															+                fname_str = fname.decode("cp437")
														
 
															+
														
 
															+            print('zinfo.has_changed_name', zinfo.has_changed_name)
														
 
															+            if not zinfo.has_changed_name:
														
 
															+                if fname_str != zinfo.orig_filename:
														
 
															+                    raise BadZipFile(
														
 
															+                        'File name in directory %r and header %r differ.'
														
 
															+                        % (zinfo.orig_filename, fname))
														
 
															+
														
 
															+            # check for encrypted flag & handle password
														
 
															+            is_encrypted = zinfo.flag_bits & 0x1
														
 
															+            if is_encrypted:
														
 
															+                if not pwd:
														
 
															+                    pwd = self.pwd
														
 
															+                if not pwd:
														
 
															+                    raise RuntimeError("File %r is encrypted, password "
														
 
															+                                       "required for extraction" % name)
														
 
															+            else:
														
 
															+                pwd = None
														
 
															+
														
 
															+            return ZipExtFile(zef_file, mode, zinfo, pwd, True)
														
 
															+        except:
														
 
															+            zef_file.close()
														
 
															+            raise
														
 
															+
														
 
															+    def _open_to_write(self, zinfo, force_zip64=False):
														
 
															+        if force_zip64 and not self._allowZip64:
														
 
															+            raise ValueError(
														
 
															+                "force_zip64 is True, but allowZip64 was False when opening "
														
 
															+                "the ZIP file."
														
 
															+            )
														
 
															+        if self._writing:
														
 
															+            raise ValueError("Can't write to the ZIP file while there is "
														
 
															+                             "another write handle open on it. "
														
 
															+                             "Close the first handle before opening another.")
														
 
															+
														
 
															+        # Sizes and CRC are overwritten with correct data after processing the file
														
 
															+        if not hasattr(zinfo, 'file_size'):
														
 
															+            zinfo.file_size = 0
														
 
															+        zinfo.compress_size = 0
														
 
															+        zinfo.CRC = 0
														
 
															+
														
 
															+        zinfo.flag_bits = 0x00
														
 
															+        if zinfo.compress_type == ZIP_LZMA:
														
 
															+            # Compressed data includes an end-of-stream (EOS) marker
														
 
															+            zinfo.flag_bits |= 0x02
														
 
															+        if not self._seekable:
														
 
															+            zinfo.flag_bits |= 0x08
														
 
															+
														
 
															+        if not zinfo.external_attr:
														
 
															+            zinfo.external_attr = 0o600 << 16  # permissions: ?rw-------
														
 
															+
														
 
															+        # Compressed size can be larger than uncompressed size
														
 
															+        zip64 = self._allowZip64 and \
														
 
															+                (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT)
														
 
															+
														
 
															+        if self._seekable:
														
 
															+            self.fp.seek(self.start_dir)
														
 
															+        zinfo.header_offset = self.fp.tell()
														
 
															+
														
 
															+        self._writecheck(zinfo)
														
 
															+        self._didModify = True
														
 
															+
														
 
															+        self.fp.write(zinfo.FileHeader(zip64))
														
 
															+
														
 
															+        self._writing = True
														
 
															+        return _ZipWriteFile(self, zinfo, zip64)
														
 
															+
														
 
															+    def extract(self, member, path=None, pwd=None):
														
 
															+        """Extract a member from the archive to the current working directory,
														
 
															+           using its full name. Its file information is extracted as accurately
														
 
															+           as possible. `member' may be a filename or a ZipInfo object. You can
														
 
															+           specify a different directory using `path'.
														
 
															+        """
														
 
															+        if path is None:
														
 
															+            path = os.getcwd()
														
 
															+        else:
														
 
															+            path = os.fspath(path)
														
 
															+
														
 
															+        return self._extract_member(member, path, pwd)
														
 
															+
														
 
															+    def extractall(self, path=None, members=None, pwd=None):
														
 
															+        """Extract all members from the archive to the current working
														
 
															+           directory. `path' specifies a different directory to extract to.
														
 
															+           `members' is optional and must be a subset of the list returned
														
 
															+           by namelist().
														
 
															+        """
														
 
															+        if members is None:
														
 
															+            members = self.namelist()
														
 
															+
														
 
															+        if path is None:
														
 
															+            path = os.getcwd()
														
 
															+        else:
														
 
															+            path = os.fspath(path)
														
 
															+
														
 
															+        for zipinfo in members:
														
 
															+            self._extract_member(zipinfo, path, pwd)
														
 
															+
														
 
															+    @classmethod
														
 
															+    def _sanitize_windows_name(cls, arcname, pathsep):
														
 
															+        """Replace bad characters and remove trailing dots from parts."""
														
 
															+        table = cls._windows_illegal_name_trans_table
														
 
															+        if not table:
														
 
															+            illegal = ':<>|"?*'
														
 
															+            table = str.maketrans(illegal, '_' * len(illegal))
														
 
															+            cls._windows_illegal_name_trans_table = table
														
 
															+        arcname = arcname.translate(table)
														
 
															+        # remove trailing dots
														
 
															+        arcname = (x.rstrip('.') for x in arcname.split(pathsep))
														
 
															+        # rejoin, removing empty parts.
														
 
															+        arcname = pathsep.join(x for x in arcname if x)
														
 
															+        return arcname
														
 
															+
														
 
															+    def _extract_member(self, member, targetpath, pwd):
														
 
															+        """Extract the ZipInfo object 'member' to a physical
														
 
															+           file on the path targetpath.
														
 
															+        """
														
 
															+        if not isinstance(member, ZipInfo):
														
 
															+            member = self.getinfo(member)
														
 
															+
														
 
															+        # build the destination pathname, replacing
														
 
															+        # forward slashes to platform specific separators.
														
 
															+        arcname = member.filename.replace('/', os.path.sep)
														
 
															+
														
 
															+        if os.path.altsep:
														
 
															+            arcname = arcname.replace(os.path.altsep, os.path.sep)
														
 
															+        # interpret absolute pathname as relative, remove drive letter or
														
 
															+        # UNC path, redundant separators, "." and ".." components.
														
 
															+        arcname = os.path.splitdrive(arcname)[1]
														
 
															+        invalid_path_parts = ('', os.path.curdir, os.path.pardir)
														
 
															+        arcname = os.path.sep.join(x for x in arcname.split(os.path.sep)
														
 
															+                                   if x not in invalid_path_parts)
														
 
															+        if os.path.sep == '\\':
														
 
															+            # filter illegal characters on Windows
														
 
															+            arcname = self._sanitize_windows_name(arcname, os.path.sep)
														
 
															+
														
 
															+        targetpath = os.path.join(targetpath, arcname)
														
 
															+        targetpath = os.path.normpath(targetpath)
														
 
															+
														
 
															+        # Create all upper directories if necessary.
														
 
															+        upperdirs = os.path.dirname(targetpath)
														
 
															+        if upperdirs and not os.path.exists(upperdirs):
														
 
															+            os.makedirs(upperdirs)
														
 
															+
														
 
															+        if member.is_dir():
														
 
															+            if not os.path.isdir(targetpath):
														
 
															+                os.mkdir(targetpath)
														
 
															+            return targetpath
														
 
															+
														
 
															+        with self.open(member, pwd=pwd) as source, \
														
 
															+             open(targetpath, "wb") as target:
														
 
															+            shutil.copyfileobj(source, target)
														
 
															+
														
 
															+        return targetpath
														
 
															+
														
 
															+    def _writecheck(self, zinfo):
														
 
															+        """Check for errors before writing a file to the archive."""
														
 
															+        if zinfo.filename in self.NameToInfo:
														
 
															+            import warnings
														
 
															+            warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3)
														
 
															+        if self.mode not in ('w', 'x', 'a'):
														
 
															+            raise ValueError("write() requires mode 'w', 'x', or 'a'")
														
 
															+        if not self.fp:
														
 
															+            raise ValueError(
														
 
															+                "Attempt to write ZIP archive that was already closed")
														
 
															+        _check_compression(zinfo.compress_type)
														
 
															+        if not self._allowZip64:
														
 
															+            requires_zip64 = None
														
 
															+            if len(self.filelist) >= ZIP_FILECOUNT_LIMIT:
														
 
															+                requires_zip64 = "Files count"
														
 
															+            elif zinfo.file_size > ZIP64_LIMIT:
														
 
															+                requires_zip64 = "Filesize"
														
 
															+            elif zinfo.header_offset > ZIP64_LIMIT:
														
 
															+                requires_zip64 = "Zipfile size"
														
 
															+            if requires_zip64:
														
 
															+                raise LargeZipFile(requires_zip64 +
														
 
															+                                   " would require ZIP64 extensions")
														
 
															+
														
 
															+    def write(self, filename, arcname=None,
														
 
															+              compress_type=None, compresslevel=None):
														
 
															+        """Put the bytes from filename into the archive under the name
														
 
															+        arcname."""
														
 
															+        if not self.fp:
														
 
															+            raise ValueError(
														
 
															+                "Attempt to write to ZIP archive that was already closed")
														
 
															+        if self._writing:
														
 
															+            raise ValueError(
														
 
															+                "Can't write to ZIP archive while an open writing handle exists"
														
 
															+            )
														
 
															+
														
 
															+        zinfo = ZipInfo.from_file(filename, arcname)
														
 
															+
														
 
															+        if zinfo.is_dir():
														
 
															+            zinfo.compress_size = 0
														
 
															+            zinfo.CRC = 0
														
 
															+        else:
														
 
															+            if compress_type is not None:
														
 
															+                zinfo.compress_type = compress_type
														
 
															+            else:
														
 
															+                zinfo.compress_type = self.compression
														
 
															+
														
 
															+            if compresslevel is not None:
														
 
															+                zinfo._compresslevel = compresslevel
														
 
															+            else:
														
 
															+                zinfo._compresslevel = self.compresslevel
														
 
															+
														
 
															+        if zinfo.is_dir():
														
 
															+            with self._lock:
														
 
															+                if self._seekable:
														
 
															+                    self.fp.seek(self.start_dir)
														
 
															+                zinfo.header_offset = self.fp.tell()  # Start of header bytes
														
 
															+                if zinfo.compress_type == ZIP_LZMA:
														
 
															+                # Compressed data includes an end-of-stream (EOS) marker
														
 
															+                    zinfo.flag_bits |= 0x02
														
 
															+
														
 
															+                self._writecheck(zinfo)
														
 
															+                self._didModify = True
														
 
															+
														
 
															+                self.filelist.append(zinfo)
														
 
															+                self.NameToInfo[zinfo.filename] = zinfo
														
 
															+                self.fp.write(zinfo.FileHeader(False))
														
 
															+                self.start_dir = self.fp.tell()
														
 
															+        else:
														
 
															+            with open(filename, "rb") as src, self.open(zinfo, 'w') as dest:
														
 
															+                shutil.copyfileobj(src, dest, 1024*8)
														
 
															+
														
 
															+    def writestr(self, zinfo_or_arcname, data,
														
 
															+                 compress_type=None, compresslevel=None):
														
 
															+        """Write a file into the archive.  The contents is 'data', which
														
 
															+        may be either a 'str' or a 'bytes' instance; if it is a 'str',
														
 
															+        it is encoded as UTF-8 first.
														
 
															+        'zinfo_or_arcname' is either a ZipInfo instance or
														
 
															+        the name of the file in the archive."""
														
 
															+        if isinstance(data, str):
														
 
															+            data = data.encode("utf-8")
														
 
															+        if not isinstance(zinfo_or_arcname, ZipInfo):
														
 
															+            zinfo = ZipInfo(filename=zinfo_or_arcname,
														
 
															+                            date_time=time.localtime(time.time())[:6])
														
 
															+            zinfo.compress_type = self.compression
														
 
															+            zinfo._compresslevel = self.compresslevel
														
 
															+            if zinfo.filename[-1] == '/':
														
 
															+                zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
														
 
															+                zinfo.external_attr |= 0x10           # MS-DOS directory flag
														
 
															+            else:
														
 
															+                zinfo.external_attr = 0o600 << 16     # ?rw-------
														
 
															+        else:
														
 
															+            zinfo = zinfo_or_arcname
														
 
															+
														
 
															+        if not self.fp:
														
 
															+            raise ValueError(
														
 
															+                "Attempt to write to ZIP archive that was already closed")
														
 
															+        if self._writing:
														
 
															+            raise ValueError(
														
 
															+                "Can't write to ZIP archive while an open writing handle exists."
														
 
															+            )
														
 
															+
														
 
															+        if compress_type is not None:
														
 
															+            zinfo.compress_type = compress_type
														
 
															+
														
 
															+        if compresslevel is not None:
														
 
															+            zinfo._compresslevel = compresslevel
														
 
															+
														
 
															+        zinfo.file_size = len(data)            # Uncompressed size
														
 
															+        with self._lock:
														
 
															+            with self.open(zinfo, mode='w') as dest:
														
 
															+                dest.write(data)
														
 
															+
														
 
															+    def __del__(self):
														
 
															+        """Call the "close()" method in case the user forgot."""
														
 
															+        self.close()
														
 
															+
														
 
															+    def close(self):
														
 
															+        """Close the file, and for mode 'w', 'x' and 'a' write the ending
														
 
															+        records."""
														
 
															+        if self.fp is None:
														
 
															+            return
														
 
															+
														
 
															+        if self._writing:
														
 
															+            raise ValueError("Can't close the ZIP file while there is "
														
 
															+                             "an open writing handle on it. "
														
 
															+                             "Close the writing handle before closing the zip.")
														
 
															+
														
 
															+        try:
														
 
															+            if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records
														
 
															+                with self._lock:
														
 
															+                    if self._seekable:
														
 
															+                        self.fp.seek(self.start_dir)
														
 
															+                    self._write_end_record()
														
 
															+        finally:
														
 
															+            fp = self.fp
														
 
															+            self.fp = None
														
 
															+            self._fpclose(fp)
														
 
															+
														
 
															+    def _write_end_record(self):
														
 
															+        for zinfo in self.filelist:         # write central directory
														
 
															+            dt = zinfo.date_time
														
 
															+            dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
														
 
															+            dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
														
 
															+            extra = []
														
 
															+            if zinfo.file_size > ZIP64_LIMIT \
														
 
															+               or zinfo.compress_size > ZIP64_LIMIT:
														
 
															+                extra.append(zinfo.file_size)
														
 
															+                extra.append(zinfo.compress_size)
														
 
															+                file_size = 0xffffffff
														
 
															+                compress_size = 0xffffffff
														
 
															+            else:
														
 
															+                file_size = zinfo.file_size
														
 
															+                compress_size = zinfo.compress_size
														
 
															+
														
 
															+            if zinfo.header_offset > ZIP64_LIMIT:
														
 
															+                extra.append(zinfo.header_offset)
														
 
															+                header_offset = 0xffffffff
														
 
															+            else:
														
 
															+                header_offset = zinfo.header_offset
														
 
															+
														
 
															+            extra_data = zinfo.extra
														
 
															+            min_version = 0
														
 
															+            if extra:
														
 
															+                # Append a ZIP64 field to the extra's
														
 
															+                extra_data = _strip_extra(extra_data, (1,))
														
 
															+                extra_data = struct.pack(
														
 
															+                    '<HH' + 'Q'*len(extra),
														
 
															+                    1, 8*len(extra), *extra) + extra_data
														
 
															+
														
 
															+                min_version = ZIP64_VERSION
														
 
															+
														
 
															+            if zinfo.compress_type == ZIP_BZIP2:
														
 
															+                min_version = max(BZIP2_VERSION, min_version)
														
 
															+            elif zinfo.compress_type == ZIP_LZMA:
														
 
															+                min_version = max(LZMA_VERSION, min_version)
														
 
															+
														
 
															+            extract_version = max(min_version, zinfo.extract_version)
														
 
															+            create_version = max(min_version, zinfo.create_version)
														
 
															+            try:
														
 
															+                filename, flag_bits = zinfo._encodeFilenameFlags()
														
 
															+                centdir = struct.pack(structCentralDir,
														
 
															+                                      stringCentralDir, create_version,
														
 
															+                                      zinfo.create_system, extract_version, zinfo.reserved,
														
 
															+                                      flag_bits, zinfo.compress_type, dostime, dosdate,
														
 
															+                                      zinfo.CRC, compress_size, file_size,
														
 
															+                                      len(filename), len(extra_data), len(zinfo.comment),
														
 
															+                                      0, zinfo.internal_attr, zinfo.external_attr,
														
 
															+                                      header_offset)
														
 
															+            except DeprecationWarning:
														
 
															+                print((structCentralDir, stringCentralDir, create_version,
														
 
															+                       zinfo.create_system, extract_version, zinfo.reserved,
														
 
															+                       zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
														
 
															+                       zinfo.CRC, compress_size, file_size,
														
 
															+                       len(zinfo.filename), len(extra_data), len(zinfo.comment),
														
 
															+                       0, zinfo.internal_attr, zinfo.external_attr,
														
 
															+                       header_offset), file=sys.stderr)
														
 
															+                raise
														
 
															+            self.fp.write(centdir)
														
 
															+            self.fp.write(filename)
														
 
															+            self.fp.write(extra_data)
														
 
															+            self.fp.write(zinfo.comment)
														
 
															+
														
 
															+        pos2 = self.fp.tell()
														
 
															+        # Write end-of-zip-archive record
														
 
															+        centDirCount = len(self.filelist)
														
 
															+        centDirSize = pos2 - self.start_dir
														
 
															+        centDirOffset = self.start_dir
														
 
															+        requires_zip64 = None
														
 
															+        if centDirCount > ZIP_FILECOUNT_LIMIT:
														
 
															+            requires_zip64 = "Files count"
														
 
															+        elif centDirOffset > ZIP64_LIMIT:
														
 
															+            requires_zip64 = "Central directory offset"
														
 
															+        elif centDirSize > ZIP64_LIMIT:
														
 
															+            requires_zip64 = "Central directory size"
														
 
															+        if requires_zip64:
														
 
															+            # Need to write the ZIP64 end-of-archive records
														
 
															+            if not self._allowZip64:
														
 
															+                raise LargeZipFile(requires_zip64 +
														
 
															+                                   " would require ZIP64 extensions")
														
 
															+            zip64endrec = struct.pack(
														
 
															+                structEndArchive64, stringEndArchive64,
														
 
															+                44, 45, 45, 0, 0, centDirCount, centDirCount,
														
 
															+                centDirSize, centDirOffset)
														
 
															+            self.fp.write(zip64endrec)
														
 
															+
														
 
															+            zip64locrec = struct.pack(
														
 
															+                structEndArchive64Locator,
														
 
															+                stringEndArchive64Locator, 0, pos2, 1)
														
 
															+            self.fp.write(zip64locrec)
														
 
															+            centDirCount = min(centDirCount, 0xFFFF)
														
 
															+            centDirSize = min(centDirSize, 0xFFFFFFFF)
														
 
															+            centDirOffset = min(centDirOffset, 0xFFFFFFFF)
														
 
															+
														
 
															+        endrec = struct.pack(structEndArchive, stringEndArchive,
														
 
															+                             0, 0, centDirCount, centDirCount,
														
 
															+                             centDirSize, centDirOffset, len(self._comment))
														
 
															+        self.fp.write(endrec)
														
 
															+        self.fp.write(self._comment)
														
 
															+        self.fp.flush()
														
 
															+
														
 
															+    def _fpclose(self, fp):
														
 
															+        assert self._fileRefCnt > 0
														
 
															+        self._fileRefCnt -= 1
														
 
															+        if not self._fileRefCnt and not self._filePassed:
														
 
															+            fp.close()
														
 
															+
														
 
															+
														
 
															+class PyZipFile(ZipFile):
														
 
															+    """Class to create ZIP archives with Python library files and packages."""
														
 
															+
														
 
															+    def __init__(self, file, mode="r", compression=ZIP_STORED,
														
 
															+                 allowZip64=True, optimize=-1):
														
 
															+        ZipFile.__init__(self, file, mode=mode, compression=compression,
														
 
															+                         allowZip64=allowZip64)
														
 
															+        self._optimize = optimize
														
 
															+
														
 
															+    def writepy(self, pathname, basename="", filterfunc=None):
														
 
															+        """Add all files from "pathname" to the ZIP archive.
														
 
															+
														
 
															+        If pathname is a package directory, search the directory and
														
 
															+        all package subdirectories recursively for all *.py and enter
														
 
															+        the modules into the archive.  If pathname is a plain
														
 
															+        directory, listdir *.py and enter all modules.  Else, pathname
														
 
															+        must be a Python *.py file and the module will be put into the
														
 
															+        archive.  Added modules are always module.pyc.
														
 
															+        This method will compile the module.py into module.pyc if
														
 
															+        necessary.
														
 
															+        If filterfunc(pathname) is given, it is called with every argument.
														
 
															+        When it is False, the file or directory is skipped.
														
 
															+        """
														
 
															+        pathname = os.fspath(pathname)
														
 
															+        if filterfunc and not filterfunc(pathname):
														
 
															+            if self.debug:
														
 
															+                label = 'path' if os.path.isdir(pathname) else 'file'
														
 
															+                print('%s %r skipped by filterfunc' % (label, pathname))
														
 
															+            return
														
 
															+        dir, name = os.path.split(pathname)
														
 
															+        if os.path.isdir(pathname):
														
 
															+            initname = os.path.join(pathname, "__init__.py")
														
 
															+            if os.path.isfile(initname):
														
 
															+                # This is a package directory, add it
														
 
															+                if basename:
														
 
															+                    basename = "%s/%s" % (basename, name)
														
 
															+                else:
														
 
															+                    basename = name
														
 
															+                if self.debug:
														
 
															+                    print("Adding package in", pathname, "as", basename)
														
 
															+                fname, arcname = self._get_codename(initname[0:-3], basename)
														
 
															+                if self.debug:
														
 
															+                    print("Adding", arcname)
														
 
															+                self.write(fname, arcname)
														
 
															+                dirlist = sorted(os.listdir(pathname))
														
 
															+                dirlist.remove("__init__.py")
														
 
															+                # Add all *.py files and package subdirectories
														
 
															+                for filename in dirlist:
														
 
															+                    path = os.path.join(pathname, filename)
														
 
															+                    root, ext = os.path.splitext(filename)
														
 
															+                    if os.path.isdir(path):
														
 
															+                        if os.path.isfile(os.path.join(path, "__init__.py")):
														
 
															+                            # This is a package directory, add it
														
 
															+                            self.writepy(path, basename,
														
 
															+                                         filterfunc=filterfunc)  # Recursive call
														
 
															+                    elif ext == ".py":
														
 
															+                        if filterfunc and not filterfunc(path):
														
 
															+                            if self.debug:
														
 
															+                                print('file %r skipped by filterfunc' % path)
														
 
															+                            continue
														
 
															+                        fname, arcname = self._get_codename(path[0:-3],
														
 
															+                                                            basename)
														
 
															+                        if self.debug:
														
 
															+                            print("Adding", arcname)
														
 
															+                        self.write(fname, arcname)
														
 
															+            else:
														
 
															+                # This is NOT a package directory, add its files at top level
														
 
															+                if self.debug:
														
 
															+                    print("Adding files from directory", pathname)
														
 
															+                for filename in sorted(os.listdir(pathname)):
														
 
															+                    path = os.path.join(pathname, filename)
														
 
															+                    root, ext = os.path.splitext(filename)
														
 
															+                    if ext == ".py":
														
 
															+                        if filterfunc and not filterfunc(path):
														
 
															+                            if self.debug:
														
 
															+                                print('file %r skipped by filterfunc' % path)
														
 
															+                            continue
														
 
															+                        fname, arcname = self._get_codename(path[0:-3],
														
 
															+                                                            basename)
														
 
															+                        if self.debug:
														
 
															+                            print("Adding", arcname)
														
 
															+                        self.write(fname, arcname)
														
 
															+        else:
														
 
															+            if pathname[-3:] != ".py":
														
 
															+                raise RuntimeError(
														
 
															+                    'Files added with writepy() must end with ".py"')
														
 
															+            fname, arcname = self._get_codename(pathname[0:-3], basename)
														
 
															+            if self.debug:
														
 
															+                print("Adding file", arcname)
														
 
															+            self.write(fname, arcname)
														
 
															+
														
 
															+    def _get_codename(self, pathname, basename):
														
 
															+        """Return (filename, archivename) for the path.
														
 
															+
														
 
															+        Given a module name path, return the correct file path and
														
 
															+        archive name, compiling if necessary.  For example, given
														
 
															+        /python/lib/string, return (/python/lib/string.pyc, string).
														
 
															+        """
														
 
															+        def _compile(file, optimize=-1):
														
 
															+            import py_compile
														
 
															+            if self.debug:
														
 
															+                print("Compiling", file)
														
 
															+            try:
														
 
															+                py_compile.compile(file, doraise=True, optimize=optimize)
														
 
															+            except py_compile.PyCompileError as err:
														
 
															+                print(err.msg)
														
 
															+                return False
														
 
															+            return True
														
 
															+
														
 
															+        file_py  = pathname + ".py"
														
 
															+        file_pyc = pathname + ".pyc"
														
 
															+        pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='')
														
 
															+        pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1)
														
 
															+        pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2)
														
 
															+        if self._optimize == -1:
														
 
															+            # legacy mode: use whatever file is present
														
 
															+            if (os.path.isfile(file_pyc) and
														
 
															+                  os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime):
														
 
															+                # Use .pyc file.
														
 
															+                arcname = fname = file_pyc
														
 
															+            elif (os.path.isfile(pycache_opt0) and
														
 
															+                  os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime):
														
 
															+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
														
 
															+                # file name in the archive.
														
 
															+                fname = pycache_opt0
														
 
															+                arcname = file_pyc
														
 
															+            elif (os.path.isfile(pycache_opt1) and
														
 
															+                  os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime):
														
 
															+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
														
 
															+                # file name in the archive.
														
 
															+                fname = pycache_opt1
														
 
															+                arcname = file_pyc
														
 
															+            elif (os.path.isfile(pycache_opt2) and
														
 
															+                  os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime):
														
 
															+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
														
 
															+                # file name in the archive.
														
 
															+                fname = pycache_opt2
														
 
															+                arcname = file_pyc
														
 
															+            else:
														
 
															+                # Compile py into PEP 3147 pyc file.
														
 
															+                if _compile(file_py):
														
 
															+                    if sys.flags.optimize == 0:
														
 
															+                        fname = pycache_opt0
														
 
															+                    elif sys.flags.optimize == 1:
														
 
															+                        fname = pycache_opt1
														
 
															+                    else:
														
 
															+                        fname = pycache_opt2
														
 
															+                    arcname = file_pyc
														
 
															+                else:
														
 
															+                    fname = arcname = file_py
														
 
															+        else:
														
 
															+            # new mode: use given optimization level
														
 
															+            if self._optimize == 0:
														
 
															+                fname = pycache_opt0
														
 
															+                arcname = file_pyc
														
 
															+            else:
														
 
															+                arcname = file_pyc
														
 
															+                if self._optimize == 1:
														
 
															+                    fname = pycache_opt1
														
 
															+                elif self._optimize == 2:
														
 
															+                    fname = pycache_opt2
														
 
															+                else:
														
 
															+                    msg = "invalid value for 'optimize': {!r}".format(self._optimize)
														
 
															+                    raise ValueError(msg)
														
 
															+            if not (os.path.isfile(fname) and
														
 
															+                    os.stat(fname).st_mtime >= os.stat(file_py).st_mtime):
														
 
															+                if not _compile(file_py, optimize=self._optimize):
														
 
															+                    fname = arcname = file_py
														
 
															+        archivename = os.path.split(arcname)[1]
														
 
															+        if basename:
														
 
															+            archivename = "%s/%s" % (basename, archivename)
														
 
															+        return (fname, archivename)
														
 
															+
														
 
															+
														
 
															+def main(args=None):
														
 
															+    import argparse
														
 
															+
														
 
															+    description = 'A simple command-line interface for zipfile module.'
														
 
															+    parser = argparse.ArgumentParser(description=description)
														
 
															+    group = parser.add_mutually_exclusive_group(required=True)
														
 
															+    group.add_argument('-l', '--list', metavar='<zipfile>',
														
 
															+                       help='Show listing of a zipfile')
														
 
															+    group.add_argument('-e', '--extract', nargs=2,
														
 
															+                       metavar=('<zipfile>', '<output_dir>'),
														
 
															+                       help='Extract zipfile into target dir')
														
 
															+    group.add_argument('-c', '--create', nargs='+',
														
 
															+                       metavar=('<name>', '<file>'),
														
 
															+                       help='Create zipfile from sources')
														
 
															+    group.add_argument('-t', '--test', metavar='<zipfile>',
														
 
															+                       help='Test if a zipfile is valid')
														
 
															+    args = parser.parse_args(args)
														
 
															+
														
 
															+    if args.test is not None:
														
 
															+        src = args.test
														
 
															+        with ZipFile(src, 'r') as zf:
														
 
															+            badfile = zf.testzip()
														
 
															+        if badfile:
														
 
															+            print("The following enclosed file is corrupted: {!r}".format(badfile))
														
 
															+        print("Done testing")
														
 
															+
														
 
															+    elif args.list is not None:
														
 
															+        src = args.list
														
 
															+        with ZipFile(src, 'r') as zf:
														
 
															+            zf.printdir()
														
 
															+
														
 
															+    elif args.extract is not None:
														
 
															+        src, curdir = args.extract
														
 
															+        with ZipFile(src, 'r') as zf:
														
 
															+            zf.extractall(curdir)
														
 
															+
														
 
															+    elif args.create is not None:
														
 
															+        zip_name = args.create.pop(0)
														
 
															+        files = args.create
														
 
															+
														
 
															+        def addToZip(zf, path, zippath):
														
 
															+            if os.path.isfile(path):
														
 
															+                zf.write(path, zippath, ZIP_DEFLATED)
														
 
															+            elif os.path.isdir(path):
														
 
															+                if zippath:
														
 
															+                    zf.write(path, zippath)
														
 
															+                for nm in sorted(os.listdir(path)):
														
 
															+                    addToZip(zf,
														
 
															+                             os.path.join(path, nm), os.path.join(zippath, nm))
														
 
															+            # else: ignore
														
 
															+
														
 
															+        with ZipFile(zip_name, 'w') as zf:
														
 
															+            for path in files:
														
 
															+                zippath = os.path.basename(path)
														
 
															+                if not zippath:
														
 
															+                    zippath = os.path.basename(os.path.dirname(path))
														
 
															+                if zippath in ('', os.curdir, os.pardir):
														
 
															+                    zippath = ''
														
 
															+                addToZip(zf, path, zippath)
														
 
															+
														
 
# Run the command-line interface only when this file is executed as a
# script; importing the module must not trigger argument parsing.
if __name__ == "__main__":
    main()
														
--- a/otr/otr_interface.py
+++ b/otr/otr_interface.py
@@ -411,40 +411,6 @@ if __name__ == '__main__':
 
															     else:
														
 
															         port = 18000
														
 
															         using_gpu_index = 0
														
 
															-    _global._init()
														
 
															-    _global.update({"port": str(port)})
														
 
															-    globals().update({"port": str(port)})
														
 
															-
														
 
															-    # 日志格式设置
														
 
															-    # ip = get_intranet_ip()
														
 
															-    # logging.basicConfig(level=logging.INFO,
														
 
															-    #                     format='%(asctime)s - %(name)s - %(levelname)s - '
														
 
															-    #                            + ip + ' - ' + str(port) + ' - %(message)s')
														
 
															-    logging.info(get_platform())
														
 
															-    # 限制tensorflow显存
														
 
															-    # os.environ['CUDA_VISIBLE_DEVICES'] = str(using_gpu_index)
														
 
															-    # import tensorflow as tf
														
 
															-    # if get_platform() != "Windows":
														
 
															-    #     _version = tf.__version__
														
 
															-    #     logging.info(str(_version))
														
 
															-    #     memory_limit_scale = 0.3
														
 
															-    #     # tensorflow 1.x
														
 
															-    #     if str(_version)[0] == "1":
														
 
															-    #         logging.info("1.x " + str(_version))
														
 
															-    #         os.environ['CUDA_CACHE_MAXSIZE'] = str(2147483648)
														
 
															-    #         os.environ['CUDA_CACHE_DISABLE'] = str(0)
														
 
															-    #         gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=memory_limit_scale)
														
 
															-    #         sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
														
 
															-    #
														
 
															-    #     # tensorflow 2.x
														
 
															-    #     elif str(_version)[0] == "2":
														
 
															-    #         logging.info("2.x " + str(_version))
														
 
															-            # config = tf.compat.v1.ConfigProto()
														
 
															-            # config.gpu_options.per_process_gpu_memory_fraction = memory_limit_scale
														
 
															-            # config.gpu_options.allow_growth = True
														
 
															-            # sess = tf.compat.v1.Session(config=config)
														
 
															-
														
 
															-
														
 
															     # app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
														
 
															     app.run()
														
 
															     log("OTR running "+str(port))
														
--- a/otr/table_line.py
+++ b/otr/table_line.py
--- a/result.html
+++ b/result.html
	`@@ -0,0 +1 @@`
			`+kill -9 $(lsof -i:15010\|sed -n '2,$p'\|awk '{print $2}'\|tr '\n' ' ')`