3 жил өмнө · 7741734a8c
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -364,6 +364,12 @@ def _convert():
 
				     {[-5], 0}: 整个转换过程超时
			
 
				     {[-6], 0}: 阿里云UDF队列超时
			
 
				     {[-7], 1}: 文件需密码，无法打开
			
 
				+    {[-8], 0}: 调用现成接口报错
			
 
				+    {[-9], 0}: 接口接收数据为空
			
 
				+    {[-10], 0}: 长图分割报错
			
 
				+    {[-11], 0}: 新接口idc、isr、atc报错
			
 
				+    {[-12], 0}: 表格跨页连接报错
			
 
				+    {[-13], 0}: pdf表格线处理报错
			
 
				     :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
			
 
				     """
			
 
				 
			
--- a/format_convert/convert_doc.py
+++ b/format_convert/convert_doc.py
@@ -1,8 +1,9 @@
 
				 import inspect
			
 
				 import os
			
 
				 import sys
			
 
				+from bs4 import BeautifulSoup
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				-from format_convert.convert_tree import _Document
			
 
				+from format_convert.convert_tree import _Document, _Sentence, _Page
			
 
				 import logging
			
 
				 import traceback
			
 
				 from format_convert import get_memory_info
			
@@ -35,14 +36,31 @@ class DocConvert:
 
				         self.unique_type_dir = unique_type_dir
			
 
				 
			
 
				     def convert(self):
			
 
				-        # 调用office格式转换
			
 
				-        file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
			
 
				-        if judge_error_code(file_path):
			
 
				-            self._doc.error_code = file_path
			
 
				-            return
			
 
				-        _docx = DocxConvert(file_path, self.unique_type_dir)
			
 
				-        _docx.convert()
			
 
				-        self._doc = _docx._doc
			
 
				+        # 先判断特殊doc文件，可能是html文本
			
 
				+        is_html_doc = False
			
 
				+        try:
			
 
				+            with open(self.path, 'r') as f:
			
 
				+                html_str = f.read()
			
 
				+            soup = BeautifulSoup(html_str, 'lxml')
			
 
				+            text = soup.text
			
 
				+            is_html_doc = True
			
 
				+        except:
			
 
				+            pass
			
 
				+
			
 
				+        if is_html_doc:
			
 
				+            self._page = _Page(None, 0)
			
 
				+            _sen = _Sentence(text, (0, 0, 0, 0))
			
 
				+            self._page.add_child(_sen)
			
 
				+            self._doc.add_child(self._page)
			
 
				+        else:
			
 
				+            # 调用office格式转换
			
 
				+            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
			
 
				+            if judge_error_code(file_path):
			
 
				+                self._doc.error_code = file_path
			
 
				+                return
			
 
				+            _docx = DocxConvert(file_path, self.unique_type_dir)
			
 
				+            _docx.convert()
			
 
				+            self._doc = _docx._doc
			
 
				 
			
 
				     def get_html(self):
			
 
				         try:
			
@@ -52,5 +70,10 @@ class DocConvert:
 
				             self._doc.error_code = [-1]
			
 
				         if self._doc.error_code is not None:
			
 
				             return self._doc.error_code
			
 
				-        print(self._doc.children)
			
 
				+        # print(self._doc.children)
			
 
				         return self._doc.get_html()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
			
 
				+    print(c.get_html())
			
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -10,7 +10,7 @@ import xml
 
				 import zipfile
			
 
				 import docx
			
 
				 from format_convert.convert_image import picture2text
			
 
				-from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator
			
 
				+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
			
 
				 from format_convert.wrapt_timeout_decorator import timeout
			
 
				 
			
 
				 
			
@@ -325,6 +325,18 @@ class DocxConvert:
 
				             return
			
 
				         order_list, text_list = order_and_text_list
			
 
				 
			
 
				+        self._page = _Page(None, 0)
			
 
				+
			
 
				+        # 乱码返回文件格式错误
			
 
				+        match1 = re.findall(get_garble_code(), ''.join(text_list))
			
 
				+        if len(match1) > 10:
			
 
				+            log("doc/docx garbled code!")
			
 
				+            # self._doc.error_code = [-3]
			
 
				+            _sen = _Sentence('文件乱码！', (0, 0, 0, 0))
			
 
				+            self._page.add_child(_sen)
			
 
				+            self._doc.add_child(self._page)
			
 
				+            return
			
 
				+
			
 
				         # test
			
 
				         # for i in range(len(text_list)):
			
 
				         #     print(order_list[i], text_list[i])
			
@@ -338,7 +350,6 @@ class DocxConvert:
 
				 
			
 
				         image_list = self.get_images()
			
 
				 
			
 
				-        self._page = _Page(None, 0)
			
 
				         order_y = 0
			
 
				         doc_pr_cnt = 0
			
 
				         for tag in order_list:
			
@@ -427,3 +438,8 @@ class DocxConvert:
 
				         if self._doc.error_code is not None:
			
 
				             return self._doc.error_code
			
 
				         return self._doc.get_html()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    c = DocxConvert("C:/Users/Administrator/Downloads/1631944542835.docx", "C:/Users/Administrator/Downloads/1/")
			
 
				+    print(c.get_html())
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -224,6 +224,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, u
 
				             # for _textbox in list_text_boxes:
			
 
				             #     print("==",_textbox.get_text())
			
 
				             lt = LineTable()
			
 
				+            # print('text_list', text_list)
			
 
				+            # print('bbox_list', bbox_list)
			
 
				+            # print('list_line', list_line)
			
 
				             tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines, False)
			
 
				 
			
 
				             # 合并同一行textbox
			
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -5,6 +5,9 @@ import logging
 
				 import os
			
 
				 import re
			
 
				 import sys
			
 
				+
			
 
				+from bs4 import BeautifulSoup
			
 
				+
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				 from pdfplumber import PDF
			
 
				 from pdfplumber.table import TableFinder
			
@@ -12,6 +15,10 @@ from pdfplumber.page import Page as pdfPage
 
				 from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table
			
 
				 import time
			
 
				 import pdfminer
			
 
				+import math
			
 
				+from scipy.stats import linregress
			
 
				+from matplotlib import pyplot as plt
			
 
				+from shapely.geometry import LineString, Point
			
 
				 from format_convert import timeout_decorator
			
 
				 from PIL import Image
			
 
				 from format_convert.convert_image import image_process
			
@@ -26,9 +33,9 @@ from pdfminer.pdfpage import PDFPage
 
				 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
			
 
				 from pdfminer.converter import PDFPageAggregator
			
 
				 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
			
 
				-    LTTextBoxVertical, LTLine
			
 
				+    LTTextBoxVertical, LTLine, LTTextContainer
			
 
				 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
			
 
				-    get_logger, log, memory_decorator,draw_lines_plt
			
 
				+    get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross
			
 
				 import fitz
			
 
				 from format_convert.wrapt_timeout_decorator import timeout
			
 
				 
			
@@ -100,9 +107,9 @@ def pdf2Image(path, save_dir):
 
				 def pdf_analyze(interpreter, page, device, page_no):
			
 
				     log("into pdf_analyze")
			
 
				     pdf_time = time.time()
			
 
				-    print("pdf_analyze interpreter process...")
			
 
				+    # print("pdf_analyze interpreter process...")
			
 
				     interpreter.process_page(page)
			
 
				-    print("pdf_analyze device get_result...")
			
 
				+    # print("pdf_analyze device get_result...")
			
 
				     layout = device.get_result()
			
 
				     log("pdf2text page " + str(page_no) + " read time " + str(time.time() - pdf_time))
			
 
				     return layout
			
@@ -389,7 +396,7 @@ def pdf2text(path, unique_type_dir):
 
				                                 #         image_stream = ff.read()
			
 
				                                 except Exception:
			
 
				                                     log("pdf2text pdfminer read image in page " + str(page_no) +
			
 
				-                                                 "  fail! use pymupdf read image...")
			
 
				+                                        "  fail! use pymupdf read image...")
			
 
				                                     # print(traceback.print_exc())
			
 
				                                     image_text = page_info_dict.get(page_no)[0]
			
 
				                                     if image_text is None:
			
@@ -476,7 +483,7 @@ def pdf2text(path, unique_type_dir):
 
				         return [-3]
			
 
				     except Exception as e:
			
 
				         log("pdf2text error!")
			
 
				-        print("pdf2text", traceback.print_exc())
			
 
				+        traceback.print_exc()
			
 
				         return [-1]
			
 
				 
			
 
				 
			
@@ -497,7 +504,7 @@ def get_single_pdf(path, page_no):
 
				         raise e
			
 
				     except Exception as e:
			
 
				         log("get_single_pdf error! page " + str(page_no))
			
 
				-        print("get_single_pdf", traceback.print_exc())
			
 
				+        traceback.print_exc()
			
 
				         raise e
			
 
				 
			
 
				 
			
@@ -578,7 +585,7 @@ def page_table_connect(has_table_dict):
 
				     except Exception as e:
			
 
				         # print("page_table_connect", e)
			
 
				         log("page_table_connect error!")
			
 
				-        print("page_table_connect", traceback.print_exc())
			
 
				+        traceback.print_exc()
			
 
				         return [-1], [-1]
			
 
				 
			
 
				 
			
@@ -589,7 +596,7 @@ def read_pdf(path, package_name, packages):
 
				                         char_margin=0.3,
			
 
				                         line_margin=0.01,
			
 
				                         word_margin=0.01,
			
 
				-                        boxes_flow=0.1,)
			
 
				+                        boxes_flow=0.1, )
			
 
				 
			
 
				     if package_name == packages[0]:
			
 
				         fp = open(path, 'rb')
			
@@ -668,7 +675,7 @@ class PDFConvert:
 
				                                 char_margin=0.3,
			
 
				                                 line_margin=0.01,
			
 
				                                 word_margin=0.01,
			
 
				-                                boxes_flow=0.1,)
			
 
				+                                boxes_flow=0.1, )
			
 
				             if package_name == self.packages[0]:
			
 
				                 # fp = open(self.path, 'rb')
			
 
				                 # parser = PDFParser(fp)
			
@@ -702,7 +709,7 @@ class PDFConvert:
 
				                 self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
			
 
				                 self.has_init_pdf[3] = 0
			
 
				             else:
			
 
				-                print("Only Support Packages", str(self.packages))
			
 
				+                log("Only Support Packages " + str(self.packages))
			
 
				                 raise Exception
			
 
				         except Exception as e:
			
 
				             log(package_name + " cannot open pdf!")
			
@@ -766,29 +773,401 @@ class PDFConvert:
 
				             self._doc.add_child(self._page)
			
 
				             page_no += 1
			
 
				 
			
 
    def clean_text(self, _text):
        """Return ``_text`` with every whitespace character removed."""
        # Raw string: "\s" in a normal string literal is an invalid escape.
        return re.sub(r"\s", "", _text)
			
 
				 
			
 
				-    def clean_text(self,_text):
			
 
				-
			
 
				-        return re.sub("\s","",_text)
			
 
				-
			
 
				-
			
 
				-    def get_text_lines(self,page,page_no):
			
 
    def get_text_lines(self, page, page_no):
        """Collect the table edges pdfplumber finds on a page as ``LTLine`` objects.

        Edges whose ``linewidth`` is zero are dropped, unless no edge on the
        page carries a positive ``linewidth``, in which case all are kept.
        An edge without a ``linewidth`` (missing or ``None``) is always kept.

        Side effect: advances ``self.doc_top`` by the page height.

        :param page: page object handed to pdfplumber's ``Page``
        :param page_no: page number, passed to pdfplumber and used in the log
        :return: list of ``LTLine``
        """
        page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
        self.doc_top += page_plumber.height

        # Ask for the edges once and reuse them for both passes.
        edges = TableFinder(page_plumber).get_edges()

        # True when no edge has an explicit, positive line width.
        all_width_zero = not any((_edge.get('linewidth') or 0) > 0 for _edge in edges)

        lt_line_list = []
        for _edge in edges:
            line_width = _edge.get('linewidth', 0.1)
            if line_width is None:
                # Present-but-None is treated like a missing width instead of
                # raising TypeError on the comparison below.
                line_width = 0.1
            if line_width > 0 or all_width_zero:
                lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
                                           (float(_edge["x1"]), float(_edge["y1"]))))
        log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
        return lt_line_list
			
 
				+
			
 
				+    def get_page_lines(self, layout, page_no):
			
 
				+        def _plot(_line_list, mode=1):
			
 
				+            for _line in _line_list:
			
 
				+                if mode == 1:
			
 
				+                    x0, y0, x1, y1 = _line.__dict__.get("bbox")
			
 
				+                elif mode == 2:
			
 
				+                    x0, y0, x1, y1 = _line
			
 
				+                plt.plot([x0, x1], [y0, y1])
			
 
				+            plt.show()
			
 
				+            return
			
 
				+
			
 
				+        def is_cross(A, B, C, D):
			
 
				+            if A[0] == B[0] == C[0] == D[0]:
			
 
				+                if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
			
 
				+                        or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
			
 
				+                    return True
			
 
				+            if A[1] == B[1] == C[1] == D[1]:
			
 
				+                if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
			
 
				+                        or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
			
 
				+                    return True
			
 
				+
			
 
				+            line1 = LineString([A, B])
			
 
				+            line2 = LineString([C, D])
			
 
				+
			
 
				+            int_pt = line1.intersection(line2)
			
 
				+            try:
			
 
				+                point_of_intersection = int_pt.x, int_pt.y
			
 
				+                return True
			
 
				+            except:
			
 
				+                return False
			
 
				+
			
 
				+        def calculate_k(bbox):
			
 
				+            x = [bbox[0], bbox[2]]
			
 
				+            y = [bbox[1], bbox[3]]
			
 
				+            slope, intercept, r_value, p_value, std_err = linregress(x, y)
			
 
				+            # print('k', slope)
			
 
				+            if math.isnan(slope):
			
 
				+                slope = 0
			
 
				+            return slope
			
 
				+
			
 
				+        def line_iou(line1, line2, axis=0):
			
 
				+            inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
			
 
				+            # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
			
 
				+            union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
			
 
				+            if union in [0, 0.]:
			
 
				+                iou = 0.
			
 
				+            else:
			
 
				+                iou = inter / union
			
 
				+            return iou
			
 
				+
			
 
				+        def get_cross_line(_line_list, threshold=1, cross_times=0):
			
 
				+            # 根据是否有交点判断表格线
			
 
				+            _cross_line_list = []
			
 
				+            for line1 in _line_list:
			
 
				+                if line1 in _cross_line_list:
			
 
				+                    continue
			
 
				+                if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
			
 
				+                    p1 = [max(0, line1[0] - threshold), line1[1]]
			
 
				+                    p2 = [min(line1[2] + threshold, page_w), line1[3]]
			
 
				+                else:
			
 
				+                    p1 = [line1[0], max(0, line1[1] - threshold)]
			
 
				+                    p2 = [line1[2], min(line1[3] + threshold, page_h)]
			
 
				+                line1 = [p1[0], p1[1], p2[0], p2[1]]
			
 
				+                _times = 0
			
 
				+                for line2 in _line_list:
			
 
				+                    if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
			
 
				+                        p3 = [max(0, line2[0] - threshold), line2[1]]
			
 
				+                        p4 = [min(line2[2] + threshold, page_w), line2[3]]
			
 
				+                    else:
			
 
				+                        p3 = [line2[0], max(0, line2[1] - threshold)]
			
 
				+                        p4 = [line2[2], min(line2[3] + threshold, page_h)]
			
 
				+                    line2 = [p3[0], p3[1], p4[0], p4[1]]
			
 
				+                    if line1 == line2:
			
 
				+                        continue
			
 
				+                    if is_cross(p1, p2, p3, p4):
			
 
				+                        _times += 1
			
 
				+                        if _times >= cross_times:
			
 
				+                            _cross_line_list += [line1]
			
 
				+                            break
			
 
				+            return _cross_line_list
			
 
				+
			
 
				+        def repair_bias_line(_line_list):
			
 
				+            temp_list = []
			
 
				+            for line in _line_list:
			
 
				+                x0, y0, x1, y1 = line
			
 
				+                _y = min(y0, y1)
			
 
				+                _x = min(x0, x1)
			
 
				+                if abs(x0 - x1) > abs(y0 - y1):
			
 
				+                    temp_list.append([x0, _y, x1, _y])
			
 
				+                else:
			
 
				+                    temp_list.append([_x, y0, _x, y1])
			
 
				+            _line_list = temp_list
			
 
				+            return _line_list
			
 
				+
			
 
				+        def repair_col_line(_straight_list, _bias_list, threshold=2, min_width=7):
			
 
				+            if not _straight_list or not _bias_list:
			
 
				+                print('add_col_bias_line empty', len(_straight_list), len(_bias_list))
			
 
				+                return []
			
 
				+
			
 
				+            # 分列
			
 
				+            _straight_list.sort(key=lambda x: (x[0], x[1]))
			
 
				+            cols = []
			
 
				+            col = []
			
 
				+            current_w = _straight_list[0][0]
			
 
				+            for line in _straight_list:
			
 
				+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
			
 
				+                    continue
			
 
				+                if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold:
			
 
				+                    col.append(line)
			
 
				+                else:
			
 
				+                    if col:
			
 
				+                        cols.append(col)
			
 
				+                    col = [line]
			
 
				+                    current_w = line[0]
			
 
				+            if col:
			
 
				+                cols.append(col)
			
 
				+
			
 
				+            # 补充col
			
 
				+            new_list = []
			
 
				+            for line in bias_line_list:
			
 
				+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
			
 
				+                    continue
			
 
				+                for col in cols:
			
 
				+                    w = col[0][0]
			
 
				+                    if w - threshold <= line[0] <= w + threshold or w - threshold <= line[2] <= w + threshold:
			
 
				+                        new_list.append([w, line[1] - 3, w, line[3] + 3])
			
 
				+            new_list += _straight_list
			
 
				+
			
 
				+            # 去重
			
 
				+            new_list = [str(x) for x in new_list]
			
 
				+            new_list = list(set(new_list))
			
 
				+            new_list = [eval(x) for x in new_list]
			
 
				+
			
 
				+            # 分列
			
 
				+            new_list.sort(key=lambda x: (x[0], x[1]))
			
 
				+            cols = []
			
 
				+            col = []
			
 
				+            current_w = new_list[0][0]
			
 
				+            for line in new_list:
			
 
				+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
			
 
				+                    continue
			
 
				+                if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold:
			
 
				+                    col.append(line)
			
 
				+                else:
			
 
				+                    if col:
			
 
				+                        cols.append(col)
			
 
				+                    col = [line]
			
 
				+                    current_w = line[0]
			
 
				+            if col:
			
 
				+                cols.append(col)
			
 
				+
			
 
				+            # 删除col
			
 
				+            for col1 in cols:
			
 
				+                for col2 in cols:
			
 
				+                    if col1 == col2 or abs(col1[0][0] - col2[0][0]) > min_width:
			
 
				+                        continue
			
 
				+
			
 
				+                    col1_len, col2_len = 0, 0
			
 
				+                    for c in col1:
			
 
				+                        col1_len += abs(c[1] - c[3])
			
 
				+                    for c in col2:
			
 
				+                        col2_len += abs(c[1] - c[3])
			
 
				+                    if col1_len > col2_len * 3:
			
 
				+                        for c in col2:
			
 
				+                            if c in new_list:
			
 
				+                                new_list.remove(c)
			
 
				+                    if col2_len > col1_len * 3:
			
 
				+                        for c in col1:
			
 
				+                            if c in new_list:
			
 
				+                                new_list.remove(c)
			
 
				+            return new_list
			
 
				+
			
 
				+        def merge_line(_line_list, threshold=2):
			
 
				+            new_line_list = []
			
 
				+            # 分列
			
 
				+            _line_list.sort(key=lambda x: (x[0], x[1]))
			
 
				+            cols = []
			
 
				+            col = [_line_list[0]]
			
 
				+            current_w = _line_list[0][0]
			
 
				+            for line in _line_list:
			
 
				+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
			
 
				+                    continue
			
 
				+                if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
			
 
				+                        and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
			
 
				+                    col.append(line)
			
 
				+                else:
			
 
				+                    if col:
			
 
				+                        cols.append(col)
			
 
				+                    col = [line]
			
 
				+                    current_w = line[0]
			
 
				+            if col:
			
 
				+                cols.append(col)
			
 
				+
			
 
				+            for col in cols:
			
 
				+                temp_c = col[0]
			
 
				+                col_w = col[0][0]
			
 
				+                for i in range(len(col) - 1):
			
 
				+                    c = col[i]
			
 
				+                    next_c = col[i + 1]
			
 
				+                    if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]):
			
 
				+                        temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
			
 
				+                                  max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
			
 
				+                    else:
			
 
				+                        new_line_list.append(temp_c)
			
 
				+                        temp_c = next_c
			
 
				+                if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
			
 
				+                    new_line_list.append(temp_c)
			
 
				+
			
 
				+            # 分行
			
 
				+            _line_list.sort(key=lambda x: (x[1], x[0]))
			
 
				+            rows = []
			
 
				+            row = []
			
 
				+            current_h = _line_list[0][1]
			
 
				+            for line in _line_list:
			
 
				+                if abs(line[0] - line[2]) < abs(line[1] - line[3]):
			
 
				+                    continue
			
 
				+                if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
			
 
				+                    row.append(line)
			
 
				+                else:
			
 
				+                    if row:
			
 
				+                        rows.append(row)
			
 
				+                    row = [line]
			
 
				+                    current_h = line[1]
			
 
				+            if row:
			
 
				+                rows.append(row)
			
 
				+
			
 
				+            for row in rows:
			
 
				+                temp_r = row[0]
			
 
				+                row_h = row[0][1]
			
 
				+                for i in range(len(row) - 1):
			
 
				+                    r = row[i]
			
 
				+                    next_r = row[i + 1]
			
 
				+                    # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
			
 
				+                    if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0):
			
 
				+                        temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
			
 
				+                                  max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
			
 
				+                    else:
			
 
				+                        new_line_list.append(temp_r)
			
 
				+                        temp_r = next_r
			
 
				+                if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
			
 
				+                    new_line_list.append(temp_r)
			
 
				+            return new_line_list
			
 
				+
			
 
				+        def remove_outline_no_cross(_line_list):
			
 
				+            row_list = []
			
 
				+            col_list = []
			
 
				+            for line in _line_list:
			
 
				+                # 存所有行
			
 
				+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
			
 
				+                    row_list.append(line)
			
 
				+                # 存所有列
			
 
				+                if abs(line[0] - line[2]) < abs(line[1] - line[3]):
			
 
				+                    col_list.append(line)
			
 
				+
			
 
				+            if not col_list:
			
 
				+                return _line_list
			
 
				+
			
 
				+            # 左右两条边框
			
 
				+            col_list.sort(key=lambda x: (x[0], x[1]))
			
 
				+            left_col = col_list[0]
			
 
				+            right_col = col_list[-1]
			
 
				+
			
 
				+            # 判断有交点但中间区域无交点
			
 
				+            compare_list = []
			
 
				+            for col in [left_col, right_col]:
			
 
				+                add_h = abs(col[1]-col[3]) / 8
			
 
				+                center_area = [col[1]+add_h, col[3]-add_h]
			
 
				+                cross_cnt = 0
			
 
				+                center_cross_cnt = 0
			
 
				+                center_row_cnt = 0
			
 
				+                for row in row_list:
			
 
				+                    if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
			
 
				+                        if center_area[0] <= row[1] <= center_area[1]:
			
 
				+                            center_cross_cnt += 1
			
 
				+                        else:
			
 
				+                            cross_cnt += 1
			
 
				+                    else:
			
 
				+                        if center_area[0] <= row[1] <= center_area[1]:
			
 
				+                            center_row_cnt += 1
			
 
				+                compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
			
 
				+
			
 
				+            _flag = True
			
 
				+            for c in compare_list:
			
 
				+                if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
			
 
				+                    continue
			
 
				+                _flag = False
			
 
				+            print('compare_list', compare_list)
			
 
				+            if _flag and compare_list[0][1] == compare_list[1][1] \
			
 
				+                    and compare_list[0][2] == compare_list[1][2]:
			
 
				+                for col in [left_col, right_col]:
			
 
				+                    if col in _line_list:
			
 
				+                        _line_list.remove(col)
			
 
				+            return _line_list
			
 
				+
			
 
				+        log('into get_page_lines')
			
 
				+
			
 
				+        page_h = layout.height
			
 
				+        page_w = layout.width
			
 
				+
			
 
				+        element_list = []
			
 
				+        line_list = []
			
 
				+        bias_line_list = []
			
 
				+        text_bbox_list = []
			
 
				+        for element in layout:
			
 
				+            if isinstance(element, LTTextContainer):
			
 
				+                text_bbox_list.append(element.bbox)
			
 
				+
			
 
				+            # 只取这三种类型的bbox
			
 
				+            if isinstance(element, (LTRect, LTCurve, LTLine)):
			
 
				+                element_list.append(element)
			
 
				+                if element.height > 0.5 and element.width > 0.5:
			
 
				+                    # print('element.height, element.width', element.height, element.width)
			
 
				+                    k = calculate_k(element.bbox)
			
 
				+                    if 1.73 / 3 < abs(k) < 1.73:
			
 
				+                        continue
			
 
				+                    else:
			
 
				+                        bias_line_list.append(element.bbox)
			
 
				+                    continue
			
 
				+                line_list.append(element.bbox)
			
 
				+
			
 
				+        if not line_list and not bias_line_list:
			
 
				+            return []
			
 
				+
			
 
				+        # 是否使用斜线来生成表格
			
 
				+        if len(line_list) < 6 and len(bias_line_list) > len(line_list) * 2:
			
 
				+            # print('use bias line')
			
 
				+            # bias_line_list += add_col_bias_line(line_list, bias_line_list)
			
 
				+            line_list = bias_line_list
			
 
				+
			
 
				+        # 去重
			
 
				+        line_list = [str(x) for x in line_list]
			
 
				+        line_list = list(set(line_list))
			
 
				+        line_list = [eval(x) for x in line_list]
			
 
				+
			
 
				+        # 根据是否有交点判断表格线
			
 
				+        cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
			
 
				+
			
 
				+        if not cross_line_list:
			
 
				+            return []
			
 
				+
			
 
				+        # 斜线校正
			
 
				+        if cross_line_list:
			
 
				+            cross_line_list = repair_bias_line(cross_line_list)
			
 
				+
			
 
				+        # 修复竖线
			
 
				+        if bias_line_list:
			
 
				+            cross_line_list = repair_col_line(cross_line_list, bias_line_list)
			
 
				+
			
 
				+        # 根据是否有交点判断表格线
			
 
				+        cross_line_list = get_cross_line(cross_line_list, threshold=1, cross_times=1)
			
 
				+
			
 
				+        # 合并线条
			
 
				+        cross_line_list = merge_line(cross_line_list)
			
 
				+
			
 
				+        # 删除最外层嵌套边框
			
 
				+        cross_line_list = remove_outline_no_cross(cross_line_list)
			
 
				+        # show
			
 
				+        # print('len(cross_line_list)', len(cross_line_list))
			
 
				+        # _plot(line_list, mode=2)
			
 
				+        # _plot(cross_line_list, mode=2)
			
 
				+
			
 
				+        lt_line_list = []
			
 
				+        for line in cross_line_list:
			
 
				+            lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
			
 
				+                                       (float(line[2]), float(line[3]))))
			
 
				+        log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
			
 
				         return lt_line_list
			
 
				 
			
 
				-    def recognize_text(self,layout,page_no,lt_text_list,lt_line_list):
			
 
				+    def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
			
 
				         list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
			
 
				         self._page.in_table_objs = filter_objs
			
 
				 
			
 
				-        print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
			
 
				+        # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
			
 
				 
			
 
				         for table in list_tables:
			
 
				             _table = _Table(table["table"], table["bbox"])
			
@@ -804,7 +1183,7 @@ class PDFConvert:
 
				         # pdf对象需反向排序
			
 
				         self._page.is_reverse = True
			
 
				 
			
 
				-    def is_text_legal(self,lt_text_list,page_no):
			
 
				+    def is_text_legal(self, lt_text_list, page_no):
			
 
				         # 无法识别pdf字符编码，整页用ocr
			
 
				         text_temp = ""
			
 
				         for _t in lt_text_list:
			
@@ -819,6 +1198,19 @@ class PDFConvert:
 
				                 _image = _Image(page_image[1], page_image[0])
			
 
				                 self._page.add_child(_image)
			
 
				             return False
			
 
				+
			
 
				+        match1 = re.findall(get_garble_code(), text_temp)
			
 
				+        # match2 = re.search('[\u4e00-\u9fa5]', text_temp)
			
 
				+        if len(match1) > 3 and len(text_temp) > 10:
			
 
				+            log("pdf garbled code! try pymupdf... " + text_temp[:20])
			
 
				+            page_image = self.get_page_image(page_no)
			
 
				+            if judge_error_code(page_image):
			
 
				+                self._page.error_code = page_image
			
 
				+            else:
			
 
				+                _image = _Image(page_image[1], page_image[0])
			
 
				+                self._page.add_child(_image)
			
 
				+            return False
			
 
				+
			
 
				         return True
			
 
				 
			
 
				     def convert_page(self, page, page_no):
			
@@ -852,7 +1244,7 @@ class PDFConvert:
 
				                         lt_image_list.append(y)
			
 
				                         # image_count += 1
			
 
				         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
			
 
				-        print("convert_pdf page", page_no)
			
 
				+        log("convert_pdf page " + str(page_no))
			
 
				         log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
			
 
				 
			
 
				         # 若只有文本且图片数为0，直接提取文字及表格
			
@@ -873,14 +1265,18 @@ class PDFConvert:
 
				                     self._page.add_child(_image)
			
 
				                 return
			
 
				 
			
 
				-
			
 
				-            if not self.is_text_legal(lt_text_list,page_no):
			
 
				+            if not self.is_text_legal(lt_text_list, page_no):
			
 
				                 return
			
 
				 
			
 
				             try:
			
 
				-                lt_line_list = self.get_text_lines(page,page_no)
			
 
				-                self.recognize_text(layout,page_no,lt_text_list,lt_line_list)
			
 
				-
			
 
				+                lt_line_list = self.get_page_lines(layout, page_no)
			
 
				+            except:
			
 
				+                traceback.print_exc()
			
 
				+                lt_line_list = []
			
 
				+                self._page.error_code = [-13]
			
 
				+            try:
			
 
				+                # lt_line_list = self.get_text_lines(page,page_no)
			
 
				+                self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
			
 
				             except:
			
 
				                 traceback.print_exc()
			
 
				                 self._page.error_code = [-8]
			
@@ -902,7 +1298,7 @@ class PDFConvert:
 
				             # 图表对象
			
 
				             for image in lt_image_list:
			
 
				                 try:
			
 
				-                    print("pdf2text LTImage size", page_no, image.width, image.height)
			
 
				+                    # print("pdf2text LTImage size", page_no, image.width, image.height)
			
 
				                     image_stream = image.stream.get_data()
			
 
				                     # 小的图忽略
			
 
				                     if image.width <= 300 and image.height <= 300:
			
@@ -911,7 +1307,7 @@ class PDFConvert:
 
				                     img_test = Image.open(io.BytesIO(image_stream))
			
 
				                     # img_test.show()
			
 
				                     if image.height >= 1000 and image.width >= 1000:
			
 
				-                        print("pdf2text LTImage stream output size", img_test.size)
			
 
				+                        # print("pdf2text LTImage stream output size", img_test.size)
			
 
				                         page_image = self.get_page_image(page_no)
			
 
				                         if judge_error_code(page_image):
			
 
				                             self._page.error_code = page_image
			
@@ -932,19 +1328,25 @@ class PDFConvert:
 
				                 except Exception:
			
 
				                     log("pdf2text pdfminer read image in page " + str(page_no) +
			
 
				                         "  fail! use pymupdf read image...")
			
 
				-                    print(traceback.print_exc())
			
 
				+                    traceback.print_exc()
			
 
				             # pdf对象需反向排序
			
 
				             self._page.is_reverse = True
			
 
				             self.init_package("pdfplumber")
			
 
				 
			
 
				-            if not self.is_text_legal(lt_text_list,page_no):
			
 
				+            if not self.is_text_legal(lt_text_list, page_no):
			
 
				                 return
			
 
				 
			
 
				-            lt_line_list = self.get_text_lines(page,page_no)
			
 
				-            self.recognize_text(layout,page_no,lt_text_list,lt_line_list)
			
 
				+            # lt_line_list = self.get_text_lines(page, page_no)
			
 
				+            try:
			
 
				+                lt_line_list = self.get_page_lines(layout, page_no)
			
 
				+            except:
			
 
				+                traceback.print_exc()
			
 
				+                lt_line_list = []
			
 
				+                self._page.error_code = [-13]
			
 
				+            self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
			
 
				 
			
 
				     def get_layout(self, page, page_no):
			
 
				-        log("")
			
 
				+        log("get_layout")
			
 
				         if self.has_init_pdf[0] == 0:
			
 
				             self.init_package("pdfminer")
			
 
				         if self._doc.error_code is not None:
			
@@ -1032,11 +1434,137 @@ class PDFConvert:
 
				                 continue
			
 
				             self._doc.add_child(self._page)
			
 
				 
			
 
    def connect_table(self, html_list):
        """Join tables that continue from one page onto the next.

        Two consecutive pages are candidates for joining when the earlier
        page has nothing but page-number characters after its last table and
        the later page begins with a table (condition 1), and the last row of
        the earlier table has as many ``td`` cells as the first row of the
        later table (condition 2).

        :param html_list: list of per-page HTML strings, in page order
        :return: ``html_list`` itself when it is empty, otherwise a
                 one-element list holding the HTML of all pages concatenated,
                 with the qualifying tables joined
        """
        if not html_list:
            return html_list

        table_close = '</table>'

        def edge_col_counts(page_html):
            # [td count of the first row of the first table,
            #  td count of the last row of the last table]; 0 when missing.
            tables = BeautifulSoup(page_html, 'lxml').find_all('table')
            if not tables:
                return [0, 0]
            first_rows = tables[0].find_all('tr')
            last_rows = tables[-1].find_all('tr')
            first_cnt = len(first_rows[0].find_all('td')) if first_rows else 0
            last_cnt = len(last_rows[-1].find_all('td')) if last_rows else 0
            return [first_cnt, last_cnt]

        # Condition 1: per page, record [index, starts with a table,
        # ends with a table followed only by page-number characters].
        connect_flag_list = []
        for i, h in enumerate(html_list):
            starts_with_table = h.find('<table') == 0

            ends_with_table = False
            last_start = h.rfind('<table')
            if last_start != -1:
                last_close = h.rfind(table_close, last_start)
                if last_close != -1:
                    tail = h[last_close + len(table_close):]
                    tail = re.sub('<div>|</div>', '', tail)
                    # The whole tail must consist of page-number characters;
                    # a tail that merely starts with one does not qualify.
                    ends_with_table = re.fullmatch('[-/第页0-9]*', tail) is not None

            connect_flag_list.append([i, starts_with_table, ends_with_table])

        # Group consecutive pages: a page joins the current group only when
        # the previous page ends with a table and this page starts with one.
        connect_pages_list = []
        for c in connect_flag_list:
            if connect_pages_list and connect_pages_list[-1][-1][2] and c[1]:
                connect_pages_list[-1].append(c)
            else:
                connect_pages_list.append([c])

        # Condition 2: split a group wherever the column counts on both
        # sides of a page boundary differ.
        connect_pages_list2 = []
        for c_list in connect_pages_list:
            if len(c_list) == 1:
                connect_pages_list2.append(c_list)
                continue
            col_cnt_list = [edge_col_counts(html_list[c[0]]) for c in c_list]
            new_c_list = [c_list[0]]
            for prev_cnt, cur_cnt, cur in zip(col_cnt_list, col_cnt_list[1:], c_list[1:]):
                if prev_cnt[1] != cur_cnt[0]:
                    connect_pages_list2.append(new_c_list)
                    new_c_list = [cur]
                else:
                    new_c_list.append(cur)
            connect_pages_list2.append(new_c_list)

        # Join the tables of every multi-page group.
        new_html_list = []
        for c_list in connect_pages_list2:
            if len(c_list) == 1:
                new_html_list.append(html_list[c_list[0][0]])
                continue

            new_html = ''.join(html_list[c[0]] for c in c_list)
            # Replace each "table end / page number / table start" seam with
            # a marker row so both halves become a single table.
            new_html = re.sub('</table>([-/第页0-9]|<div>|</div>)*<table border="1">',
                              '<tr><td>#@#@#</td></tr>',
                              new_html)

            soup = BeautifulSoup(new_html, 'lxml')
            trs = soup.find_all('tr')
            for i, tr in enumerate(trs):
                if tr.get_text() != '#@#@#':
                    continue
                # Only look at neighbours that really exist on both sides.
                if 0 < i < len(trs) - 1:
                    td1 = trs[i - 1].find_all('td')
                    td2 = trs[i + 1].find_all('td')
                    # An empty first cell marks the row after the seam as the
                    # continuation of the row before it: merge cell by cell.
                    if td2 and td2[0].get_text() == '' and len(td2) >= len(td1):
                        for j in range(len(td1)):
                            td1[j].string = td1[j].get_text() + td2[j].get_text()
                        trs[i + 1].decompose()
                tr.decompose()
            new_html_list.append(str(soup))

        return [''.join(new_html_list)]
			
 
				+
			
 
				     def get_html(self):
			
 
				         self.convert()
			
 
				         if self._doc.error_code is not None:
			
 
				             return self._doc.error_code
			
 
				-        return self._doc.get_html()
			
 
				+        html = self._doc.get_html(return_list=True)
			
 
				+        # 表格连接
			
 
				+        try:
			
 
				+            html = self.connect_table(html)
			
 
				+        except:
			
 
				+            traceback.print_exc()
			
 
				+            return [-12]
			
 
				+        return html
			
 
				 
			
 
				     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
			
 
				         # 删除过多重复字句，为水印
			
@@ -1075,7 +1603,7 @@ class PDFConvert:
 
				             ratio = max_size / _img.shape[resize_axis]
			
 
				             new_shape = [0, 0]
			
 
				             new_shape[resize_axis] = max_size
			
 
				-            new_shape[1-resize_axis] = int(_img.shape[1-resize_axis] * ratio)
			
 
				+            new_shape[1 - resize_axis] = int(_img.shape[1 - resize_axis] * ratio)
			
 
				             _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
			
 
				             cv2.imwrite(img_path, _img)
			
 
				 
			
@@ -1097,11 +1625,116 @@ class PDFConvert:
 
				             return [-3]
			
 
				 
			
 
				 
			
 
				+def get_text_font():
			
 
				+    def flags_decomposer(flags):
			
 
				+        """Make font flags human readable."""
			
 
				+        l = []
			
 
				+        if flags & 2 ** 0:
			
 
				+            l.append("superscript")
			
 
				+        if flags & 2 ** 1:
			
 
				+            l.append("italic")
			
 
				+        if flags & 2 ** 2:
			
 
				+            l.append("serifed")
			
 
				+        else:
			
 
				+            l.append("sans")
			
 
				+        if flags & 2 ** 3:
			
 
				+            l.append("monospaced")
			
 
				+        else:
			
 
				+            l.append("proportional")
			
 
				+        if flags & 2 ** 4:
			
 
				+            l.append("bold")
			
 
				+        return ", ".join(l)
			
 
				+
			
 
				+    def get_underlined_textLines(page):
			
 
				+        """
			
 
				+        获取某页pdf上的所有下划线文本信息
			
 
				+        :param page: fitz中的一页
			
 
				+        :return: list of tuples,每个tuple都是一个完整的下划线覆盖的整体：[(下划线句, 所在blk_no, 所在line_no), ...]
			
 
				+        """
			
 
				+        paths = page.get_drawings()  # get drawings on the current page
			
 
				+
			
 
				+        # 获取该页内所有的height很小的bbox。因为下划线其实大多是这种矩形
			
 
				+        # subselect things we may regard as lines
			
 
				+        lines = []
			
 
				+        for p in paths:
			
 
				+            for item in p["items"]:
			
 
				+                if item[0] == "l":  # an actual line
			
 
				+                    p1, p2 = item[1:]
			
 
				+                    if p1.y == p2.y:
			
 
				+                        lines.append((p1, p2))
			
 
				+                elif item[0] == "re":  # a rectangle: check if height is small
			
 
				+                    r = item[1]
			
 
				+                    if r.width > r.height and r.height <= 2:
			
 
				+                        lines.append((r.tl, r.tr))  # take top left / right points
			
 
				+
			
 
				+        # 获取该页的`max_lineheight`，用于下面比较距离使用
			
 
				+        blocks = page.get_text("dict", flags=11)["blocks"]
			
 
				+        max_lineheight = 0
			
 
				+        for b in blocks:
			
 
				+            for l in b["lines"]:
			
 
				+                bbox = fitz.Rect(l["bbox"])
			
 
				+                if bbox.height > max_lineheight:
			
 
				+                    max_lineheight = bbox.height
			
 
				+
			
 
				+        underlined_res = []
			
 
				+        # 开始对下划线内容进行查询
			
 
				+        # make a list of words
			
 
				+        words = page.get_text("words")
			
 
				+        # if underlined, the bottom left / right of a word
			
 
				+        # should not be too far away from left / right end of some line:
			
 
				+        for wdx, w in enumerate(words):  # w[4] is the actual word string
			
 
				+            r = fitz.Rect(w[:4])  # first 4 items are the word bbox
			
 
				+            for p1, p2 in lines:  # check distances for start / end points
			
 
				+                if abs(r.bl - p1) <= max_lineheight:  # 当前word的左下满足下划线左下
			
 
				+                    if abs(r.br - p2) <= max_lineheight:  # 当前word的右下满足下划线右下（单个词，无空格）
			
 
				+                        print(f"Word '{w[4]}' is underlined! Its block-line number is {w[-3], w[-2]}")
			
 
				+                        underlined_res.append((w[4], w[-3], w[-2]))  # 分别是(下划线词，所在blk_no，所在line_no)
			
 
				+                        break  # don't check more lines
			
 
				+                    else:  # 继续寻找同line右侧的有缘人，因为有些下划线覆盖的词包含多个词，多个词之间有空格
			
 
				+                        curr_line_num = w[-2]  # line nunmber
			
 
				+                        for right_wdx in range(wdx + 1, len(words), 1):
			
 
				+                            _next_w = words[right_wdx]
			
 
				+                            if _next_w[-2] != curr_line_num:  # 当前遍历到的右侧word已经不是当前行的了（跨行是不行的）
			
 
				+                                break
			
 
				+                            _r_right = fitz.Rect(_next_w[:4])  # 获取当前同行右侧某word的方框4点
			
 
				+                            if abs(_r_right.br - p2) <= max_lineheight:  # 用此word右下点和p2(目标下划线右上点)算距离，距离要小于max_lineheight
			
 
				+                                print(
			
 
				+                                    f"Word '{' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]])}' is underlined! " +
			
 
				+                                    f"Its block-line number is {w[-3], w[-2]}")
			
 
				+                                underlined_res.append(
			
 
				+                                    (' '.join([_one_word[4] for _one_word in words[wdx:right_wdx + 1]]),
			
 
				+                                     w[-3], w[-2])
			
 
				+                                )  # 分别是(下划线词，所在blk_no，所在line_no)
			
 
				+                                break  # don't check more lines
			
 
				+        return underlined_res
			
 
				+
			
 
				+    _p = r'C:\Users\Administrator\Desktop\test_pdf\error2-2.pdf'
			
 
				+    doc_pymupdf = read_pymupdf(_p)
			
 
				+    page = doc_pymupdf[0]
			
 
				+    blocks = page.get_text("dict", flags=11)["blocks"]
			
 
				+    for b in blocks:  # iterate through the text blocks
			
 
				+        for l in b["lines"]:  # iterate through the text lines
			
 
				+            for s in l["spans"]:  # iterate through the text spans
			
 
				+                print("")
			
 
				+                font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
			
 
				+                    s["font"],  # font name
			
 
				+                    flags_decomposer(s["flags"]),  # readable font flags
			
 
				+                    s["size"],  # font size
			
 
				+                    s["color"],  # font color
			
 
				+                )
			
 
				+                print(s)
			
 
				+                print("Text: '%s'" % s["text"])  # simple print of text
			
 
				+                print(font_properties)
			
 
				+
			
 
				+    get_underlined_textLines(page)
			
 
				+
			
 
				+
			
 
				 # 以下为现成pdf单页解析接口
			
 
				 class ParseSentence:
			
 
				 
			
 
				-    def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
			
 
				-        (x0,y0,x1,y1) = bbox
			
 
				+    def __init__(self, bbox, fontname, fontsize, _text, _title, title_text, _pattern, title_degree, is_outline,
			
 
				+                 outline_location, page_no):
			
 
				+        (x0, y0, x1, y1) = bbox
			
 
				         self.x0 = x0
			
 
				         self.y0 = y0
			
 
				         self.x1 = x1
			
@@ -1119,7 +1752,7 @@ class ParseSentence:
 
				         self.page_no = page_no
			
 
				 
			
 
				     def __repr__(self):
			
 
				-        return "%s,%s,%s,%d,%s"%(self.text,self.title,self.is_outline,self.outline_location,str(self.bbox))
			
 
				+        return "%s,%s,%s,%d,%s" % (self.text, self.title, self.is_outline, self.outline_location, str(self.bbox))
			
 
				 
			
 
				 
			
 
				 class ParseUtils:
			
@@ -1127,11 +1760,11 @@ class ParseUtils:
 
				     @staticmethod
			
 
				     def getFontinfo(_page):
			
 
				         for _obj in _page._objs:
			
 
				-            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
			
 
				+            if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
			
 
				                 for textline in _obj._objs:
			
 
				                     done = False
			
 
				                     for lchar in textline._objs:
			
 
				-                        if isinstance(lchar,(LTChar)):
			
 
				+                        if isinstance(lchar, (LTChar)):
			
 
				                             _obj.fontname = lchar.fontname
			
 
				                             _obj.fontsize = lchar.size
			
 
				                         done = True
			
@@ -1153,7 +1786,7 @@ class ParseUtils:
 
				 
			
 
				             _find = False
			
 
				             for _ct in cluster_textbox:
			
 
				-                if abs(_ct["y"]-_textbox.bbox[1]) < 5:
			
 
				+                if abs(_ct["y"] - _textbox.bbox[1]) < 5:
			
 
				                     _find = True
			
 
				                     _ct["textbox"].append(_textbox)
			
 
				             if not _find:
			
@@ -1167,14 +1800,14 @@ class ParseUtils:
 
				 
			
 
				             _linetext = _textboxs[0].get_text()
			
 
				             for _i in range(1, len(_textboxs)):
			
 
				-                if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[2])>60:
			
 
				+                if abs(_textboxs[_i].bbox[0] - _textboxs[_i - 1].bbox[2]) > 60:
			
 
				                     if _linetext[-1] not in (",", "，", "。", ".", "、", "；"):
			
 
				                         _linetext += "=，="
			
 
				                 _linetext += _textboxs[_i].get_text()
			
 
				 
			
 
				             _linetext = re.sub("[\s\r\n]", "", _linetext)
			
 
				             _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
			
 
				-                     _textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
			
 
				+                     _textboxs[-1].bbox[2], _textboxs[-1].bbox[3])
			
 
				 
			
 
				             _title = None
			
 
				             _pattern_groups = None
			
@@ -1192,7 +1825,7 @@ class ParseUtils:
 
				                     title_text = _groups[0][1]
			
 
				                     _pattern_groups = _groups
			
 
				             if not _title:
			
 
				-                _title = ParseUtils.rec_incenter(_bbox,page_bbox)
			
 
				+                _title = ParseUtils.rec_incenter(_bbox, page_bbox)
			
 
				 
			
 
				             title_degree = 2
			
 
				             if not _title:
			
@@ -1202,7 +1835,7 @@ class ParseUtils:
 
				                 title_degree = int(_title.split("_")[1])
			
 
				 
			
 
				             # 页码
			
 
				-            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$", _linetext) is not None:
			
 
				+            if ParseUtils.rec_incenter(_bbox, page_bbox) and re.search("^\d+$", _linetext) is not None:
			
 
				                 continue
			
 
				 
			
 
				             if _linetext == "" or re.search("^，+$", _linetext) is not None:
			
@@ -1216,7 +1849,10 @@ class ParseUtils:
 
				                 _linetext = _search.group("text")
			
 
				                 outline_location = int(_search.group("nums"))
			
 
				 
			
 
				-            list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))
			
 
				+            list_sentences.append(
			
 
				+                ParseSentence(_bbox, _textboxs[-1].__dict__.get("fontname"), _textboxs[-1].__dict__.get("fontsize"),
			
 
				+                              _linetext, _title, title_text, _pattern_groups, title_degree, is_outline,
			
 
				+                              outline_location, page_no))
			
 
				 
			
 
				         # for _sen in list_sentences:
			
 
				         #     print(_sen.__dict__)
			
@@ -1224,133 +1860,136 @@ class ParseUtils:
 
				         return list_sentences
			
 
				 
			
 
				     @staticmethod
			
 
    def find_title_by_pattern(_text,
                              _pattern=r"(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|"
                                       r"(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|"
                                       r"(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|"
                                       r"(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\.．、\s\-]))|"
                                       r"(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\.．、\s\-]))|"
                                       r"(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\.．、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\.．、\s\-]))|"
                                       r"(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\.．、\s\-]))|"
                                       r"(?P<title_15>^(?P<title_15_index_0_0>（?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>）))|"
                                       r"(?P<title_17>^(?P<title_17_index_0_0>（?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>）))|"
                                       r"(?P<title_19>^(?P<title_19_index_0_0>（?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>）))|"
                              ):
        """Match a heading-number prefix in ``_text`` and return its named groups.

        The default pattern is written as raw strings so the regex escapes
        (``\\d``, ``\\.``, ``\\s``, ``\\-``) are not treated as invalid string
        escapes; the resulting pattern text is unchanged.  It ends with an
        empty alternative, so the search itself always succeeds and "no
        title" shows up as every named group being ``None``.

        :param _text: text to search
        :param _pattern: regex made of named groups
        :return: list of (group name, matched text) for every group that
                 took part in the match, sorted by group name, or ``None``
                 when no named group matched
        """
        _se = re.search(_pattern, _text)
        if _se is None:
            return None
        groups = sorted(((k, v) for k, v in _se.groupdict().items() if v is not None),
                        key=lambda kv: kv[0])
        return groups if groups else None
			
 
				 
			
 
				     @staticmethod
			
 
    def rec_incenter(o_bbox, p_bbox):
        """Tell whether a box is horizontally centred on the page.

        The left and right margins are measured as fractions of the page
        width; the box counts as centred when they differ by less than 0.1
        and the left margin is larger than 0.2.

        :param o_bbox: object box, indexed as (x0, y0, x1, y1)
        :param p_bbox: page box, indexed the same way
        :return: ``"title_2"`` when centred, otherwise ``None``
        """
        p_width = p_bbox[2] - p_bbox[0]
        if p_width == 0:
            # Zero-width page box: relative margins are undefined, so the
            # box cannot be classified as centred.
            return None
        l_space = (o_bbox[0] - p_bbox[0]) / p_width
        r_space = (p_bbox[2] - o_bbox[2]) / p_width

        if abs(l_space - r_space) < 0.1 and l_space > 0.2:
            return "title_2"
        return None
			
 
				 
			
 
				     @staticmethod
			
 
				     def is_first_title(_title):
			
 
				         if _title is None:
			
 
				             return False
			
 
				-        if re.search("^\d+$",_title) is not None:
			
 
				-            if int(_title)==1:
			
 
				+        if re.search("^\d+$", _title) is not None:
			
 
				+            if int(_title) == 1:
			
 
				                 return True
			
 
				             return False
			
 
				-        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
			
 
				-            if _title=="一":
			
 
				+        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
			
 
				+            if _title == "一":
			
 
				                 return True
			
 
				             return False
			
 
				-        if re.search("^[a-z]+$",_title) is not None:
			
 
				-            if _title=="a":
			
 
				+        if re.search("^[a-z]+$", _title) is not None:
			
 
				+            if _title == "a":
			
 
				                 return True
			
 
				             return False
			
 
				-        if re.search("^[A-Z]+$",_title) is not None:
			
 
				-            if _title=="A":
			
 
				+        if re.search("^[A-Z]+$", _title) is not None:
			
 
				+            if _title == "A":
			
 
				                 return True
			
 
				             return False
			
 
				-        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
			
 
				-            if _title=="Ⅰ":
			
 
				+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
			
 
				+            if _title == "Ⅰ":
			
 
				                 return True
			
 
				             return False
			
 
				         return False
			
 
				 
			
 
				     @staticmethod
			
 
				     def get_next_title(_title):
			
 
				-        if re.search("^\d+$",_title) is not None:
			
 
				-            return str(int(_title)+1)
			
 
				-        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
			
 
				-            _next_title = ParseUtils.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
			
 
				+        if re.search("^\d+$", _title) is not None:
			
 
				+            return str(int(_title) + 1)
			
 
				+        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
			
 
				+            _next_title = ParseUtils.make_increase(['一', '二', '三', '四', '五', '六', '七', '八', '九', '十'],
			
 
				+                                                   re.sub("[十百]", '', _title))
			
 
				             _next_title = list(_next_title)
			
 
				             _next_title.reverse()
			
 
				-            if _next_title[-1]!="十":
			
 
				-                if len(_next_title)>=2:
			
 
				-                    _next_title.insert(-1,'十')
			
 
				-            if len(_next_title)>=4:
			
 
				-                _next_title.insert(-3,'百')
			
 
				-            if _title[0]=="十":
			
 
				-                if _next_title=="十":
			
 
				-                    _next_title = ["二","十"]
			
 
				-                _next_title.insert(0,"十")
			
 
				+            if _next_title[-1] != "十":
			
 
				+                if len(_next_title) >= 2:
			
 
				+                    _next_title.insert(-1, '十')
			
 
				+            if len(_next_title) >= 4:
			
 
				+                _next_title.insert(-3, '百')
			
 
				+            if _title[0] == "十":
			
 
				+                if _next_title == "十":
			
 
				+                    _next_title = ["二", "十"]
			
 
				+                _next_title.insert(0, "十")
			
 
				             _next_title = "".join(_next_title)
			
 
				             return _next_title
			
 
				-        if re.search("^[a-z]+$",_title) is not None:
			
 
				-            _next_title = ParseUtils.make_increase([chr(i+ord('a')) for i in range(26)],_title)
			
 
				+        if re.search("^[a-z]+$", _title) is not None:
			
 
				+            _next_title = ParseUtils.make_increase([chr(i + ord('a')) for i in range(26)], _title)
			
 
				             _next_title = list(_next_title)
			
 
				             _next_title.reverse()
			
 
				             return "".join(_next_title)
			
 
				-        if re.search("^[A-Z]+$",_title) is not None:
			
 
				-            _next_title = ParseUtils.make_increase([chr(i+ord('A')) for i in range(26)],_title)
			
 
				+        if re.search("^[A-Z]+$", _title) is not None:
			
 
				+            _next_title = ParseUtils.make_increase([chr(i + ord('A')) for i in range(26)], _title)
			
 
				             _next_title = list(_next_title)
			
 
				             _next_title.reverse()
			
 
				             return "".join(_next_title)
			
 
				-        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
			
 
				-            _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
			
 
				+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
			
 
				+            _sort = ["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ"]
			
 
				             _index = _sort.index(_title)
			
 
				-            if _index<len(_sort)-1:
			
 
				-                return _sort[_index+1]
			
 
				+            if _index < len(_sort) - 1:
			
 
				+                return _sort[_index + 1]
			
 
				             return None
			
 
				 
			
 
				-
			
 
				     @staticmethod
			
 
				-    def make_increase(_sort,_title,_add=1):
			
 
				-        if len(_title)==0 and _add==0:
			
 
				+    def make_increase(_sort, _title, _add=1):
			
 
				+        if len(_title) == 0 and _add == 0:
			
 
				             return ""
			
 
				-        if len(_title)==0 and _add==1:
			
 
				+        if len(_title) == 0 and _add == 1:
			
 
				             return _sort[0]
			
 
				         _index = _sort.index(_title[-1])
			
 
				-        next_index = (_index+_add)%len(_sort)
			
 
				+        next_index = (_index + _add) % len(_sort)
			
 
				         next_chr = _sort[next_index]
			
 
				-        if _index==len(_sort)-1:
			
 
				+        if _index == len(_sort) - 1:
			
 
				             _add = 1
			
 
				         else:
			
 
				             _add = 0
			
 
				-        return next_chr+ParseUtils.make_increase(_sort,_title[:-1],_add)
			
 
				-
			
 
				-
			
 
				-
			
 
				+        return next_chr + ParseUtils.make_increase(_sort, _title[:-1], _add)
			
 
				 
			
 
				     @staticmethod
			
 
				-    def rec_serial(_text,o_bbox,p_bbox,fontname,_pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
			
 
				-                                                         "(?P<title_2>^\d+[\.、\s])|" \
			
 
				-                                                         "(?P<title_3>^\d+\.\d+[\.、\s])|" \
			
 
				-                                                         "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
			
 
				-                                                         "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
			
 
				-        #todo :recog the serial of the sentence
			
 
				-
			
 
				-
			
 
				-
			
 
				-        _se = re.search(_pattern,_text)
			
 
				+    def rec_serial(_text, o_bbox, p_bbox, fontname, _pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
			
 
				+                                                             "(?P<title_2>^\d+[\.、\s])|" \
			
 
				+                                                             "(?P<title_3>^\d+\.\d+[\.、\s])|" \
			
 
				+                                                             "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
			
 
				+                                                             "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
			
 
				+        # todo :recog the serial of the sentence
			
 
				+
			
 
				+        _se = re.search(_pattern, _text)
			
 
				         if _se is not None:
			
 
				             _gd = _se.groupdict()
			
 
				-            for k,v in _gd.items():
			
 
				+            for k, v in _gd.items():
			
 
				                 if v is not None:
			
 
				                     return k
			
 
				         return None
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # get_text_font()
			
 
				+    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
			
 
				+
			
 
				+    # print(b'\x10')
			
--- a/format_convert/convert_test.py
+++ b/format_convert/convert_test.py
@@ -6,6 +6,9 @@ import sys
 
				 import time
			
 
				 from glob import glob
			
 
				 from multiprocessing import Process
			
 
				+
			
 
				+from bs4 import BeautifulSoup
			
 
				+
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
			
 
				 from format_convert.convert import to_html
			
@@ -21,10 +24,10 @@ def test_one(p, from_remote=False):
 
				 
			
 
				     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
			
 
				     if from_remote:
			
 
				-        # _url = 'http://121.46.18.113:15010/convert'
			
 
				+        _url = 'http://121.46.18.113:15010/convert'
			
 
				         # _url = 'http://192.168.2.103:15010/convert'
			
 
				         # _url = 'http://172.16.160.65:15010/convert'
			
 
				-        _url = 'http://127.0.0.1:15010/convert'
			
 
				+        # _url = 'http://127.0.0.1:15010/convert'
			
 
				         result = json.loads(request_post(_url, data, time_out=10000))
			
 
				         text_str = ""
			
 
				         for t in result.get("result_html"):
			
@@ -58,9 +61,10 @@ if __name__ == '__main__':
 
				         # file_path = "C:/Users/Administrator/Desktop/test_xls/merge_cell.xlsx"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
			
 
				-        # file_path = "C:/Users/Administrator/Downloads/神仙居旅游汽车租赁竞争性磋商文件(1).doc"
			
 
				+        # file_path = "C:/Users/Administrator/Downloads/QQ图片20230616105216.jpg"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/test_xls/error2.xlsx"
			
 
				-        file_path = "C:/Users/Administrator/Desktop/test_doc/error5.docx"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_image/error9-2.png"
			
 
				+        file_path = "C:/Users/Administrator/Desktop/test_pdf/直接读表格线error/error51.pdf"
			
 
				     else:
			
 
				         file_path = "1660296734009.pdf"
			
 
				     test_one(file_path, from_remote=True)
			
@@ -87,4 +91,9 @@ if __name__ == '__main__':
 
				     #     p_list.append(p)
			
 
				     # for p in p_list:
			
 
				     #     p.join()
			
 
				-    # print("finish", time.time() - start_time)
			
 
				+    # print("finish", time.time() - start_time)
			
 
				+
			
 
				+    # with open(file_path, 'r') as f:
			
 
				+    #     t = f.read()
			
 
				+    # soup = BeautifulSoup(t, 'lxml')
			
 
				+    # print(soup.text)
			
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -20,11 +20,15 @@ class _Document:
 
				         else:
			
 
				             self.error_code = child.error_code
			
 
				 
			
 
				-    def get_html(self):
			
 
				+    def get_html(self, return_list=False):
			
 
				         if self.error_code is not None:
			
 
				             return self.error_code
			
 
				 
			
 
				-        html_text = ""
			
 
				+        if return_list:
			
 
				+            html_text = []
			
 
				+        else:
			
 
				+            html_text = ""
			
 
				+
			
 
				         for child in self.children:
			
 
				             # 先调用get_html才能更新error_code
			
 
				             child_html_text = child.get_html()
			
@@ -32,8 +36,13 @@ class _Document:
 
				                 self.error_code = child.error_code
			
 
				                 return self.error_code
			
 
				             else:
			
 
				-                html_text += child_html_text
			
 
				-        return [html_text]
			
 
				+                if return_list:
			
 
				+                    html_text += [child_html_text]
			
 
				+                else:
			
 
				+                    html_text += child_html_text
			
 
				+        if not return_list:
			
 
				+            html_text = [html_text]
			
 
				+        return html_text
			
 
				 
			
 
				 
			
 
				 class _Page:
			
--- a/format_convert/convert_xls.py
+++ b/format_convert/convert_xls.py
@@ -1,8 +1,9 @@
 
				 import inspect
			
 
				 import os
			
 
				 import sys
			
 
				+from bs4 import BeautifulSoup
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				-from format_convert.convert_tree import _Document
			
 
				+from format_convert.convert_tree import _Document, _Page, _Sentence
			
 
				 import logging
			
 
				 import traceback
			
 
				 from format_convert import get_memory_info
			
@@ -38,14 +39,31 @@ class XlsConvert:
 
				         self.unique_type_dir = unique_type_dir
			
 
				 
			
 
				     def convert(self):
			
 
				-        # 调用office格式转换
			
 
				-        file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
			
 
				-        if judge_error_code(file_path):
			
 
				-            self._doc.error_code = file_path
			
 
				-            return
			
 
				-        _xlsx = XlsxConvert(file_path, self.unique_type_dir)
			
 
				-        _xlsx.convert()
			
 
				-        self._doc = _xlsx._doc
			
 
				+        # 先判断特殊xls文件，可能是html文本
			
 
				+        is_html_xls = False
			
 
				+        try:
			
 
				+            with open(self.path, 'r') as f:
			
 
				+                html_str = f.read()
			
 
				+            soup = BeautifulSoup(html_str, 'lxml')
			
 
				+            text = soup.text
			
 
				+            is_html_xls = True
			
 
				+        except:
			
 
				+            pass
			
 
				+
			
 
				+        if is_html_xls:
			
 
				+            self._page = _Page(None, 0)
			
 
				+            _sen = _Sentence(text, (0, 0, 0, 0))
			
 
				+            self._page.add_child(_sen)
			
 
				+            self._doc.add_child(self._page)
			
 
				+        else:
			
 
				+            # 调用office格式转换
			
 
				+            file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
			
 
				+            if judge_error_code(file_path):
			
 
				+                self._doc.error_code = file_path
			
 
				+                return
			
 
				+            _xlsx = XlsxConvert(file_path, self.unique_type_dir)
			
 
				+            _xlsx.convert()
			
 
				+            self._doc = _xlsx._doc
			
 
				 
			
 
				     def get_html(self):
			
 
				         try:
			
@@ -53,8 +71,13 @@ class XlsConvert:
 
				         except:
			
 
				             traceback.print_exc()
			
 
				             self._doc.error_code = [-1]
			
 
				-        print("xls ", self._doc)
			
 
				+        # print("xls ", self._doc)
			
 
				         if self._doc.error_code is not None:
			
 
				             return self._doc.error_code
			
 
				-        print(self._doc.children)
			
 
				-        return self._doc.get_html()
			
 
				+        # print(self._doc.children)
			
 
				+        return self._doc.get_html()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    c = XlsConvert("C:/Users/Administrator/Downloads/1683641686556.xls", "C:/Users/Administrator/Downloads/1")
			
 
				+    print(c.get_html())
			
--- a/format_convert/convert_zip.py
+++ b/format_convert/convert_zip.py
@@ -1,11 +1,13 @@
 
				 import inspect
			
 
				 import os
			
 
				 import sys
			
 
				+import uuid
			
 
				+
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				 from format_convert.convert_tree import _Document, _Page, _Sentence
			
 
				 import logging
			
 
				 import traceback
			
 
				-import zipfile
			
 
				+import my_zipfile as zipfile
			
 
				 from format_convert import get_memory_info
			
 
				 from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, get_logger, log, \
			
 
				     memory_decorator
			
@@ -126,14 +128,19 @@ class ZipConvert:
 
				                 # 中文乱码，会导致zip解压失败，直接修改对象
			
 
				                 try:
			
 
				                     new_f = f.encode('cp437').decode('gbk')
			
 
				+                    # print('1', new_f)
			
 
				                 except:
			
 
				                     new_f = f.encode('utf-8').decode('utf-8')
			
 
				+                    # print('2', new_f)
			
 
				                 if f != new_f:
			
 
				+                    new_f = str(uuid.uuid1().hex) + '.' + f.split('.')[-1]
			
 
				                     zip_file.NameToInfo[new_f] = zip_file.NameToInfo[f]
			
 
				                     zip_file.NameToInfo[new_f].filename = new_f
			
 
				                     zip_file.NameToInfo.pop(f)
			
 
				+                    zip_file.NameToInfo[new_f].orig_filename = new_f
			
 
				+                    # zip_file.NameToInfo[new_f].flag_bits = 2048
			
 
				+                    zip_file.NameToInfo[new_f].has_changed_name = True
			
 
				                 new_zip_list.append(new_f)
			
 
				-
			
 
				             new_zip_list.sort(key=lambda x: len(x))
			
 
				             for f in new_zip_list:
			
 
				                 file_list.append(zip_file.extract(f, path=self.zip_path))
			
@@ -198,4 +205,9 @@ class ZipConvert:
 
				             self._doc.error_code = [-1]
			
 
				         if self._doc.error_code is not None:
			
 
				             return self._doc.error_code
			
 
				-        return self._doc.get_html()
			
 
				+        return self._doc.get_html()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    c = ZipConvert("C:/Users/Administrator/Downloads/3775865878373065499.zip", "C:/Users/Administrator/Downloads/1")
			
 
				+    c.get_html()
			
--- a/format_convert/interface.yml
+++ b/format_convert/interface.yml
@@ -5,7 +5,7 @@ MASTER:
 
				 #  local-102: 'http://192.168.2.102'
			
 
				 #  local-103: 'http://192.168.2.103'
			
 
				 #  local 'http://127.0.0.1'
			
 
				-  ip: ['http://192.168.0.115']
			
 
				+  ip: ['http://127.0.0.1']
			
 
				 
			
 
				   PATH:
			
 
				     python: ['/data/anaconda3/envs/convert3/bin/python']
			
--- a/format_convert/kill_all.sh
+++ b/format_convert/kill_all.sh
@@ -0,0 +1,6 @@
 
				+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
			
 
				+kill -9 $(lsof -i:17000|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
			
 
				+kill -9 $(lsof -i:18000|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
			
 
				+kill -9 $(lsof -i:18020|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
			
 
				+kill -9 $(lsof -i:18040|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
			
 
				+kill -9 $(lsof -i:18060|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
			
--- a/format_convert/kill_main.sh
+++ b/format_convert/kill_main.sh
@@ -0,0 +1 @@
 
				+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
			
--- a/format_convert/test_walk.py
+++ b/format_convert/test_walk.py
@@ -1,17 +1,107 @@
 
				+import copy
			
 
				 import os
			
 
				-file_list = []
			
 
				-for root, dirs, files in os.walk("./", topdown=False):
			
 
				-    for name in dirs:
			
 
				-        file_list.append(os.path.join(root, name) + os.sep)
			
 
				-    for name in files:
			
 
				-        file_list.append(os.path.join(root, name))
			
 
				-print(file_list)
			
 
				+import random
			
 
				+import re
			
 
				+import sys
			
 
				+import time
			
 
				+from bs4 import BeautifulSoup
			
 
				+from datetime import datetime
			
 
				+from multiprocessing import Process
			
 
				+import datetime as dt
			
 
				+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				+from format_convert.utils import file_lock
			
 
				 
			
 
				 
			
 
				-s = set()
			
 
				-s.update("1231asdb我深大")
			
 
				-s.update("g6712")
			
 
				+def run():
			
 
				+
			
 
				+    f = file_lock(os.path.abspath(os.path.dirname(__file__)) + '/19022.lock')
			
 
				+    print("acquire file_lock! process " + str(os.getpid()))
			
 
				+    for i in range(10):
			
 
				+        print("process " + str(os.getpid()) + " " + str(i))
			
 
				+        time.sleep(random.randint(0, 1))
			
 
				+    f.close()
			
 
				+
			
 
				+
			
 
				+def merge_table():
			
 
				+    with open(r'C:\Users\Administrator\Desktop\2.html', 'r') as f:
			
 
				+        html_str = f.read()
			
 
				+    html_str_origin = copy.deepcopy(html_str)
			
 
				+
			
 
				+    try:
			
 
				+        match1 = re.finditer('<table', html_str)
			
 
				+        match2 = re.finditer('</table>', html_str)
			
 
				+        table_index_list = []
			
 
				+        for m1, m2 in zip(match1, match2):
			
 
				+            table_index_list.append([m1.span()[0], m1.span()[1], m2.span()[0], m2.span()[1]])
			
 
				+        print(table_index_list)
			
 
				+
			
 
				+        soup = BeautifulSoup(html_str)
			
 
				+        tables = soup.find_all('table')
			
 
				+        table_td_cnt_list = []
			
 
				+        for table in tables:
			
 
				+            tds = table.tr.find_all('td')
			
 
				+            table_td_cnt_list.append(len(list(tds)))
			
 
				+        print(table_td_cnt_list)
			
 
				+
			
 
				+        if len(table_index_list) == len(table_td_cnt_list):
			
 
				+            merge_index_list = []
			
 
				+            temp_index = []
			
 
				+            for i in range(1, len(table_index_list)):
			
 
				+                last_index = table_index_list[i-1]
			
 
				+                index = table_index_list[i]
			
 
				+                last_tds = table_td_cnt_list[i-1]
			
 
				+                tds = table_td_cnt_list[i]
			
 
				+                if index[0] - last_index[-1] == 0 and last_tds == tds:
			
 
				+                    temp_index += [i-1, i]
			
 
				+                    temp_index = list(set(temp_index))
			
 
				+                else:
			
 
				+                    if temp_index:
			
 
				+                        merge_index_list.append(temp_index)
			
 
				+                    temp_index = []
			
 
				+            if temp_index:
			
 
				+                merge_index_list.append(temp_index)
			
 
				+            print(merge_index_list)
			
 
				+
			
 
				+            print('before len(html_str)', len(html_str))
			
 
				+            for merge in merge_index_list:
			
 
				+                start_index = table_index_list[merge[0]][0]
			
 
				+                end_index = table_index_list[merge[-1]][-1]
			
 
				+                table_replace = re.sub('<table[^>]*>|</table>', '', html_str[start_index:end_index])
			
 
				+                table_replace = '<table border="1">' + table_replace + '</table>'
			
 
				+                table_replace += ' '*(end_index-start_index-len(table_replace))
			
 
				+                html_str = html_str[:start_index] + table_replace + html_str[end_index:]
			
 
				+            print('after len(html_str)', len(html_str))
			
 
				+
			
 
				+            if len(html_str_origin) == len(html_str):
			
 
				+                with open(r'C:\Users\Administrator\Desktop\3.html', 'w') as f:
			
 
				+                    f.write(html_str)
			
 
				+                return html_str
			
 
				+            else:
			
 
				+                return html_str_origin
			
 
				+        else:
			
 
				+            return html_str_origin
			
 
				+    except:
			
 
				+        return html_str_origin
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # process_list = []
			
 
				+    # for j in range(10):
			
 
				+    #     p1 = Process(target=run,)
			
 
				+    #     p1.start()
			
 
				+    #     process_list.append(p1)
			
 
				+    #
			
 
				+    # for p in process_list:
			
 
				+    #     p.join()
			
 
				+
			
 
				+    print('|'.join(['a', 'n']))
			
 
				+    _t = datetime.strptime('2023-04-26', '%Y-%m-%d')
			
 
				+    _t2 = datetime.strptime('2023-04-02', '%Y-%m-%d')
			
 
				+    print(abs((_t2-_t).days))
			
 
				+    print(datetime.strftime(_t + dt.timedelta(days=10), '%Y-%m-%d'))
			
 
				+
			
 
				+    # merge_table()
			
 
				+
			
 
				+    print(datetime.now())
			
 
				 
			
 
				-print(len(s))
			
 
				 
			
 
				-print(len("".join(["sdas", "我是觉得", "111"])))
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
--- a/isr/pre_process.py
+++ b/isr/pre_process.py
@@ -19,11 +19,11 @@ def count_red_pixel(image_np, cnt=1000):
 
				     labels = measure.label(red_mask, connectivity=2)  # 8连通区域标记
			
 
				     regions = measure.regionprops(labels)
			
 
				     red_cnt = np.sum(red_mask != 0)
			
 
				-    print("red_cnt regions", len(regions),red_cnt, time.time()-start_time)
			
 
				+    # print("red_cnt regions", len(regions),red_cnt, time.time()-start_time)
			
 
				     if regions and len(regions)>0:
			
 
				         _max_area = max([r.bbox_area for r in regions])
			
 
				         if _max_area>100:
			
 
				-            print("red_cnt max_area", _max_area, time.time()-start_time)
			
 
				+            # print("red_cnt max_area", _max_area, time.time()-start_time)
			
 
				             return True
			
 
				     return False
			
 
				 
			
--- a/my_zipfile.py
+++ b/my_zipfile.py
@@ -0,0 +1,2183 @@
 
				+"""
			
 
				+Read and write ZIP files.
			
 
				+
			
 
				+XXX references to utf-8 need further investigation.
			
 
				+"""
			
 
				+import io
			
 
				+import os
			
 
				+import importlib.util
			
 
				+import sys
			
 
				+import time
			
 
				+import stat
			
 
				+import shutil
			
 
				+import struct
			
 
				+import binascii
			
 
				+import threading
			
 
				+
			
 
				+try:
			
 
				+    import zlib # We may need its compression method
			
 
				+    crc32 = zlib.crc32
			
 
				+except ImportError:
			
 
				+    zlib = None
			
 
				+    crc32 = binascii.crc32
			
 
				+
			
 
				+try:
			
 
				+    import bz2 # We may need its compression method
			
 
				+except ImportError:
			
 
				+    bz2 = None
			
 
				+
			
 
				+try:
			
 
				+    import lzma # We may need its compression method
			
 
				+except ImportError:
			
 
				+    lzma = None
			
 
				+
			
 
				+__all__ = ["BadZipFile", "BadZipfile", "error",
			
 
				+           "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA",
			
 
				+           "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"]
			
 
				+
			
 
				+class BadZipFile(Exception):
			
 
				+    pass
			
 
				+
			
 
				+
			
 
				+class LargeZipFile(Exception):
			
 
				+    """
			
 
				+    Raised when writing a zipfile, the zipfile requires ZIP64 extensions
			
 
				+    and those extensions are disabled.
			
 
				+    """
			
 
				+
			
 
				+error = BadZipfile = BadZipFile      # Pre-3.2 compatibility names
			
 
				+
			
 
				+
			
 
				+ZIP64_LIMIT = (1 << 31) - 1
			
 
				+ZIP_FILECOUNT_LIMIT = (1 << 16) - 1
			
 
				+ZIP_MAX_COMMENT = (1 << 16) - 1
			
 
				+
			
 
				+# constants for Zip file compression methods
			
 
				+ZIP_STORED = 0
			
 
				+ZIP_DEFLATED = 8
			
 
				+ZIP_BZIP2 = 12
			
 
				+ZIP_LZMA = 14
			
 
				+# Other ZIP compression methods not supported
			
 
				+
			
 
				+DEFAULT_VERSION = 20
			
 
				+ZIP64_VERSION = 45
			
 
				+BZIP2_VERSION = 46
			
 
				+LZMA_VERSION = 63
			
 
				+# we recognize (but not necessarily support) all features up to that version
			
 
				+MAX_EXTRACT_VERSION = 63
			
 
				+
			
 
				+# Below are some formats and associated data for reading/writing headers using
			
 
				+# the struct module.  The names and structures of headers/records are those used
			
 
				+# in the PKWARE description of the ZIP file format:
			
 
				+#     http://www.pkware.com/documents/casestudies/APPNOTE.TXT
			
 
				+# (URL valid as of January 2008)
			
 
				+
			
 
				+# The "end of central directory" structure, magic number, size, and indices
			
 
				+# (section V.I in the format document)
			
 
				+structEndArchive = b"<4s4H2LH"
			
 
				+stringEndArchive = b"PK\005\006"
			
 
				+sizeEndCentDir = struct.calcsize(structEndArchive)
			
 
				+
			
 
				+_ECD_SIGNATURE = 0
			
 
				+_ECD_DISK_NUMBER = 1
			
 
				+_ECD_DISK_START = 2
			
 
				+_ECD_ENTRIES_THIS_DISK = 3
			
 
				+_ECD_ENTRIES_TOTAL = 4
			
 
				+_ECD_SIZE = 5
			
 
				+_ECD_OFFSET = 6
			
 
				+_ECD_COMMENT_SIZE = 7
			
 
				+# These last two indices are not part of the structure as defined in the
			
 
				+# spec, but they are used internally by this module as a convenience
			
 
				+_ECD_COMMENT = 8
			
 
				+_ECD_LOCATION = 9
			
 
				+
			
 
				+# The "central directory" structure, magic number, size, and indices
			
 
				+# of entries in the structure (section V.F in the format document)
			
 
				+structCentralDir = "<4s4B4HL2L5H2L"
			
 
				+stringCentralDir = b"PK\001\002"
			
 
				+sizeCentralDir = struct.calcsize(structCentralDir)
			
 
				+
			
 
				+# indexes of entries in the central directory structure
			
 
				+_CD_SIGNATURE = 0
			
 
				+_CD_CREATE_VERSION = 1
			
 
				+_CD_CREATE_SYSTEM = 2
			
 
				+_CD_EXTRACT_VERSION = 3
			
 
				+_CD_EXTRACT_SYSTEM = 4
			
 
				+_CD_FLAG_BITS = 5
			
 
				+_CD_COMPRESS_TYPE = 6
			
 
				+_CD_TIME = 7
			
 
				+_CD_DATE = 8
			
 
				+_CD_CRC = 9
			
 
				+_CD_COMPRESSED_SIZE = 10
			
 
				+_CD_UNCOMPRESSED_SIZE = 11
			
 
				+_CD_FILENAME_LENGTH = 12
			
 
				+_CD_EXTRA_FIELD_LENGTH = 13
			
 
				+_CD_COMMENT_LENGTH = 14
			
 
				+_CD_DISK_NUMBER_START = 15
			
 
				+_CD_INTERNAL_FILE_ATTRIBUTES = 16
			
 
				+_CD_EXTERNAL_FILE_ATTRIBUTES = 17
			
 
				+_CD_LOCAL_HEADER_OFFSET = 18
			
 
				+
			
 
				+# The "local file header" structure, magic number, size, and indices
			
 
				+# (section V.A in the format document)
			
 
				+structFileHeader = "<4s2B4HL2L2H"
			
 
				+stringFileHeader = b"PK\003\004"
			
 
				+sizeFileHeader = struct.calcsize(structFileHeader)
			
 
				+
			
 
				+_FH_SIGNATURE = 0
			
 
				+_FH_EXTRACT_VERSION = 1
			
 
				+_FH_EXTRACT_SYSTEM = 2
			
 
				+_FH_GENERAL_PURPOSE_FLAG_BITS = 3
			
 
				+_FH_COMPRESSION_METHOD = 4
			
 
				+_FH_LAST_MOD_TIME = 5
			
 
				+_FH_LAST_MOD_DATE = 6
			
 
				+_FH_CRC = 7
			
 
				+_FH_COMPRESSED_SIZE = 8
			
 
				+_FH_UNCOMPRESSED_SIZE = 9
			
 
				+_FH_FILENAME_LENGTH = 10
			
 
				+_FH_EXTRA_FIELD_LENGTH = 11
			
 
				+
			
 
				+# The "Zip64 end of central directory locator" structure, magic number, and size
			
 
				+structEndArchive64Locator = "<4sLQL"
			
 
				+stringEndArchive64Locator = b"PK\x06\x07"
			
 
				+sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator)
			
 
				+
			
 
				+# The "Zip64 end of central directory" record, magic number, size, and indices
			
 
				+# (section V.G in the format document)
			
 
				+structEndArchive64 = "<4sQ2H2L4Q"
			
 
				+stringEndArchive64 = b"PK\x06\x06"
			
 
				+sizeEndCentDir64 = struct.calcsize(structEndArchive64)
			
 
				+
			
 
				+_CD64_SIGNATURE = 0
			
 
				+_CD64_DIRECTORY_RECSIZE = 1
			
 
				+_CD64_CREATE_VERSION = 2
			
 
				+_CD64_EXTRACT_VERSION = 3
			
 
				+_CD64_DISK_NUMBER = 4
			
 
				+_CD64_DISK_NUMBER_START = 5
			
 
				+_CD64_NUMBER_ENTRIES_THIS_DISK = 6
			
 
				+_CD64_NUMBER_ENTRIES_TOTAL = 7
			
 
				+_CD64_DIRECTORY_SIZE = 8
			
 
				+_CD64_OFFSET_START_CENTDIR = 9
			
 
				+
			
 
				+_DD_SIGNATURE = 0x08074b50
			
 
				+
			
 
				+_EXTRA_FIELD_STRUCT = struct.Struct('<HH')
			
 
				+
			
 
				+def _strip_extra(extra, xids):
			
 
				+    # Remove Extra Fields with specified IDs.
			
 
				+    unpack = _EXTRA_FIELD_STRUCT.unpack
			
 
				+    modified = False
			
 
				+    buffer = []
			
 
				+    start = i = 0
			
 
				+    while i + 4 <= len(extra):
			
 
				+        xid, xlen = unpack(extra[i : i + 4])
			
 
				+        j = i + 4 + xlen
			
 
				+        if xid in xids:
			
 
				+            if i != start:
			
 
				+                buffer.append(extra[start : i])
			
 
				+            start = j
			
 
				+            modified = True
			
 
				+        i = j
			
 
				+    if not modified:
			
 
				+        return extra
			
 
				+    return b''.join(buffer)
			
 
				+
			
 
				+def _check_zipfile(fp):
			
 
				+    try:
			
 
				+        if _EndRecData(fp):
			
 
				+            return True         # file has correct magic number
			
 
				+    except OSError:
			
 
				+        pass
			
 
				+    return False
			
 
				+
			
 
				+def is_zipfile(filename):
			
 
				+    """Quickly see if a file is a ZIP file by checking the magic number.
			
 
				+
			
 
				+    The filename argument may be a file or file-like object too.
			
 
				+    """
			
 
				+    result = False
			
 
				+    try:
			
 
				+        if hasattr(filename, "read"):
			
 
				+            result = _check_zipfile(fp=filename)
			
 
				+        else:
			
 
				+            with open(filename, "rb") as fp:
			
 
				+                result = _check_zipfile(fp)
			
 
				+    except OSError:
			
 
				+        pass
			
 
				+    return result
			
 
				+
			
 
				+def _EndRecData64(fpin, offset, endrec):
			
 
				+    """
			
 
				+    Read the ZIP64 end-of-archive records and use that to update endrec
			
 
				+    """
			
 
				+    try:
			
 
				+        fpin.seek(offset - sizeEndCentDir64Locator, 2)
			
 
				+    except OSError:
			
 
				+        # If the seek fails, the file is not large enough to contain a ZIP64
			
 
				+        # end-of-archive record, so just return the end record we were given.
			
 
				+        return endrec
			
 
				+
			
 
				+    data = fpin.read(sizeEndCentDir64Locator)
			
 
				+    if len(data) != sizeEndCentDir64Locator:
			
 
				+        return endrec
			
 
				+    sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
			
 
				+    if sig != stringEndArchive64Locator:
			
 
				+        return endrec
			
 
				+
			
 
				+    if diskno != 0 or disks > 1:
			
 
				+        raise BadZipFile("zipfiles that span multiple disks are not supported")
			
 
				+
			
 
				+    # Assume no 'zip64 extensible data'
			
 
				+    fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2)
			
 
				+    data = fpin.read(sizeEndCentDir64)
			
 
				+    if len(data) != sizeEndCentDir64:
			
 
				+        return endrec
			
 
				+    sig, sz, create_version, read_version, disk_num, disk_dir, \
			
 
				+        dircount, dircount2, dirsize, diroffset = \
			
 
				+        struct.unpack(structEndArchive64, data)
			
 
				+    if sig != stringEndArchive64:
			
 
				+        return endrec
			
 
				+
			
 
				+    # Update the original endrec using data from the ZIP64 record
			
 
				+    endrec[_ECD_SIGNATURE] = sig
			
 
				+    endrec[_ECD_DISK_NUMBER] = disk_num
			
 
				+    endrec[_ECD_DISK_START] = disk_dir
			
 
				+    endrec[_ECD_ENTRIES_THIS_DISK] = dircount
			
 
				+    endrec[_ECD_ENTRIES_TOTAL] = dircount2
			
 
				+    endrec[_ECD_SIZE] = dirsize
			
 
				+    endrec[_ECD_OFFSET] = diroffset
			
 
				+    return endrec
			
 
				+
			
 
				+
			
 
				+def _EndRecData(fpin):
			
 
				+    """Return data from the "End of Central Directory" record, or None.
			
 
				+
			
 
				+    The data is a list of the nine items in the ZIP "End of central dir"
			
 
				+    record followed by a tenth item, the file seek offset of this record."""
			
 
				+
			
 
				+    # Determine file size
			
 
				+    fpin.seek(0, 2)
			
 
				+    filesize = fpin.tell()
			
 
				+
			
 
				+    # Check to see if this is ZIP file with no archive comment (the
			
 
				+    # "end of central directory" structure should be the last item in the
			
 
				+    # file if this is the case).
			
 
				+    try:
			
 
				+        fpin.seek(-sizeEndCentDir, 2)
			
 
				+    except OSError:
			
 
				+        return None
			
 
				+    data = fpin.read()
			
 
				+    if (len(data) == sizeEndCentDir and
			
 
				+        data[0:4] == stringEndArchive and
			
 
				+        data[-2:] == b"\000\000"):
			
 
				+        # the signature is correct and there's no comment, unpack structure
			
 
				+        endrec = struct.unpack(structEndArchive, data)
			
 
				+        endrec=list(endrec)
			
 
				+
			
 
				+        # Append a blank comment and record start offset
			
 
				+        endrec.append(b"")
			
 
				+        endrec.append(filesize - sizeEndCentDir)
			
 
				+
			
 
				+        # Try to read the "Zip64 end of central directory" structure
			
 
				+        return _EndRecData64(fpin, -sizeEndCentDir, endrec)
			
 
				+
			
 
				+    # Either this is not a ZIP file, or it is a ZIP file with an archive
			
 
				+    # comment.  Search the end of the file for the "end of central directory"
			
 
				+    # record signature. The comment is the last item in the ZIP file and may be
			
 
				+    # up to 64K long.  It is assumed that the "end of central directory" magic
			
 
				+    # number does not appear in the comment.
			
 
				+    maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0)
			
 
				+    fpin.seek(maxCommentStart, 0)
			
 
				+    data = fpin.read()
			
 
				+    start = data.rfind(stringEndArchive)
			
 
				+    if start >= 0:
			
 
				+        # found the magic number; attempt to unpack and interpret
			
 
				+        recData = data[start:start+sizeEndCentDir]
			
 
				+        if len(recData) != sizeEndCentDir:
			
 
				+            # Zip file is corrupted.
			
 
				+            return None
			
 
				+        endrec = list(struct.unpack(structEndArchive, recData))
			
 
				+        commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file
			
 
				+        comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize]
			
 
				+        endrec.append(comment)
			
 
				+        endrec.append(maxCommentStart + start)
			
 
				+
			
 
				+        # Try to read the "Zip64 end of central directory" structure
			
 
				+        return _EndRecData64(fpin, maxCommentStart + start - filesize,
			
 
				+                             endrec)
			
 
				+
			
 
				+    # Unable to find a valid end of central directory structure
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+class ZipInfo (object):
			
 
				+    """Class with attributes describing each file in the ZIP archive."""
			
 
				+
			
 
				+    __slots__ = (
			
 
				+        'orig_filename',
			
 
				+        'filename',
			
 
				+        'date_time',
			
 
				+        'compress_type',
			
 
				+        '_compresslevel',
			
 
				+        'comment',
			
 
				+        'extra',
			
 
				+        'create_system',
			
 
				+        'create_version',
			
 
				+        'extract_version',
			
 
				+        'reserved',
			
 
				+        'flag_bits',
			
 
				+        'volume',
			
 
				+        'internal_attr',
			
 
				+        'external_attr',
			
 
				+        'header_offset',
			
 
				+        'CRC',
			
 
				+        'compress_size',
			
 
				+        'file_size',
			
 
				+        '_raw_time',
			
 
				+        'has_changed_name',
			
 
				+    )
			
 
				+
			
 
				+    def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0), has_changed_name=False):
			
 
				+        self.orig_filename = filename   # Original file name in archive
			
 
				+
			
 
				+        # Terminate the file name at the first null byte.  Null bytes in file
			
 
				+        # names are used as tricks by viruses in archives.
			
 
				+        null_byte = filename.find(chr(0))
			
 
				+        if null_byte >= 0:
			
 
				+            filename = filename[0:null_byte]
			
 
				+        # This is used to ensure paths in generated ZIP files always use
			
 
				+        # forward slashes as the directory separator, as required by the
			
 
				+        # ZIP format specification.
			
 
				+        if os.sep != "/" and os.sep in filename:
			
 
				+            filename = filename.replace(os.sep, "/")
			
 
				+
			
 
				+        self.filename = filename        # Normalized file name
			
 
				+        self.date_time = date_time      # year, month, day, hour, min, sec
			
 
				+
			
 
				+        if date_time[0] < 1980:
			
 
				+            raise ValueError('ZIP does not support timestamps before 1980')
			
 
				+
			
 
				+        # Standard values:
			
 
				+        self.compress_type = ZIP_STORED # Type of compression for the file
			
 
				+        self._compresslevel = None      # Level for the compressor
			
 
				+        self.comment = b""              # Comment for each file
			
 
				+        self.extra = b""                # ZIP extra data
			
 
				+        if sys.platform == 'win32':
			
 
				+            self.create_system = 0          # System which created ZIP archive
			
 
				+        else:
			
 
				+            # Assume everything else is unix-y
			
 
				+            self.create_system = 3          # System which created ZIP archive
			
 
				+        self.create_version = DEFAULT_VERSION  # Version which created ZIP archive
			
 
				+        self.extract_version = DEFAULT_VERSION # Version needed to extract archive
			
 
				+        self.reserved = 0               # Must be zero
			
 
				+        self.flag_bits = 0              # ZIP flag bits
			
 
				+        self.volume = 0                 # Volume number of file header
			
 
				+        self.internal_attr = 0          # Internal attributes
			
 
				+        self.external_attr = 0          # External file attributes
			
 
				+        # Other attributes are set by class ZipFile:
			
 
				+        # header_offset         Byte offset to the file header
			
 
				+        # CRC                   CRC-32 of the uncompressed file
			
 
				+        # compress_size         Size of the compressed file
			
 
				+        # file_size             Size of the uncompressed file
			
 
				+        self.has_changed_name = has_changed_name
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)]
			
 
				+        if self.compress_type != ZIP_STORED:
			
 
				+            result.append(' compress_type=%s' %
			
 
				+                          compressor_names.get(self.compress_type,
			
 
				+                                               self.compress_type))
			
 
				+        hi = self.external_attr >> 16
			
 
				+        lo = self.external_attr & 0xFFFF
			
 
				+        if hi:
			
 
				+            result.append(' filemode=%r' % stat.filemode(hi))
			
 
				+        if lo:
			
 
				+            result.append(' external_attr=%#x' % lo)
			
 
				+        isdir = self.is_dir()
			
 
				+        if not isdir or self.file_size:
			
 
				+            result.append(' file_size=%r' % self.file_size)
			
 
				+        if ((not isdir or self.compress_size) and
			
 
				+            (self.compress_type != ZIP_STORED or
			
 
				+             self.file_size != self.compress_size)):
			
 
				+            result.append(' compress_size=%r' % self.compress_size)
			
 
				+        result.append('>')
			
 
				+        return ''.join(result)
			
 
				+
			
 
				+    def FileHeader(self, zip64=None):
			
 
				+        """Return the per-file header as a bytes object."""
			
 
				+        dt = self.date_time
			
 
				+        dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
			
 
				+        dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
			
 
				+        if self.flag_bits & 0x08:
			
 
				+            # Set these to zero because we write them after the file data
			
 
				+            CRC = compress_size = file_size = 0
			
 
				+        else:
			
 
				+            CRC = self.CRC
			
 
				+            compress_size = self.compress_size
			
 
				+            file_size = self.file_size
			
 
				+
			
 
				+        extra = self.extra
			
 
				+
			
 
				+        min_version = 0
			
 
				+        if zip64 is None:
			
 
				+            zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT
			
 
				+        if zip64:
			
 
				+            fmt = '<HHQQ'
			
 
				+            extra = extra + struct.pack(fmt,
			
 
				+                                        1, struct.calcsize(fmt)-4, file_size, compress_size)
			
 
				+        if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:
			
 
				+            if not zip64:
			
 
				+                raise LargeZipFile("Filesize would require ZIP64 extensions")
			
 
				+            # File is larger than what fits into a 4 byte integer,
			
 
				+            # fall back to the ZIP64 extension
			
 
				+            file_size = 0xffffffff
			
 
				+            compress_size = 0xffffffff
			
 
				+            min_version = ZIP64_VERSION
			
 
				+
			
 
				+        if self.compress_type == ZIP_BZIP2:
			
 
				+            min_version = max(BZIP2_VERSION, min_version)
			
 
				+        elif self.compress_type == ZIP_LZMA:
			
 
				+            min_version = max(LZMA_VERSION, min_version)
			
 
				+
			
 
				+        self.extract_version = max(min_version, self.extract_version)
			
 
				+        self.create_version = max(min_version, self.create_version)
			
 
				+        filename, flag_bits = self._encodeFilenameFlags()
			
 
				+        header = struct.pack(structFileHeader, stringFileHeader,
			
 
				+                             self.extract_version, self.reserved, flag_bits,
			
 
				+                             self.compress_type, dostime, dosdate, CRC,
			
 
				+                             compress_size, file_size,
			
 
				+                             len(filename), len(extra))
			
 
				+        return header + filename + extra
			
 
				+
			
 
				+    def _encodeFilenameFlags(self):
			
 
				+        try:
			
 
				+            return self.filename.encode('ascii'), self.flag_bits
			
 
				+        except UnicodeEncodeError:
			
 
				+            return self.filename.encode('utf-8'), self.flag_bits | 0x800
			
 
				+
			
 
				+    def _decodeExtra(self):
			
 
				+        # Try to decode the extra field.
			
 
				+        extra = self.extra
			
 
				+        unpack = struct.unpack
			
 
				+        while len(extra) >= 4:
			
 
				+            tp, ln = unpack('<HH', extra[:4])
			
 
				+            if ln+4 > len(extra):
			
 
				+                raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln))
			
 
				+            if tp == 0x0001:
			
 
				+                if ln >= 24:
			
 
				+                    counts = unpack('<QQQ', extra[4:28])
			
 
				+                elif ln == 16:
			
 
				+                    counts = unpack('<QQ', extra[4:20])
			
 
				+                elif ln == 8:
			
 
				+                    counts = unpack('<Q', extra[4:12])
			
 
				+                elif ln == 0:
			
 
				+                    counts = ()
			
 
				+                else:
			
 
				+                    raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln))
			
 
				+
			
 
				+                idx = 0
			
 
				+
			
 
				+                # ZIP64 extension (large files and/or large archives)
			
 
				+                if self.file_size in (0xffffffffffffffff, 0xffffffff):
			
 
				+                    if len(counts) <= idx:
			
 
				+                        raise BadZipFile(
			
 
				+                            "Corrupt zip64 extra field. File size not found."
			
 
				+                        )
			
 
				+                    self.file_size = counts[idx]
			
 
				+                    idx += 1
			
 
				+
			
 
				+                if self.compress_size == 0xFFFFFFFF:
			
 
				+                    if len(counts) <= idx:
			
 
				+                        raise BadZipFile(
			
 
				+                            "Corrupt zip64 extra field. Compress size not found."
			
 
				+                        )
			
 
				+                    self.compress_size = counts[idx]
			
 
				+                    idx += 1
			
 
				+
			
 
				+                if self.header_offset == 0xffffffff:
			
 
				+                    if len(counts) <= idx:
			
 
				+                        raise BadZipFile(
			
 
				+                            "Corrupt zip64 extra field. Header offset not found."
			
 
				+                        )
			
 
				+                    old = self.header_offset
			
 
				+                    self.header_offset = counts[idx]
			
 
				+                    idx+=1
			
 
				+
			
 
				+            extra = extra[ln+4:]
			
 
				+
			
 
				+    @classmethod
			
 
				+    def from_file(cls, filename, arcname=None):
			
 
				+        """Construct an appropriate ZipInfo for a file on the filesystem.
			
 
				+
			
 
				+        filename should be the path to a file or directory on the filesystem.
			
 
				+
			
 
				+        arcname is the name which it will have within the archive (by default,
			
 
				+        this will be the same as filename, but without a drive letter and with
			
 
				+        leading path separators removed).
			
 
				+        """
			
 
				+        if isinstance(filename, os.PathLike):
			
 
				+            filename = os.fspath(filename)
			
 
				+        st = os.stat(filename)
			
 
				+        isdir = stat.S_ISDIR(st.st_mode)
			
 
				+        mtime = time.localtime(st.st_mtime)
			
 
				+        date_time = mtime[0:6]
			
 
				+        # Create ZipInfo instance to store file information
			
 
				+        if arcname is None:
			
 
				+            arcname = filename
			
 
				+        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
			
 
				+        while arcname[0] in (os.sep, os.altsep):
			
 
				+            arcname = arcname[1:]
			
 
				+        if isdir:
			
 
				+            arcname += '/'
			
 
				+        zinfo = cls(arcname, date_time)
			
 
				+        zinfo.external_attr = (st.st_mode & 0xFFFF) << 16  # Unix attributes
			
 
				+        if isdir:
			
 
				+            zinfo.file_size = 0
			
 
				+            zinfo.external_attr |= 0x10  # MS-DOS directory flag
			
 
				+        else:
			
 
				+            zinfo.file_size = st.st_size
			
 
				+
			
 
				+        return zinfo
			
 
				+
			
 
				+    def is_dir(self):
			
 
				+        """Return True if this archive member is a directory."""
			
 
				+        return self.filename[-1] == '/'
			
 
				+
			
 
				+
			
 
				+# ZIP encryption uses the CRC32 one-byte primitive for scrambling some
			
 
				+# internal keys. We noticed that a direct implementation is faster than
			
 
				+# relying on binascii.crc32().
			
 
				+
			
 
				+_crctable = None
			
 
				+def _gen_crc(crc):
			
 
				+    for j in range(8):
			
 
				+        if crc & 1:
			
 
				+            crc = (crc >> 1) ^ 0xEDB88320
			
 
				+        else:
			
 
				+            crc >>= 1
			
 
				+    return crc
			
 
				+
			
 
				+# ZIP supports a password-based form of encryption. Even though known
			
 
				+# plaintext attacks have been found against it, it is still useful
			
 
				+# to be able to get data out of such a file.
			
 
				+#
			
 
				+# Usage:
			
 
				+#     zd = _ZipDecrypter(mypwd)
			
 
				+#     plain_bytes = zd(cypher_bytes)
			
 
				+
			
 
				+def _ZipDecrypter(pwd):
			
 
				+    key0 = 305419896
			
 
				+    key1 = 591751049
			
 
				+    key2 = 878082192
			
 
				+
			
 
				+    global _crctable
			
 
				+    if _crctable is None:
			
 
				+        _crctable = list(map(_gen_crc, range(256)))
			
 
				+    crctable = _crctable
			
 
				+
			
 
				+    def crc32(ch, crc):
			
 
				+        """Compute the CRC32 primitive on one byte."""
			
 
				+        return (crc >> 8) ^ crctable[(crc ^ ch) & 0xFF]
			
 
				+
			
 
				+    def update_keys(c):
			
 
				+        nonlocal key0, key1, key2
			
 
				+        key0 = crc32(c, key0)
			
 
				+        key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF
			
 
				+        key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF
			
 
				+        key2 = crc32(key1 >> 24, key2)
			
 
				+
			
 
				+    for p in pwd:
			
 
				+        update_keys(p)
			
 
				+
			
 
				+    def decrypter(data):
			
 
				+        """Decrypt a bytes object."""
			
 
				+        result = bytearray()
			
 
				+        append = result.append
			
 
				+        for c in data:
			
 
				+            k = key2 | 2
			
 
				+            c ^= ((k * (k^1)) >> 8) & 0xFF
			
 
				+            update_keys(c)
			
 
				+            append(c)
			
 
				+        return bytes(result)
			
 
				+
			
 
				+    return decrypter
			
 
				+
			
 
				+
			
 
				+class LZMACompressor:
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self._comp = None
			
 
				+
			
 
				+    def _init(self):
			
 
				+        props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1})
			
 
				+        self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[
			
 
				+            lzma._decode_filter_properties(lzma.FILTER_LZMA1, props)
			
 
				+        ])
			
 
				+        return struct.pack('<BBH', 9, 4, len(props)) + props
			
 
				+
			
 
				+    def compress(self, data):
			
 
				+        if self._comp is None:
			
 
				+            return self._init() + self._comp.compress(data)
			
 
				+        return self._comp.compress(data)
			
 
				+
			
 
				+    def flush(self):
			
 
				+        if self._comp is None:
			
 
				+            return self._init() + self._comp.flush()
			
 
				+        return self._comp.flush()
			
 
				+
			
 
				+
			
 
				+class LZMADecompressor:
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self._decomp = None
			
 
				+        self._unconsumed = b''
			
 
				+        self.eof = False
			
 
				+
			
 
				+    def decompress(self, data):
			
 
				+        if self._decomp is None:
			
 
				+            self._unconsumed += data
			
 
				+            if len(self._unconsumed) <= 4:
			
 
				+                return b''
			
 
				+            psize, = struct.unpack('<H', self._unconsumed[2:4])
			
 
				+            if len(self._unconsumed) <= 4 + psize:
			
 
				+                return b''
			
 
				+
			
 
				+            self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[
			
 
				+                lzma._decode_filter_properties(lzma.FILTER_LZMA1,
			
 
				+                                               self._unconsumed[4:4 + psize])
			
 
				+            ])
			
 
				+            data = self._unconsumed[4 + psize:]
			
 
				+            del self._unconsumed
			
 
				+
			
 
				+        result = self._decomp.decompress(data)
			
 
				+        self.eof = self._decomp.eof
			
 
				+        return result
			
 
				+
			
 
				+
			
 
				+compressor_names = {
			
 
				+    0: 'store',
			
 
				+    1: 'shrink',
			
 
				+    2: 'reduce',
			
 
				+    3: 'reduce',
			
 
				+    4: 'reduce',
			
 
				+    5: 'reduce',
			
 
				+    6: 'implode',
			
 
				+    7: 'tokenize',
			
 
				+    8: 'deflate',
			
 
				+    9: 'deflate64',
			
 
				+    10: 'implode',
			
 
				+    12: 'bzip2',
			
 
				+    14: 'lzma',
			
 
				+    18: 'terse',
			
 
				+    19: 'lz77',
			
 
				+    97: 'wavpack',
			
 
				+    98: 'ppmd',
			
 
				+}
			
 
				+
			
 
				+def _check_compression(compression):
			
 
				+    if compression == ZIP_STORED:
			
 
				+        pass
			
 
				+    elif compression == ZIP_DEFLATED:
			
 
				+        if not zlib:
			
 
				+            raise RuntimeError(
			
 
				+                "Compression requires the (missing) zlib module")
			
 
				+    elif compression == ZIP_BZIP2:
			
 
				+        if not bz2:
			
 
				+            raise RuntimeError(
			
 
				+                "Compression requires the (missing) bz2 module")
			
 
				+    elif compression == ZIP_LZMA:
			
 
				+        if not lzma:
			
 
				+            raise RuntimeError(
			
 
				+                "Compression requires the (missing) lzma module")
			
 
				+    else:
			
 
				+        raise NotImplementedError("That compression method is not supported")
			
 
				+
			
 
				+
			
 
				+def _get_compressor(compress_type, compresslevel=None):
			
 
				+    if compress_type == ZIP_DEFLATED:
			
 
				+        if compresslevel is not None:
			
 
				+            return zlib.compressobj(compresslevel, zlib.DEFLATED, -15)
			
 
				+        return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15)
			
 
				+    elif compress_type == ZIP_BZIP2:
			
 
				+        if compresslevel is not None:
			
 
				+            return bz2.BZ2Compressor(compresslevel)
			
 
				+        return bz2.BZ2Compressor()
			
 
				+    # compresslevel is ignored for ZIP_LZMA
			
 
				+    elif compress_type == ZIP_LZMA:
			
 
				+        return LZMACompressor()
			
 
				+    else:
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def _get_decompressor(compress_type):
			
 
				+    if compress_type == ZIP_STORED:
			
 
				+        return None
			
 
				+    elif compress_type == ZIP_DEFLATED:
			
 
				+        return zlib.decompressobj(-15)
			
 
				+    elif compress_type == ZIP_BZIP2:
			
 
				+        return bz2.BZ2Decompressor()
			
 
				+    elif compress_type == ZIP_LZMA:
			
 
				+        return LZMADecompressor()
			
 
				+    else:
			
 
				+        descr = compressor_names.get(compress_type)
			
 
				+        if descr:
			
 
				+            raise NotImplementedError("compression type %d (%s)" % (compress_type, descr))
			
 
				+        else:
			
 
				+            raise NotImplementedError("compression type %d" % (compress_type,))
			
 
				+
			
 
				+
			
 
				+class _SharedFile:
			
 
				+    def __init__(self, file, pos, close, lock, writing):
			
 
				+        self._file = file
			
 
				+        self._pos = pos
			
 
				+        self._close = close
			
 
				+        self._lock = lock
			
 
				+        self._writing = writing
			
 
				+        self.seekable = file.seekable
			
 
				+        self.tell = file.tell
			
 
				+
			
 
				+    def seek(self, offset, whence=0):
			
 
				+        with self._lock:
			
 
				+            if self._writing():
			
 
				+                raise ValueError("Can't reposition in the ZIP file while "
			
 
				+                        "there is an open writing handle on it. "
			
 
				+                        "Close the writing handle before trying to read.")
			
 
				+            self._file.seek(offset, whence)
			
 
				+            self._pos = self._file.tell()
			
 
				+            return self._pos
			
 
				+
			
 
				+    def read(self, n=-1):
			
 
				+        with self._lock:
			
 
				+            if self._writing():
			
 
				+                raise ValueError("Can't read from the ZIP file while there "
			
 
				+                        "is an open writing handle on it. "
			
 
				+                        "Close the writing handle before trying to read.")
			
 
				+            self._file.seek(self._pos)
			
 
				+            data = self._file.read(n)
			
 
				+            self._pos = self._file.tell()
			
 
				+            return data
			
 
				+
			
 
				+    def close(self):
			
 
				+        if self._file is not None:
			
 
				+            fileobj = self._file
			
 
				+            self._file = None
			
 
				+            self._close(fileobj)
			
 
				+
			
 
				+# Provide the tell method for unseekable stream
			
 
				+class _Tellable:
			
 
				+    def __init__(self, fp):
			
 
				+        self.fp = fp
			
 
				+        self.offset = 0
			
 
				+
			
 
				+    def write(self, data):
			
 
				+        n = self.fp.write(data)
			
 
				+        self.offset += n
			
 
				+        return n
			
 
				+
			
 
				+    def tell(self):
			
 
				+        return self.offset
			
 
				+
			
 
				+    def flush(self):
			
 
				+        self.fp.flush()
			
 
				+
			
 
				+    def close(self):
			
 
				+        self.fp.close()
			
 
				+
			
 
				+
			
 
				+class ZipExtFile(io.BufferedIOBase):
			
 
				+    """File-like object for reading an archive member.
			
 
				+       Is returned by ZipFile.open().
			
 
				+    """
			
 
				+
			
 
				+    # Max size supported by decompressor.
			
 
				+    MAX_N = 1 << 31 - 1
			
 
				+
			
 
				+    # Read from compressed files in 4k blocks.
			
 
				+    MIN_READ_SIZE = 4096
			
 
				+
			
 
				+    # Chunk size to read during seek
			
 
				+    MAX_SEEK_READ = 1 << 24
			
 
				+
			
 
    def __init__(self, fileobj, mode, zipinfo, pwd=None,
                 close_fileobj=False):
        """Prepare to read the member described by *zipinfo* from *fileobj*.

        fileobj is expected to be positioned at the start of the member's
        data.  When pwd is truthy, the encryption header is read at once and
        RuntimeError is raised if its check byte does not match.
        """
        self._fileobj = fileobj
        self._pwd = pwd
        self._close_fileobj = close_fileobj

        # Remaining compressed / uncompressed byte counts.
        self._compress_type = zipinfo.compress_type
        self._compress_left = zipinfo.compress_size
        self._left = zipinfo.file_size

        self._decompressor = _get_decompressor(self._compress_type)

        # Read buffer and the offset of the first unread byte inside it.
        self._eof = False
        self._readbuffer = b''
        self._offset = 0

        self.newlines = None

        self.mode = mode
        self.name = zipinfo.filename

        # CRC verification only happens when zipinfo carries a CRC.
        has_crc = hasattr(zipinfo, 'CRC')
        if has_crc:
            self._expected_crc = zipinfo.CRC
            self._running_crc = crc32(b'')
        else:
            self._expected_crc = None

        self._seekable = False
        try:
            if fileobj.seekable():
                # Remember the starting state.
                self._orig_compress_start = fileobj.tell()
                self._orig_compress_size = zipinfo.compress_size
                self._orig_file_size = zipinfo.file_size
                self._orig_start_crc = self._running_crc
                self._seekable = True
        except AttributeError:
            # Raised e.g. by a fileobj without seekable(), or when
            # _running_crc was never set above; the member then simply
            # stays non-seekable.
            pass

        self._decrypter = None
        if not pwd:
            return

        if zipinfo.flag_bits & 0x8:
            # compare against the file type from extended local headers
            expected_check = (zipinfo._raw_time >> 8) & 0xff
        else:
            # compare against the CRC otherwise
            expected_check = (zipinfo.CRC >> 24) & 0xff
        actual_check = self._init_decrypter()
        if actual_check != expected_check:
            raise RuntimeError("Bad password for file %r" % zipinfo.orig_filename)
			
 
				+
			
 
				+
			
 
    def _init_decrypter(self):
        """Set up decryption and return the password check byte.

        Creates the decrypter from self._pwd, consumes the 12-byte
        encryption header from the file object and returns the last byte of
        the decrypted header, which the caller compares with the expected
        value to validate the password.
        """
        self._decrypter = _ZipDecrypter(self._pwd)
        # The first 12 bytes in the cypher stream is an encryption header
        #  used to strengthen the algorithm. The first 11 bytes are
        #  completely random, while the 12th contains the MSB of the CRC,
        #  or the MSB of the file time depending on the header type
        #  and is used to check the correctness of the password.
        encrypted_header = self._fileobj.read(12)
        self._compress_left -= 12
        decrypted_header = self._decrypter(encrypted_header)
        return decrypted_header[11]
			
 
				+
			
 
    def __repr__(self):
        """Return '<module.Class name=... mode=...>' or '<... [closed]>'.

        The compression type is included for open files unless it is
        ZIP_STORED.
        """
        cls = self.__class__
        pieces = ['<%s.%s' % (cls.__module__, cls.__qualname__)]
        if self.closed:
            pieces.append(' [closed]')
        else:
            pieces.append(' name=%r mode=%r' % (self.name, self.mode))
            if self._compress_type != ZIP_STORED:
                pieces.append(' compress_type=%s' %
                              compressor_names.get(self._compress_type,
                                                   self._compress_type))
        pieces.append('>')
        return ''.join(pieces)
			
 
				+
			
 
				+    def readline(self, limit=-1):
			
 
				+        """Read and return a line from the stream.
			
 
				+
			
 
				+        If limit is specified, at most limit bytes will be read.
			
 
				+        """
			
 
				+
			
 
				+        if limit < 0:
			
 
				+            # Shortcut common case - newline found in buffer.
			
 
				+            i = self._readbuffer.find(b'\n', self._offset) + 1
			
 
				+            if i > 0:
			
 
				+                line = self._readbuffer[self._offset: i]
			
 
				+                self._offset = i
			
 
				+                return line
			
 
				+
			
 
				+        return io.BufferedIOBase.readline(self, limit)
			
 
				+
			
 
				+    def peek(self, n=1):
			
 
				+        """Returns buffered bytes without advancing the position."""
			
 
				+        if n > len(self._readbuffer) - self._offset:
			
 
				+            chunk = self.read(n)
			
 
				+            if len(chunk) > self._offset:
			
 
				+                self._readbuffer = chunk + self._readbuffer[self._offset:]
			
 
				+                self._offset = 0
			
 
				+            else:
			
 
				+                self._offset -= len(chunk)
			
 
				+
			
 
				+        # Return up to 512 bytes to reduce allocation overhead for tight loops.
			
 
				+        return self._readbuffer[self._offset: self._offset + 512]
			
 
				+
			
 
				+    def readable(self):
			
 
				+        return True
			
 
				+
			
 
				+    def read(self, n=-1):
			
 
				+        """Read and return up to n bytes.
			
 
				+        If the argument is omitted, None, or negative, data is read and returned until EOF is reached.
			
 
				+        """
			
 
				+        if n is None or n < 0:
			
 
				+            buf = self._readbuffer[self._offset:]
			
 
				+            self._readbuffer = b''
			
 
				+            self._offset = 0
			
 
				+            while not self._eof:
			
 
				+                buf += self._read1(self.MAX_N)
			
 
				+            return buf
			
 
				+
			
 
				+        end = n + self._offset
			
 
				+        if end < len(self._readbuffer):
			
 
				+            buf = self._readbuffer[self._offset:end]
			
 
				+            self._offset = end
			
 
				+            return buf
			
 
				+
			
 
				+        n = end - len(self._readbuffer)
			
 
				+        buf = self._readbuffer[self._offset:]
			
 
				+        self._readbuffer = b''
			
 
				+        self._offset = 0
			
 
				+        while n > 0 and not self._eof:
			
 
				+            data = self._read1(n)
			
 
				+            if n < len(data):
			
 
				+                self._readbuffer = data
			
 
				+                self._offset = n
			
 
				+                buf += data[:n]
			
 
				+                break
			
 
				+            buf += data
			
 
				+            n -= len(data)
			
 
				+        return buf
			
 
				+
			
 
				+    def _update_crc(self, newdata):
			
 
				+        """Fold newdata into the running CRC-32 and verify it at EOF.
			
 
				+
			
 
				+        Raises BadZipFile when the EOF flag is set and the running CRC
			
 
				+        differs from the expected one.
			
 
				+        """
			
 
				+        # Update the CRC using the given data.
			
 
				+        if self._expected_crc is None:
			
 
				+            # No need to compute the CRC if we don't have a reference value
			
 
				+            return
			
 
				+        self._running_crc = crc32(newdata, self._running_crc)
			
 
				+        # Check the CRC if we're at the end of the file
			
 
				+        if self._eof and self._running_crc != self._expected_crc:
			
 
				+            raise BadZipFile("Bad CRC-32 for file %r" % self.name)
			
 
				+
			
 
				+    def read1(self, n):
			
 
				+        """Read up to n bytes with at most one read() system call.
			
 
				+
			
 
				+        If n is None or negative, return the buffered remainder plus the
			
 
				+        first non-empty chunk obtained from _read1().
			
 
				+        """
			
 
				+
			
 
				+        if n is None or n < 0:
			
 
				+            buf = self._readbuffer[self._offset:]
			
 
				+            self._readbuffer = b''
			
 
				+            self._offset = 0
			
 
				+            while not self._eof:
			
 
				+                data = self._read1(self.MAX_N)
			
 
				+                if data:
			
 
				+                    # Stop after the first chunk that yields any data.
			
 
				+                    buf += data
			
 
				+                    break
			
 
				+            return buf
			
 
				+
			
 
				+        end = n + self._offset
			
 
				+        if end < len(self._readbuffer):
			
 
				+            # Fast path: the request is served entirely from the buffer.
			
 
				+            buf = self._readbuffer[self._offset:end]
			
 
				+            self._offset = end
			
 
				+            return buf
			
 
				+
			
 
				+        # n becomes the number of bytes still missing beyond the buffer.
			
 
				+        n = end - len(self._readbuffer)
			
 
				+        buf = self._readbuffer[self._offset:]
			
 
				+        self._readbuffer = b''
			
 
				+        self._offset = 0
			
 
				+        if n > 0:
			
 
				+            while not self._eof:
			
 
				+                data = self._read1(n)
			
 
				+                if n < len(data):
			
 
				+                    # Surplus data stays buffered for the next call.
			
 
				+                    self._readbuffer = data
			
 
				+                    self._offset = n
			
 
				+                    buf += data[:n]
			
 
				+                    break
			
 
				+                if data:
			
 
				+                    buf += data
			
 
				+                    break
			
 
				+        return buf
			
 
				+
			
 
				+    def _read1(self, n):
			
 
				+        """Return one chunk of decrypted, decompressed data (may be empty).
			
 
				+
			
 
				+        Updates the EOF flag, the remaining-size counter and the running CRC.
			
 
				+        """
			
 
				+        # Read up to n compressed bytes with at most one read() system call,
			
 
				+        # decrypt and decompress them.
			
 
				+        if self._eof or n <= 0:
			
 
				+            return b''
			
 
				+
			
 
				+        # Read from file.
			
 
				+        if self._compress_type == ZIP_DEFLATED:
			
 
				+            ## Handle unconsumed data.
			
 
				+            data = self._decompressor.unconsumed_tail
			
 
				+            if n > len(data):
			
 
				+                data += self._read2(n - len(data))
			
 
				+        else:
			
 
				+            data = self._read2(n)
			
 
				+
			
 
				+        if self._compress_type == ZIP_STORED:
			
 
				+            # Stored data needs no decompression; EOF once all input is read.
			
 
				+            self._eof = self._compress_left <= 0
			
 
				+        elif self._compress_type == ZIP_DEFLATED:
			
 
				+            n = max(n, self.MIN_READ_SIZE)
			
 
				+            data = self._decompressor.decompress(data, n)
			
 
				+            # `and` binds tighter than `or`: EOF when the decompressor has
			
 
				+            # finished, or when no compressed input is left and nothing is
			
 
				+            # pending in unconsumed_tail.
			
 
				+            self._eof = (self._decompressor.eof or
			
 
				+                         self._compress_left <= 0 and
			
 
				+                         not self._decompressor.unconsumed_tail)
			
 
				+            if self._eof:
			
 
				+                data += self._decompressor.flush()
			
 
				+        else:
			
 
				+            data = self._decompressor.decompress(data)
			
 
				+            self._eof = self._decompressor.eof or self._compress_left <= 0
			
 
				+
			
 
				+        # Never hand out more than the remaining uncompressed size.
			
 
				+        data = data[:self._left]
			
 
				+        self._left -= len(data)
			
 
				+        if self._left <= 0:
			
 
				+            self._eof = True
			
 
				+        self._update_crc(data)
			
 
				+        return data
			
 
				+
			
 
				+    def _read2(self, n):
			
 
				+        """Read raw bytes from the underlying file, decrypting if needed.
			
 
				+
			
 
				+        The request is raised to at least MIN_READ_SIZE and capped at the
			
 
				+        number of compressed bytes left. Returns b'' when none are left.
			
 
				+        Raises EOFError if the file yields no data although some remain.
			
 
				+        """
			
 
				+        if self._compress_left <= 0:
			
 
				+            return b''
			
 
				+
			
 
				+        n = max(n, self.MIN_READ_SIZE)
			
 
				+        n = min(n, self._compress_left)
			
 
				+
			
 
				+        data = self._fileobj.read(n)
			
 
				+        self._compress_left -= len(data)
			
 
				+        if not data:
			
 
				+            raise EOFError
			
 
				+
			
 
				+        if self._decrypter is not None:
			
 
				+            data = self._decrypter(data)
			
 
				+        return data
			
 
				+
			
 
				+    def close(self):
			
 
				+        """Close the underlying file if _close_fileobj is set, then close self."""
			
 
				+        try:
			
 
				+            if self._close_fileobj:
			
 
				+                self._fileobj.close()
			
 
				+        finally:
			
 
				+            # Always run the base-class close, even if the line above raised.
			
 
				+            super().close()
			
 
				+
			
 
				+    def seekable(self):
			
 
				+        """Return the seekability flag stored in self._seekable."""
			
 
				+        return self._seekable
			
 
				+
			
 
				+    def seek(self, offset, whence=0):
			
 
				+        """Move to a position in the uncompressed data and return it.
			
 
				+
			
 
				+        whence is 0 (from start), 1 (from current position) or 2 (from end).
			
 
				+        The target is clamped to the range [0, original file size].
			
 
				+        Raises io.UnsupportedOperation if the stream is not seekable and
			
 
				+        ValueError for any other whence value.
			
 
				+        """
			
 
				+        if not self._seekable:
			
 
				+            raise io.UnsupportedOperation("underlying stream is not seekable")
			
 
				+        curr_pos = self.tell()
			
 
				+        if whence == 0: # Seek from start of file
			
 
				+            new_pos = offset
			
 
				+        elif whence == 1: # Seek from current position
			
 
				+            new_pos = curr_pos + offset
			
 
				+        elif whence == 2: # Seek from EOF
			
 
				+            new_pos = self._orig_file_size + offset
			
 
				+        else:
			
 
				+            raise ValueError("whence must be os.SEEK_SET (0), "
			
 
				+                             "os.SEEK_CUR (1), or os.SEEK_END (2)")
			
 
				+
			
 
				+        if new_pos > self._orig_file_size:
			
 
				+            new_pos = self._orig_file_size
			
 
				+
			
 
				+        if new_pos < 0:
			
 
				+            new_pos = 0
			
 
				+
			
 
				+        read_offset = new_pos - curr_pos
			
 
				+        buff_offset = read_offset + self._offset
			
 
				+
			
 
				+        if buff_offset >= 0 and buff_offset < len(self._readbuffer):
			
 
				+            # Just move the _offset index if the new position is in the _readbuffer
			
 
				+            self._offset = buff_offset
			
 
				+            read_offset = 0
			
 
				+        elif read_offset < 0:
			
 
				+            # Position is before the current position. Reset the ZipExtFile
			
 
				+            self._fileobj.seek(self._orig_compress_start)
			
 
				+            self._running_crc = self._orig_start_crc
			
 
				+            self._compress_left = self._orig_compress_size
			
 
				+            self._left = self._orig_file_size
			
 
				+            self._readbuffer = b''
			
 
				+            self._offset = 0
			
 
				+            self._decompressor = _get_decompressor(self._compress_type)
			
 
				+            self._eof = False
			
 
				+            # After the reset the whole distance from the start must be read.
			
 
				+            read_offset = new_pos
			
 
				+            if self._decrypter is not None:
			
 
				+                self._init_decrypter()
			
 
				+
			
 
				+        # Move forward by reading and discarding data in bounded chunks.
			
 
				+        while read_offset > 0:
			
 
				+            read_len = min(self.MAX_SEEK_READ, read_offset)
			
 
				+            self.read(read_len)
			
 
				+            read_offset -= read_len
			
 
				+
			
 
				+        return self.tell()
			
 
				+
			
 
				+    def tell(self):
			
 
				+        """Return the current position in the uncompressed data.
			
 
				+
			
 
				+        Raises io.UnsupportedOperation if the stream is not seekable.
			
 
				+        """
			
 
				+        if not self._seekable:
			
 
				+            raise io.UnsupportedOperation("underlying stream is not seekable")
			
 
				+        # Bytes produced so far (size - left) minus the part of the buffer
			
 
				+        # that has not been handed out yet (len(buffer) - offset).
			
 
				+        filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset
			
 
				+        return filepos
			
 
				+
			
 
				+
			
 
				+class _ZipWriteFile(io.BufferedIOBase):
			
 
				+    def __init__(self, zf, zinfo, zip64):
			
 
				+        self._zinfo = zinfo
			
 
				+        self._zip64 = zip64
			
 
				+        self._zipfile = zf
			
 
				+        self._compressor = _get_compressor(zinfo.compress_type,
			
 
				+                                           zinfo._compresslevel)
			
 
				+        self._file_size = 0
			
 
				+        self._compress_size = 0
			
 
				+        self._crc = 0
			
 
				+
			
 
				+    @property
			
 
				+    def _fileobj(self):
			
 
				+        return self._zipfile.fp
			
 
				+
			
 
				+    def writable(self):
			
 
				+        return True
			
 
				+
			
 
				+    def write(self, data):
			
 
				+        if self.closed:
			
 
				+            raise ValueError('I/O operation on closed file.')
			
 
				+        nbytes = len(data)
			
 
				+        self._file_size += nbytes
			
 
				+        self._crc = crc32(data, self._crc)
			
 
				+        if self._compressor:
			
 
				+            data = self._compressor.compress(data)
			
 
				+            self._compress_size += len(data)
			
 
				+        self._fileobj.write(data)
			
 
				+        return nbytes
			
 
				+
			
 
				+    def close(self):
			
 
				+        if self.closed:
			
 
				+            return
			
 
				+        try:
			
 
				+            super().close()
			
 
				+            # Flush any data from the compressor, and update header info
			
 
				+            if self._compressor:
			
 
				+                buf = self._compressor.flush()
			
 
				+                self._compress_size += len(buf)
			
 
				+                self._fileobj.write(buf)
			
 
				+                self._zinfo.compress_size = self._compress_size
			
 
				+            else:
			
 
				+                self._zinfo.compress_size = self._file_size
			
 
				+            self._zinfo.CRC = self._crc
			
 
				+            self._zinfo.file_size = self._file_size
			
 
				+
			
 
				+            # Write updated header info
			
 
				+            if self._zinfo.flag_bits & 0x08:
			
 
				+                # Write CRC and file sizes after the file data
			
 
				+                fmt = '<LLQQ' if self._zip64 else '<LLLL'
			
 
				+                self._fileobj.write(struct.pack(fmt, _DD_SIGNATURE, self._zinfo.CRC,
			
 
				+                    self._zinfo.compress_size, self._zinfo.file_size))
			
 
				+                self._zipfile.start_dir = self._fileobj.tell()
			
 
				+            else:
			
 
				+                if not self._zip64:
			
 
				+                    if self._file_size > ZIP64_LIMIT:
			
 
				+                        raise RuntimeError(
			
 
				+                            'File size unexpectedly exceeded ZIP64 limit')
			
 
				+                    if self._compress_size > ZIP64_LIMIT:
			
 
				+                        raise RuntimeError(
			
 
				+                            'Compressed size unexpectedly exceeded ZIP64 limit')
			
 
				+                # Seek backwards and write file header (which will now include
			
 
				+                # correct CRC and file sizes)
			
 
				+
			
 
				+                # Preserve current position in file
			
 
				+                self._zipfile.start_dir = self._fileobj.tell()
			
 
				+                self._fileobj.seek(self._zinfo.header_offset)
			
 
				+                self._fileobj.write(self._zinfo.FileHeader(self._zip64))
			
 
				+                self._fileobj.seek(self._zipfile.start_dir)
			
 
				+
			
 
				+            # Successfully written: Add file to our caches
			
 
				+            self._zipfile.filelist.append(self._zinfo)
			
 
				+            self._zipfile.NameToInfo[self._zinfo.filename] = self._zinfo
			
 
				+        finally:
			
 
				+            self._zipfile._writing = False
			
 
				+
			
 
				+
			
 
				+
			
 
				+class ZipFile:
			
 
				+    """ Class with methods to open, read, write, close, list zip files.
			
 
				+
			
 
				+    z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True,
			
 
				+                compresslevel=None)
			
 
				+
			
 
				+    file: Either the path to the file, or a file-like object.
			
 
				+          If it is a path, the file will be opened and closed by ZipFile.
			
 
				+    mode: The mode can be either read 'r', write 'w', exclusive create 'x',
			
 
				+          or append 'a'.
			
 
				+    compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib),
			
 
				+                 ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma).
			
 
				+    allowZip64: if True ZipFile will create files with ZIP64 extensions when
			
 
				+                needed, otherwise it will raise an exception when this would
			
 
				+                be necessary.
			
 
				+    compresslevel: None (default for the given compression type) or an integer
			
 
				+                   specifying the level to pass to the compressor.
			
 
				+                   When using ZIP_STORED or ZIP_LZMA this keyword has no effect.
			
 
				+                   When using ZIP_DEFLATED integers 0 through 9 are accepted.
			
 
				+                   When using ZIP_BZIP2 integers 1 through 9 are accepted.
			
 
				+
			
 
				+    """
			
 
				+
			
 
				+    fp = None                   # Set here since __del__ checks it
			
 
				+    _windows_illegal_name_trans_table = None
			
 
				+
			
 
				+    def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
			
 
				+                 compresslevel=None, has_changed_name=False):
			
 
				+        """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
			
 
				+        or append 'a'.
			
 
				+
			
 
				+        has_changed_name is stored on the instance and forwarded to every
			
 
				+        ZipInfo built while reading the central directory.
			
 
				+        Raises ValueError for an unsupported mode. On any failure while
			
 
				+        setting up, the file reference is released and the error re-raised.
			
 
				+        """
			
 
				+        if mode not in ('r', 'w', 'x', 'a'):
			
 
				+            raise ValueError("ZipFile requires mode 'r', 'w', 'x', or 'a'")
			
 
				+
			
 
				+        _check_compression(compression)
			
 
				+
			
 
				+        self._allowZip64 = allowZip64
			
 
				+        self._didModify = False
			
 
				+        self.debug = 0  # Level of printing: 0 through 3
			
 
				+        self.NameToInfo = {}    # Find file info given name
			
 
				+        self.filelist = []      # List of ZipInfo instances for archive
			
 
				+        self.compression = compression  # Method of compression
			
 
				+        self.compresslevel = compresslevel
			
 
				+        self.mode = mode
			
 
				+        self.pwd = None
			
 
				+        self._comment = b''
			
 
				+        self.has_changed_name = has_changed_name
			
 
				+
			
 
				+        # Check if we were passed a file-like object
			
 
				+        if isinstance(file, os.PathLike):
			
 
				+            file = os.fspath(file)
			
 
				+        if isinstance(file, str):
			
 
				+            # No, it's a filename
			
 
				+            self._filePassed = 0
			
 
				+            self.filename = file
			
 
				+            # modeDict doubles as a fallback chain: when opening fails with
			
 
				+            # OSError, the current file mode is looked up again to get the
			
 
				+            # next mode to try (e.g. 'r+b' -> 'w+b' -> 'wb').
			
 
				+            modeDict = {'r' : 'rb', 'w': 'w+b', 'x': 'x+b', 'a' : 'r+b',
			
 
				+                        'r+b': 'w+b', 'w+b': 'wb', 'x+b': 'xb'}
			
 
				+            filemode = modeDict[mode]
			
 
				+            while True:
			
 
				+                try:
			
 
				+                    self.fp = io.open(file, filemode)
			
 
				+                except OSError:
			
 
				+                    if filemode in modeDict:
			
 
				+                        filemode = modeDict[filemode]
			
 
				+                        continue
			
 
				+                    raise
			
 
				+                break
			
 
				+        else:
			
 
				+            self._filePassed = 1
			
 
				+            self.fp = file
			
 
				+            self.filename = getattr(file, 'name', None)
			
 
				+        self._fileRefCnt = 1
			
 
				+        self._lock = threading.RLock()
			
 
				+        self._seekable = True
			
 
				+        self._writing = False
			
 
				+
			
 
				+        try:
			
 
				+            if mode == 'r':
			
 
				+                self._RealGetContents()
			
 
				+            elif mode in ('w', 'x'):
			
 
				+                # set the modified flag so central directory gets written
			
 
				+                # even if no files are added to the archive
			
 
				+                self._didModify = True
			
 
				+                try:
			
 
				+                    self.start_dir = self.fp.tell()
			
 
				+                except (AttributeError, OSError):
			
 
				+                    # No usable tell(): wrap the file and treat it as unseekable.
			
 
				+                    self.fp = _Tellable(self.fp)
			
 
				+                    self.start_dir = 0
			
 
				+                    self._seekable = False
			
 
				+                else:
			
 
				+                    # Some file-like objects can provide tell() but not seek()
			
 
				+                    try:
			
 
				+                        self.fp.seek(self.start_dir)
			
 
				+                    except (AttributeError, OSError):
			
 
				+                        self._seekable = False
			
 
				+            elif mode == 'a':
			
 
				+                try:
			
 
				+                    # See if file is a zip file
			
 
				+                    self._RealGetContents()
			
 
				+                    # seek to start of directory and overwrite
			
 
				+                    self.fp.seek(self.start_dir)
			
 
				+                except BadZipFile:
			
 
				+                    # file is not a zip file, just append
			
 
				+                    self.fp.seek(0, 2)
			
 
				+
			
 
				+                    # set the modified flag so central directory gets written
			
 
				+                    # even if no files are added to the archive
			
 
				+                    self._didModify = True
			
 
				+                    self.start_dir = self.fp.tell()
			
 
				+            else:
			
 
				+                raise ValueError("Mode must be 'r', 'w', 'x', or 'a'")
			
 
				+        except:
			
 
				+            # Deliberately bare: release the file reference for any error,
			
 
				+            # then re-raise it unchanged.
			
 
				+            fp = self.fp
			
 
				+            self.fp = None
			
 
				+            self._fpclose(fp)
			
 
				+            raise
			
 
				+
			
 
				+    def __enter__(self):
			
 
				+        """Enter a `with` block; returns this ZipFile unchanged."""
			
 
				+        return self
			
 
				+
			
 
				+    def __exit__(self, type, value, traceback):
			
 
				+        """Leave a `with` block by calling close(); exceptions are not suppressed."""
			
 
				+        self.close()
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        """Return '<module.Class file=…|filename=… mode=…>' or '… [closed]>'."""
			
 
				+        result = ['<%s.%s' % (self.__class__.__module__,
			
 
				+                              self.__class__.__qualname__)]
			
 
				+        if self.fp is not None:
			
 
				+            # Show the file object when one was passed in, else the path.
			
 
				+            if self._filePassed:
			
 
				+                result.append(' file=%r' % self.fp)
			
 
				+            elif self.filename is not None:
			
 
				+                result.append(' filename=%r' % self.filename)
			
 
				+            result.append(' mode=%r' % self.mode)
			
 
				+        else:
			
 
				+            result.append(' [closed]')
			
 
				+        result.append('>')
			
 
				+        return ''.join(result)
			
 
				+
			
 
				+    def _RealGetContents(self):
			
 
				+        """Read in the table of contents for the ZIP file.
			
 
				+
			
 
				+        Fills self.filelist and self.NameToInfo with one ZipInfo per central
			
 
				+        directory entry and sets self.start_dir and self._comment.
			
 
				+        Raises BadZipFile for a missing end record or a truncated/invalid
			
 
				+        central directory, and NotImplementedError when an entry needs a
			
 
				+        newer extract version than MAX_EXTRACT_VERSION.
			
 
				+        """
			
 
				+        fp = self.fp
			
 
				+        try:
			
 
				+            endrec = _EndRecData(fp)
			
 
				+        except OSError:
			
 
				+            raise BadZipFile("File is not a zip file")
			
 
				+        if not endrec:
			
 
				+            raise BadZipFile("File is not a zip file")
			
 
				+        if self.debug > 1:
			
 
				+            print(endrec)
			
 
				+        size_cd = endrec[_ECD_SIZE]             # bytes in central directory
			
 
				+        offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
			
 
				+        self._comment = endrec[_ECD_COMMENT]    # archive comment
			
 
				+
			
 
				+        # "concat" is zero, unless zip was concatenated to another file
			
 
				+        concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
			
 
				+        if endrec[_ECD_SIGNATURE] == stringEndArchive64:
			
 
				+            # If Zip64 extension structures are present, account for them
			
 
				+            concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
			
 
				+
			
 
				+        if self.debug > 2:
			
 
				+            inferred = concat + offset_cd
			
 
				+            print("given, inferred, offset", offset_cd, inferred, concat)
			
 
				+        # self.start_dir:  Position of start of central directory
			
 
				+        self.start_dir = offset_cd + concat
			
 
				+        fp.seek(self.start_dir, 0)
			
 
				+        data = fp.read(size_cd)
			
 
				+        # Parse the directory from an in-memory copy; fp is rebound here.
			
 
				+        fp = io.BytesIO(data)
			
 
				+        total = 0
			
 
				+        while total < size_cd:
			
 
				+            centdir = fp.read(sizeCentralDir)
			
 
				+            if len(centdir) != sizeCentralDir:
			
 
				+                raise BadZipFile("Truncated central directory")
			
 
				+            centdir = struct.unpack(structCentralDir, centdir)
			
 
				+            if centdir[_CD_SIGNATURE] != stringCentralDir:
			
 
				+                raise BadZipFile("Bad magic number for central directory")
			
 
				+            if self.debug > 2:
			
 
				+                print(centdir)
			
 
				+            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
			
 
				+            flags = centdir[5]
			
 
				+            if flags & 0x800:
			
 
				+                # UTF-8 file names extension
			
 
				+                filename = filename.decode('utf-8')
			
 
				+            else:
			
 
				+                # Historical ZIP filename encoding
			
 
				+                filename = filename.decode('cp437')
			
 
				+            # Create ZipInfo instance to store file information
			
 
				+            x = ZipInfo(filename, has_changed_name=self.has_changed_name)
			
 
				+            x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
			
 
				+            x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
			
 
				+            x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
			
 
				+            (x.create_version, x.create_system, x.extract_version, x.reserved,
			
 
				+             x.flag_bits, x.compress_type, t, d,
			
 
				+             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
			
 
				+            if x.extract_version > MAX_EXTRACT_VERSION:
			
 
				+                raise NotImplementedError("zip file version %.1f" %
			
 
				+                                          (x.extract_version / 10))
			
 
				+            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
			
 
				+            # Convert date/time code to (year, month, day, hour, min, sec)
			
 
				+            x._raw_time = t
			
 
				+            x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
			
 
				+                            t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
			
 
				+
			
 
				+            x._decodeExtra()
			
 
				+            # Shift the header offset by the size of any prepended data.
			
 
				+            x.header_offset = x.header_offset + concat
			
 
				+            self.filelist.append(x)
			
 
				+            self.NameToInfo[x.filename] = x
			
 
				+
			
 
				+            # update total bytes read from central directory
			
 
				+            total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
			
 
				+                     + centdir[_CD_EXTRA_FIELD_LENGTH]
			
 
				+                     + centdir[_CD_COMMENT_LENGTH])
			
 
				+
			
 
				+            if self.debug > 2:
			
 
				+                print("total", total)
			
 
				+
			
 
				+
			
 
				+    def namelist(self):
			
 
				+        """Return a list of file names in the archive."""
			
 
				+        return [data.filename for data in self.filelist]
			
 
				+
			
 
				+    def infolist(self):
			
 
				+        """Return a list of class ZipInfo instances for files in the
			
 
				+        archive.
			
 
				+
			
 
				+        This is the internal self.filelist object itself, not a copy.
			
 
				+        """
			
 
				+        return self.filelist
			
 
				+
			
 
				+    def printdir(self, file=None):
			
 
				+        """Print a table of contents for the zip file.
			
 
				+
			
 
				+        One header row, then name, modification time and size per member.
			
 
				+        `file` is passed straight to print(); None means standard output.
			
 
				+        """
			
 
				+        print("%-46s %19s %12s" % ("File Name", "Modified    ", "Size"),
			
 
				+              file=file)
			
 
				+        for zinfo in self.filelist:
			
 
				+            date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6]
			
 
				+            print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size),
			
 
				+                  file=file)
			
 
				+
			
 
				+    def testzip(self):
			
 
				+        """Read all the files and check the CRC.
			
 
				+
			
 
				+        Returns the name of the first member whose read raises BadZipFile,
			
 
				+        or None when every member reads cleanly.
			
 
				+        """
			
 
				+        chunk_size = 2 ** 20
			
 
				+        for zinfo in self.filelist:
			
 
				+            try:
			
 
				+                # Read by chunks, to avoid an OverflowError or a
			
 
				+                # MemoryError with very large embedded files.
			
 
				+                with self.open(zinfo.filename, "r") as f:
			
 
				+                    while f.read(chunk_size):     # Check CRC-32
			
 
				+                        pass
			
 
				+            except BadZipFile:
			
 
				+                return zinfo.filename
			
 
				+
			
 
				+    def getinfo(self, name):
			
 
				+        """Return the instance of ZipInfo given 'name'."""
			
 
				+        info = self.NameToInfo.get(name)
			
 
				+        if info is None:
			
 
				+            raise KeyError(
			
 
				+                'There is no item named %r in the archive' % name)
			
 
				+
			
 
				+        return info
			
 
				+
			
 
				+    def setpassword(self, pwd):
			
 
				+        """Set default password for encrypted files."""
			
 
				+        if pwd and not isinstance(pwd, bytes):
			
 
				+            raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
			
 
				+        if pwd:
			
 
				+            self.pwd = pwd
			
 
				+        else:
			
 
				+            self.pwd = None
			
 
				+
			
 
				+    @property
			
 
				+    def comment(self):
			
 
				+        """The comment text associated with the ZIP file (stored as bytes)."""
			
 
				+        return self._comment
			
 
				+
			
 
				+    @comment.setter
			
 
				+    def comment(self, comment):
			
 
				+        # Set the archive comment. Raises TypeError for non-bytes input;
			
 
				+        # an over-long comment is truncated with a warning, not rejected.
			
 
				+        if not isinstance(comment, bytes):
			
 
				+            raise TypeError("comment: expected bytes, got %s" % type(comment).__name__)
			
 
				+        # check for valid comment length
			
 
				+        if len(comment) > ZIP_MAX_COMMENT:
			
 
				+            import warnings
			
 
				+            warnings.warn('Archive comment is too long; truncating to %d bytes'
			
 
				+                          % ZIP_MAX_COMMENT, stacklevel=2)
			
 
				+            comment = comment[:ZIP_MAX_COMMENT]
			
 
				+        self._comment = comment
			
 
				+        # Mark the archive as modified.
			
 
				+        self._didModify = True
			
 
				+
			
 
				+    def read(self, name, pwd=None):
			
 
				+        """Return file bytes for name.
			
 
				+
			
 
				+        Opens the member for reading via open(name, "r", pwd) and reads it
			
 
				+        to the end; errors raised by open() propagate unchanged.
			
 
				+        """
			
 
				+        with self.open(name, "r", pwd) as fp:
			
 
				+            return fp.read()
			
 
				+
			
 
				+    def open(self, name, mode="r", pwd=None, *, force_zip64=False):
			
 
				+        """Return file-like object for 'name'.
			
 
				+
			
 
				+        name is a string for the file name within the ZIP file, or a ZipInfo
			
 
				+        object.
			
 
				+
			
 
				+        mode should be 'r' to read a file already in the ZIP file, or 'w' to
			
 
				+        write to a file newly added to the archive.
			
 
				+
			
 
				+        pwd is the password to decrypt files (only used for reading).
			
 
				+
			
 
				+        When writing, if the file size is not known in advance but may exceed
			
 
				+        2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large
			
 
				+        files.  If the size is known in advance, it is best to pass a ZipInfo
			
 
				+        instance for name, with zinfo.file_size set.
			
 
				+        """
			
 
				+        if mode not in {"r", "w"}:
			
 
				+            raise ValueError('open() requires mode "r" or "w"')
			
 
				+        if pwd and not isinstance(pwd, bytes):
			
 
				+            raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
			
 
				+        if pwd and (mode == "w"):
			
 
				+            raise ValueError("pwd is only supported for reading files")
			
 
				+        if not self.fp:
			
 
				+            raise ValueError(
			
 
				+                "Attempt to use ZIP archive that was already closed")
			
 
				+
			
 
				+        # Make sure we have an info object
			
 
				+        if isinstance(name, ZipInfo):
			
 
				+            # 'name' is already an info object
			
 
				+            zinfo = name
			
 
				+        elif mode == 'w':
			
 
				+            zinfo = ZipInfo(name)
			
 
				+            zinfo.compress_type = self.compression
			
 
				+            zinfo._compresslevel = self.compresslevel
			
 
				+        else:
			
 
				+            # Get info object for name
			
 
				+            zinfo = self.getinfo(name)
			
 
				+
			
 
				+        if mode == 'w':
			
 
				+            return self._open_to_write(zinfo, force_zip64=force_zip64)
			
 
				+
			
 
				+        if self._writing:
			
 
				+            raise ValueError("Can't read from the ZIP file while there "
			
 
				+                    "is an open writing handle on it. "
			
 
				+                    "Close the writing handle before trying to read.")
			
 
				+
			
 
				+        # Open for reading:
			
 
				+        self._fileRefCnt += 1
			
 
				+        zef_file = _SharedFile(self.fp, zinfo.header_offset,
			
 
				+                               self._fpclose, self._lock, lambda: self._writing)
			
 
				+        try:
			
 
				+            # Skip the file header:
			
 
				+            fheader = zef_file.read(sizeFileHeader)
			
 
				+            if len(fheader) != sizeFileHeader:
			
 
				+                raise BadZipFile("Truncated file header")
			
 
				+            fheader = struct.unpack(structFileHeader, fheader)
			
 
				+            if fheader[_FH_SIGNATURE] != stringFileHeader:
			
 
				+                raise BadZipFile("Bad magic number for file header")
			
 
				+
			
 
				+            fname = zef_file.read(fheader[_FH_FILENAME_LENGTH])
			
 
				+            if fheader[_FH_EXTRA_FIELD_LENGTH]:
			
 
				+                zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
			
 
				+
			
 
				+            if zinfo.flag_bits & 0x20:
			
 
				+                # Zip 2.7: compressed patched data
			
 
				+                raise NotImplementedError("compressed patched data (flag bit 5)")
			
 
				+
			
 
				+            if zinfo.flag_bits & 0x40:
			
 
				+                # strong encryption
			
 
				+                raise NotImplementedError("strong encryption (flag bit 6)")
			
 
				+
			
 
				+            if zinfo.flag_bits & 0x800:
			
 
				+                # UTF-8 filename
			
 
				+                fname_str = fname.decode("utf-8")
			
 
				+            else:
			
 
				+                fname_str = fname.decode("cp437")
			
 
				+
			
 
				+            print('zinfo.has_changed_name', zinfo.has_changed_name)
			
 
				+            if not zinfo.has_changed_name:
			
 
				+                if fname_str != zinfo.orig_filename:
			
 
				+                    raise BadZipFile(
			
 
				+                        'File name in directory %r and header %r differ.'
			
 
				+                        % (zinfo.orig_filename, fname))
			
 
				+
			
 
				+            # check for encrypted flag & handle password
			
 
				+            is_encrypted = zinfo.flag_bits & 0x1
			
 
				+            if is_encrypted:
			
 
				+                if not pwd:
			
 
				+                    pwd = self.pwd
			
 
				+                if not pwd:
			
 
				+                    raise RuntimeError("File %r is encrypted, password "
			
 
				+                                       "required for extraction" % name)
			
 
				+            else:
			
 
				+                pwd = None
			
 
				+
			
 
				+            return ZipExtFile(zef_file, mode, zinfo, pwd, True)
			
 
				+        except:
			
 
				+            zef_file.close()
			
 
				+            raise
			
 
				+
			
 
				+    def _open_to_write(self, zinfo, force_zip64=False):
			
 
				+        """Write the local header for zinfo and return a _ZipWriteFile for its data.
			
 
				+
			
 
				+        Raises ValueError if force_zip64 is set while allowZip64 is off, or
			
 
				+        if another write handle is already open.
			
 
				+        """
			
 
				+        if force_zip64 and not self._allowZip64:
			
 
				+            raise ValueError(
			
 
				+                "force_zip64 is True, but allowZip64 was False when opening "
			
 
				+                "the ZIP file."
			
 
				+            )
			
 
				+        if self._writing:
			
 
				+            raise ValueError("Can't write to the ZIP file while there is "
			
 
				+                             "another write handle open on it. "
			
 
				+                             "Close the first handle before opening another.")
			
 
				+
			
 
				+        # Sizes and CRC are overwritten with correct data after processing the file
			
 
				+        if not hasattr(zinfo, 'file_size'):
			
 
				+            zinfo.file_size = 0
			
 
				+        zinfo.compress_size = 0
			
 
				+        zinfo.CRC = 0
			
 
				+
			
 
				+        # Any flag bits already set on zinfo are discarded here.
			
 
				+        zinfo.flag_bits = 0x00
			
 
				+        if zinfo.compress_type == ZIP_LZMA:
			
 
				+            # Compressed data includes an end-of-stream (EOS) marker
			
 
				+            zinfo.flag_bits |= 0x02
			
 
				+        if not self._seekable:
			
 
				+            # Bit 3: _ZipWriteFile.close() then writes CRC and sizes after
			
 
				+            # the data instead of seeking back to rewrite the header.
			
 
				+            zinfo.flag_bits |= 0x08
			
 
				+
			
 
				+        if not zinfo.external_attr:
			
 
				+            zinfo.external_attr = 0o600 << 16  # permissions: ?rw-------
			
 
				+
			
 
				+        # Compressed size can be larger than uncompressed size
			
 
				+        zip64 = self._allowZip64 and \
			
 
				+                (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT)
			
 
				+
			
 
				+        if self._seekable:
			
 
				+            self.fp.seek(self.start_dir)
			
 
				+        zinfo.header_offset = self.fp.tell()
			
 
				+
			
 
				+        self._writecheck(zinfo)
			
 
				+        self._didModify = True
			
 
				+
			
 
				+        self.fp.write(zinfo.FileHeader(zip64))
			
 
				+
			
 
				+        # Cleared again in _ZipWriteFile.close().
			
 
				+        self._writing = True
			
 
				+        return _ZipWriteFile(self, zinfo, zip64)
			
 
				+
			
 
				+    def extract(self, member, path=None, pwd=None):
			
 
				+        """Extract a member from the archive to the current working directory,
			
 
				+           using its full name. Its file information is extracted as accurately
			
 
				+           as possible. `member' may be a filename or a ZipInfo object. You can
			
 
				+           specify a different directory using `path'.
			
 
				+        """
			
 
				+        if path is None:
			
 
				+            path = os.getcwd()
			
 
				+        else:
			
 
				+            path = os.fspath(path)
			
 
				+
			
 
				+        return self._extract_member(member, path, pwd)
			
 
				+
			
 
				+    def extractall(self, path=None, members=None, pwd=None):
			
 
				+        """Extract all members from the archive to the current working
			
 
				+           directory. `path' specifies a different directory to extract to.
			
 
				+           `members' is optional and must be a subset of the list returned
			
 
				+           by namelist().
			
 
				+        """
			
 
				+        if members is None:
			
 
				+            members = self.namelist()
			
 
				+
			
 
				+        if path is None:
			
 
				+            path = os.getcwd()
			
 
				+        else:
			
 
				+            path = os.fspath(path)
			
 
				+
			
 
				+        for zipinfo in members:
			
 
				+            self._extract_member(zipinfo, path, pwd)
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _sanitize_windows_name(cls, arcname, pathsep):
			
 
				+        """Replace bad characters and remove trailing dots from parts."""
			
 
				+        table = cls._windows_illegal_name_trans_table
			
 
				+        if not table:
			
 
				+            illegal = ':<>|"?*'
			
 
				+            table = str.maketrans(illegal, '_' * len(illegal))
			
 
				+            cls._windows_illegal_name_trans_table = table
			
 
				+        arcname = arcname.translate(table)
			
 
				+        # remove trailing dots
			
 
				+        arcname = (x.rstrip('.') for x in arcname.split(pathsep))
			
 
				+        # rejoin, removing empty parts.
			
 
				+        arcname = pathsep.join(x for x in arcname if x)
			
 
				+        return arcname
			
 
				+
			
 
				+    def _extract_member(self, member, targetpath, pwd):
			
 
				+        """Extract the ZipInfo object 'member' to a physical
			
 
				+           file on the path targetpath.
			
 
				+        """
			
 
				+        if not isinstance(member, ZipInfo):
			
 
				+            member = self.getinfo(member)
			
 
				+
			
 
				+        # build the destination pathname, replacing
			
 
				+        # forward slashes to platform specific separators.
			
 
				+        arcname = member.filename.replace('/', os.path.sep)
			
 
				+
			
 
				+        if os.path.altsep:
			
 
				+            arcname = arcname.replace(os.path.altsep, os.path.sep)
			
 
				+        # interpret absolute pathname as relative, remove drive letter or
			
 
				+        # UNC path, redundant separators, "." and ".." components.
			
 
				+        arcname = os.path.splitdrive(arcname)[1]
			
 
				+        invalid_path_parts = ('', os.path.curdir, os.path.pardir)
			
 
				+        arcname = os.path.sep.join(x for x in arcname.split(os.path.sep)
			
 
				+                                   if x not in invalid_path_parts)
			
 
				+        if os.path.sep == '\\':
			
 
				+            # filter illegal characters on Windows
			
 
				+            arcname = self._sanitize_windows_name(arcname, os.path.sep)
			
 
				+
			
 
				+        targetpath = os.path.join(targetpath, arcname)
			
 
				+        targetpath = os.path.normpath(targetpath)
			
 
				+
			
 
				+        # Create all upper directories if necessary.
			
 
				+        upperdirs = os.path.dirname(targetpath)
			
 
				+        if upperdirs and not os.path.exists(upperdirs):
			
 
				+            os.makedirs(upperdirs)
			
 
				+
			
 
				+        if member.is_dir():
			
 
				+            if not os.path.isdir(targetpath):
			
 
				+                os.mkdir(targetpath)
			
 
				+            return targetpath
			
 
				+
			
 
				+        with self.open(member, pwd=pwd) as source, \
			
 
				+             open(targetpath, "wb") as target:
			
 
				+            shutil.copyfileobj(source, target)
			
 
				+
			
 
				+        return targetpath
			
 
				+
			
 
				+    def _writecheck(self, zinfo):
			
 
				+        """Check for errors before writing a file to the archive."""
			
 
				+        if zinfo.filename in self.NameToInfo:
			
 
				+            import warnings
			
 
				+            warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3)
			
 
				+        if self.mode not in ('w', 'x', 'a'):
			
 
				+            raise ValueError("write() requires mode 'w', 'x', or 'a'")
			
 
				+        if not self.fp:
			
 
				+            raise ValueError(
			
 
				+                "Attempt to write ZIP archive that was already closed")
			
 
				+        _check_compression(zinfo.compress_type)
			
 
				+        if not self._allowZip64:
			
 
				+            requires_zip64 = None
			
 
				+            if len(self.filelist) >= ZIP_FILECOUNT_LIMIT:
			
 
				+                requires_zip64 = "Files count"
			
 
				+            elif zinfo.file_size > ZIP64_LIMIT:
			
 
				+                requires_zip64 = "Filesize"
			
 
				+            elif zinfo.header_offset > ZIP64_LIMIT:
			
 
				+                requires_zip64 = "Zipfile size"
			
 
				+            if requires_zip64:
			
 
				+                raise LargeZipFile(requires_zip64 +
			
 
				+                                   " would require ZIP64 extensions")
			
 
				+
			
 
				+    def write(self, filename, arcname=None,
			
 
				+              compress_type=None, compresslevel=None):
			
 
				+        """Put the bytes from filename into the archive under the name
			
 
				+        arcname."""
			
 
				+        if not self.fp:
			
 
				+            raise ValueError(
			
 
				+                "Attempt to write to ZIP archive that was already closed")
			
 
				+        if self._writing:
			
 
				+            raise ValueError(
			
 
				+                "Can't write to ZIP archive while an open writing handle exists"
			
 
				+            )
			
 
				+
			
 
				+        zinfo = ZipInfo.from_file(filename, arcname)
			
 
				+
			
 
				+        if zinfo.is_dir():
			
 
				+            zinfo.compress_size = 0
			
 
				+            zinfo.CRC = 0
			
 
				+        else:
			
 
				+            if compress_type is not None:
			
 
				+                zinfo.compress_type = compress_type
			
 
				+            else:
			
 
				+                zinfo.compress_type = self.compression
			
 
				+
			
 
				+            if compresslevel is not None:
			
 
				+                zinfo._compresslevel = compresslevel
			
 
				+            else:
			
 
				+                zinfo._compresslevel = self.compresslevel
			
 
				+
			
 
				+        if zinfo.is_dir():
			
 
				+            with self._lock:
			
 
				+                if self._seekable:
			
 
				+                    self.fp.seek(self.start_dir)
			
 
				+                zinfo.header_offset = self.fp.tell()  # Start of header bytes
			
 
				+                if zinfo.compress_type == ZIP_LZMA:
			
 
				+                # Compressed data includes an end-of-stream (EOS) marker
			
 
				+                    zinfo.flag_bits |= 0x02
			
 
				+
			
 
				+                self._writecheck(zinfo)
			
 
				+                self._didModify = True
			
 
				+
			
 
				+                self.filelist.append(zinfo)
			
 
				+                self.NameToInfo[zinfo.filename] = zinfo
			
 
				+                self.fp.write(zinfo.FileHeader(False))
			
 
				+                self.start_dir = self.fp.tell()
			
 
				+        else:
			
 
				+            with open(filename, "rb") as src, self.open(zinfo, 'w') as dest:
			
 
				+                shutil.copyfileobj(src, dest, 1024*8)
			
 
				+
			
 
				+    def writestr(self, zinfo_or_arcname, data,
			
 
				+                 compress_type=None, compresslevel=None):
			
 
				+        """Write a file into the archive.  The contents is 'data', which
			
 
				+        may be either a 'str' or a 'bytes' instance; if it is a 'str',
			
 
				+        it is encoded as UTF-8 first.
			
 
				+        'zinfo_or_arcname' is either a ZipInfo instance or
			
 
				+        the name of the file in the archive."""
			
 
				+        if isinstance(data, str):
			
 
				+            data = data.encode("utf-8")
			
 
				+        if not isinstance(zinfo_or_arcname, ZipInfo):
			
 
				+            zinfo = ZipInfo(filename=zinfo_or_arcname,
			
 
				+                            date_time=time.localtime(time.time())[:6])
			
 
				+            zinfo.compress_type = self.compression
			
 
				+            zinfo._compresslevel = self.compresslevel
			
 
				+            if zinfo.filename[-1] == '/':
			
 
				+                zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
			
 
				+                zinfo.external_attr |= 0x10           # MS-DOS directory flag
			
 
				+            else:
			
 
				+                zinfo.external_attr = 0o600 << 16     # ?rw-------
			
 
				+        else:
			
 
				+            zinfo = zinfo_or_arcname
			
 
				+
			
 
				+        if not self.fp:
			
 
				+            raise ValueError(
			
 
				+                "Attempt to write to ZIP archive that was already closed")
			
 
				+        if self._writing:
			
 
				+            raise ValueError(
			
 
				+                "Can't write to ZIP archive while an open writing handle exists."
			
 
				+            )
			
 
				+
			
 
				+        if compress_type is not None:
			
 
				+            zinfo.compress_type = compress_type
			
 
				+
			
 
				+        if compresslevel is not None:
			
 
				+            zinfo._compresslevel = compresslevel
			
 
				+
			
 
				+        zinfo.file_size = len(data)            # Uncompressed size
			
 
				+        with self._lock:
			
 
				+            with self.open(zinfo, mode='w') as dest:
			
 
				+                dest.write(data)
			
 
				+
			
 
				+    def __del__(self):
			
 
				+        """Call the "close()" method in case the user forgot."""
			
 
				+        self.close()
			
 
				+
			
 
				+    def close(self):
			
 
				+        """Close the file, and for mode 'w', 'x' and 'a' write the ending
			
 
				+        records."""
			
 
				+        if self.fp is None:
			
 
				+            return
			
 
				+
			
 
				+        if self._writing:
			
 
				+            raise ValueError("Can't close the ZIP file while there is "
			
 
				+                             "an open writing handle on it. "
			
 
				+                             "Close the writing handle before closing the zip.")
			
 
				+
			
 
				+        try:
			
 
				+            if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records
			
 
				+                with self._lock:
			
 
				+                    if self._seekable:
			
 
				+                        self.fp.seek(self.start_dir)
			
 
				+                    self._write_end_record()
			
 
				+        finally:
			
 
				+            fp = self.fp
			
 
				+            self.fp = None
			
 
				+            self._fpclose(fp)
			
 
				+
			
 
				+    def _write_end_record(self):
			
 
				+        for zinfo in self.filelist:         # write central directory
			
 
				+            dt = zinfo.date_time
			
 
				+            dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
			
 
				+            dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
			
 
				+            extra = []
			
 
				+            if zinfo.file_size > ZIP64_LIMIT \
			
 
				+               or zinfo.compress_size > ZIP64_LIMIT:
			
 
				+                extra.append(zinfo.file_size)
			
 
				+                extra.append(zinfo.compress_size)
			
 
				+                file_size = 0xffffffff
			
 
				+                compress_size = 0xffffffff
			
 
				+            else:
			
 
				+                file_size = zinfo.file_size
			
 
				+                compress_size = zinfo.compress_size
			
 
				+
			
 
				+            if zinfo.header_offset > ZIP64_LIMIT:
			
 
				+                extra.append(zinfo.header_offset)
			
 
				+                header_offset = 0xffffffff
			
 
				+            else:
			
 
				+                header_offset = zinfo.header_offset
			
 
				+
			
 
				+            extra_data = zinfo.extra
			
 
				+            min_version = 0
			
 
				+            if extra:
			
 
				+                # Append a ZIP64 field to the extra's
			
 
				+                extra_data = _strip_extra(extra_data, (1,))
			
 
				+                extra_data = struct.pack(
			
 
				+                    '<HH' + 'Q'*len(extra),
			
 
				+                    1, 8*len(extra), *extra) + extra_data
			
 
				+
			
 
				+                min_version = ZIP64_VERSION
			
 
				+
			
 
				+            if zinfo.compress_type == ZIP_BZIP2:
			
 
				+                min_version = max(BZIP2_VERSION, min_version)
			
 
				+            elif zinfo.compress_type == ZIP_LZMA:
			
 
				+                min_version = max(LZMA_VERSION, min_version)
			
 
				+
			
 
				+            extract_version = max(min_version, zinfo.extract_version)
			
 
				+            create_version = max(min_version, zinfo.create_version)
			
 
				+            try:
			
 
				+                filename, flag_bits = zinfo._encodeFilenameFlags()
			
 
				+                centdir = struct.pack(structCentralDir,
			
 
				+                                      stringCentralDir, create_version,
			
 
				+                                      zinfo.create_system, extract_version, zinfo.reserved,
			
 
				+                                      flag_bits, zinfo.compress_type, dostime, dosdate,
			
 
				+                                      zinfo.CRC, compress_size, file_size,
			
 
				+                                      len(filename), len(extra_data), len(zinfo.comment),
			
 
				+                                      0, zinfo.internal_attr, zinfo.external_attr,
			
 
				+                                      header_offset)
			
 
				+            except DeprecationWarning:
			
 
				+                print((structCentralDir, stringCentralDir, create_version,
			
 
				+                       zinfo.create_system, extract_version, zinfo.reserved,
			
 
				+                       zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
			
 
				+                       zinfo.CRC, compress_size, file_size,
			
 
				+                       len(zinfo.filename), len(extra_data), len(zinfo.comment),
			
 
				+                       0, zinfo.internal_attr, zinfo.external_attr,
			
 
				+                       header_offset), file=sys.stderr)
			
 
				+                raise
			
 
				+            self.fp.write(centdir)
			
 
				+            self.fp.write(filename)
			
 
				+            self.fp.write(extra_data)
			
 
				+            self.fp.write(zinfo.comment)
			
 
				+
			
 
				+        pos2 = self.fp.tell()
			
 
				+        # Write end-of-zip-archive record
			
 
				+        centDirCount = len(self.filelist)
			
 
				+        centDirSize = pos2 - self.start_dir
			
 
				+        centDirOffset = self.start_dir
			
 
				+        requires_zip64 = None
			
 
				+        if centDirCount > ZIP_FILECOUNT_LIMIT:
			
 
				+            requires_zip64 = "Files count"
			
 
				+        elif centDirOffset > ZIP64_LIMIT:
			
 
				+            requires_zip64 = "Central directory offset"
			
 
				+        elif centDirSize > ZIP64_LIMIT:
			
 
				+            requires_zip64 = "Central directory size"
			
 
				+        if requires_zip64:
			
 
				+            # Need to write the ZIP64 end-of-archive records
			
 
				+            if not self._allowZip64:
			
 
				+                raise LargeZipFile(requires_zip64 +
			
 
				+                                   " would require ZIP64 extensions")
			
 
				+            zip64endrec = struct.pack(
			
 
				+                structEndArchive64, stringEndArchive64,
			
 
				+                44, 45, 45, 0, 0, centDirCount, centDirCount,
			
 
				+                centDirSize, centDirOffset)
			
 
				+            self.fp.write(zip64endrec)
			
 
				+
			
 
				+            zip64locrec = struct.pack(
			
 
				+                structEndArchive64Locator,
			
 
				+                stringEndArchive64Locator, 0, pos2, 1)
			
 
				+            self.fp.write(zip64locrec)
			
 
				+            centDirCount = min(centDirCount, 0xFFFF)
			
 
				+            centDirSize = min(centDirSize, 0xFFFFFFFF)
			
 
				+            centDirOffset = min(centDirOffset, 0xFFFFFFFF)
			
 
				+
			
 
				+        endrec = struct.pack(structEndArchive, stringEndArchive,
			
 
				+                             0, 0, centDirCount, centDirCount,
			
 
				+                             centDirSize, centDirOffset, len(self._comment))
			
 
				+        self.fp.write(endrec)
			
 
				+        self.fp.write(self._comment)
			
 
				+        self.fp.flush()
			
 
				+
			
 
				+    def _fpclose(self, fp):
			
 
				+        assert self._fileRefCnt > 0
			
 
				+        self._fileRefCnt -= 1
			
 
				+        if not self._fileRefCnt and not self._filePassed:
			
 
				+            fp.close()
			
 
				+
			
 
				+
			
 
				+class PyZipFile(ZipFile):
			
 
				+    """Class to create ZIP archives with Python library files and packages."""
			
 
				+
			
 
				+    def __init__(self, file, mode="r", compression=ZIP_STORED,
			
 
				+                 allowZip64=True, optimize=-1):
			
 
				+        ZipFile.__init__(self, file, mode=mode, compression=compression,
			
 
				+                         allowZip64=allowZip64)
			
 
				+        self._optimize = optimize
			
 
				+
			
 
				+    def writepy(self, pathname, basename="", filterfunc=None):
			
 
				+        """Add all files from "pathname" to the ZIP archive.
			
 
				+
			
 
				+        If pathname is a package directory, search the directory and
			
 
				+        all package subdirectories recursively for all *.py and enter
			
 
				+        the modules into the archive.  If pathname is a plain
			
 
				+        directory, listdir *.py and enter all modules.  Else, pathname
			
 
				+        must be a Python *.py file and the module will be put into the
			
 
				+        archive.  Added modules are always module.pyc.
			
 
				+        This method will compile the module.py into module.pyc if
			
 
				+        necessary.
			
 
				+        If filterfunc(pathname) is given, it is called with every argument.
			
 
				+        When it is False, the file or directory is skipped.
			
 
				+        """
			
 
				+        pathname = os.fspath(pathname)
			
 
				+        if filterfunc and not filterfunc(pathname):
			
 
				+            if self.debug:
			
 
				+                label = 'path' if os.path.isdir(pathname) else 'file'
			
 
				+                print('%s %r skipped by filterfunc' % (label, pathname))
			
 
				+            return
			
 
				+        dir, name = os.path.split(pathname)
			
 
				+        if os.path.isdir(pathname):
			
 
				+            initname = os.path.join(pathname, "__init__.py")
			
 
				+            if os.path.isfile(initname):
			
 
				+                # This is a package directory, add it
			
 
				+                if basename:
			
 
				+                    basename = "%s/%s" % (basename, name)
			
 
				+                else:
			
 
				+                    basename = name
			
 
				+                if self.debug:
			
 
				+                    print("Adding package in", pathname, "as", basename)
			
 
				+                fname, arcname = self._get_codename(initname[0:-3], basename)
			
 
				+                if self.debug:
			
 
				+                    print("Adding", arcname)
			
 
				+                self.write(fname, arcname)
			
 
				+                dirlist = sorted(os.listdir(pathname))
			
 
				+                dirlist.remove("__init__.py")
			
 
				+                # Add all *.py files and package subdirectories
			
 
				+                for filename in dirlist:
			
 
				+                    path = os.path.join(pathname, filename)
			
 
				+                    root, ext = os.path.splitext(filename)
			
 
				+                    if os.path.isdir(path):
			
 
				+                        if os.path.isfile(os.path.join(path, "__init__.py")):
			
 
				+                            # This is a package directory, add it
			
 
				+                            self.writepy(path, basename,
			
 
				+                                         filterfunc=filterfunc)  # Recursive call
			
 
				+                    elif ext == ".py":
			
 
				+                        if filterfunc and not filterfunc(path):
			
 
				+                            if self.debug:
			
 
				+                                print('file %r skipped by filterfunc' % path)
			
 
				+                            continue
			
 
				+                        fname, arcname = self._get_codename(path[0:-3],
			
 
				+                                                            basename)
			
 
				+                        if self.debug:
			
 
				+                            print("Adding", arcname)
			
 
				+                        self.write(fname, arcname)
			
 
				+            else:
			
 
				+                # This is NOT a package directory, add its files at top level
			
 
				+                if self.debug:
			
 
				+                    print("Adding files from directory", pathname)
			
 
				+                for filename in sorted(os.listdir(pathname)):
			
 
				+                    path = os.path.join(pathname, filename)
			
 
				+                    root, ext = os.path.splitext(filename)
			
 
				+                    if ext == ".py":
			
 
				+                        if filterfunc and not filterfunc(path):
			
 
				+                            if self.debug:
			
 
				+                                print('file %r skipped by filterfunc' % path)
			
 
				+                            continue
			
 
				+                        fname, arcname = self._get_codename(path[0:-3],
			
 
				+                                                            basename)
			
 
				+                        if self.debug:
			
 
				+                            print("Adding", arcname)
			
 
				+                        self.write(fname, arcname)
			
 
				+        else:
			
 
				+            if pathname[-3:] != ".py":
			
 
				+                raise RuntimeError(
			
 
				+                    'Files added with writepy() must end with ".py"')
			
 
				+            fname, arcname = self._get_codename(pathname[0:-3], basename)
			
 
				+            if self.debug:
			
 
				+                print("Adding file", arcname)
			
 
				+            self.write(fname, arcname)
			
 
				+
			
 
				+    def _get_codename(self, pathname, basename):
			
 
				+        """Return (filename, archivename) for the path.
			
 
				+
			
 
				+        Given a module name path, return the correct file path and
			
 
				+        archive name, compiling if necessary.  For example, given
			
 
				+        /python/lib/string, return (/python/lib/string.pyc, string).
			
 
				+        """
			
 
				+        def _compile(file, optimize=-1):
			
 
				+            import py_compile
			
 
				+            if self.debug:
			
 
				+                print("Compiling", file)
			
 
				+            try:
			
 
				+                py_compile.compile(file, doraise=True, optimize=optimize)
			
 
				+            except py_compile.PyCompileError as err:
			
 
				+                print(err.msg)
			
 
				+                return False
			
 
				+            return True
			
 
				+
			
 
				+        file_py  = pathname + ".py"
			
 
				+        file_pyc = pathname + ".pyc"
			
 
				+        pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='')
			
 
				+        pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1)
			
 
				+        pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2)
			
 
				+        if self._optimize == -1:
			
 
				+            # legacy mode: use whatever file is present
			
 
				+            if (os.path.isfile(file_pyc) and
			
 
				+                  os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime):
			
 
				+                # Use .pyc file.
			
 
				+                arcname = fname = file_pyc
			
 
				+            elif (os.path.isfile(pycache_opt0) and
			
 
				+                  os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime):
			
 
				+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
			
 
				+                # file name in the archive.
			
 
				+                fname = pycache_opt0
			
 
				+                arcname = file_pyc
			
 
				+            elif (os.path.isfile(pycache_opt1) and
			
 
				+                  os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime):
			
 
				+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
			
 
				+                # file name in the archive.
			
 
				+                fname = pycache_opt1
			
 
				+                arcname = file_pyc
			
 
				+            elif (os.path.isfile(pycache_opt2) and
			
 
				+                  os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime):
			
 
				+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
			
 
				+                # file name in the archive.
			
 
				+                fname = pycache_opt2
			
 
				+                arcname = file_pyc
			
 
				+            else:
			
 
				+                # Compile py into PEP 3147 pyc file.
			
 
				+                if _compile(file_py):
			
 
				+                    if sys.flags.optimize == 0:
			
 
				+                        fname = pycache_opt0
			
 
				+                    elif sys.flags.optimize == 1:
			
 
				+                        fname = pycache_opt1
			
 
				+                    else:
			
 
				+                        fname = pycache_opt2
			
 
				+                    arcname = file_pyc
			
 
				+                else:
			
 
				+                    fname = arcname = file_py
			
 
				+        else:
			
 
				+            # new mode: use given optimization level
			
 
				+            if self._optimize == 0:
			
 
				+                fname = pycache_opt0
			
 
				+                arcname = file_pyc
			
 
				+            else:
			
 
				+                arcname = file_pyc
			
 
				+                if self._optimize == 1:
			
 
				+                    fname = pycache_opt1
			
 
				+                elif self._optimize == 2:
			
 
				+                    fname = pycache_opt2
			
 
				+                else:
			
 
				+                    msg = "invalid value for 'optimize': {!r}".format(self._optimize)
			
 
				+                    raise ValueError(msg)
			
 
				+            if not (os.path.isfile(fname) and
			
 
				+                    os.stat(fname).st_mtime >= os.stat(file_py).st_mtime):
			
 
				+                if not _compile(file_py, optimize=self._optimize):
			
 
				+                    fname = arcname = file_py
			
 
				+        archivename = os.path.split(arcname)[1]
			
 
				+        if basename:
			
 
				+            archivename = "%s/%s" % (basename, archivename)
			
 
				+        return (fname, archivename)
			
 
				+
			
 
				+
			
 
				+def main(args=None):
			
 
				+    import argparse
			
 
				+
			
 
				+    description = 'A simple command-line interface for zipfile module.'
			
 
				+    parser = argparse.ArgumentParser(description=description)
			
 
				+    group = parser.add_mutually_exclusive_group(required=True)
			
 
				+    group.add_argument('-l', '--list', metavar='<zipfile>',
			
 
				+                       help='Show listing of a zipfile')
			
 
				+    group.add_argument('-e', '--extract', nargs=2,
			
 
				+                       metavar=('<zipfile>', '<output_dir>'),
			
 
				+                       help='Extract zipfile into target dir')
			
 
				+    group.add_argument('-c', '--create', nargs='+',
			
 
				+                       metavar=('<name>', '<file>'),
			
 
				+                       help='Create zipfile from sources')
			
 
				+    group.add_argument('-t', '--test', metavar='<zipfile>',
			
 
				+                       help='Test if a zipfile is valid')
			
 
				+    args = parser.parse_args(args)
			
 
				+
			
 
				+    if args.test is not None:
			
 
				+        src = args.test
			
 
				+        with ZipFile(src, 'r') as zf:
			
 
				+            badfile = zf.testzip()
			
 
				+        if badfile:
			
 
				+            print("The following enclosed file is corrupted: {!r}".format(badfile))
			
 
				+        print("Done testing")
			
 
				+
			
 
				+    elif args.list is not None:
			
 
				+        src = args.list
			
 
				+        with ZipFile(src, 'r') as zf:
			
 
				+            zf.printdir()
			
 
				+
			
 
				+    elif args.extract is not None:
			
 
				+        src, curdir = args.extract
			
 
				+        with ZipFile(src, 'r') as zf:
			
 
				+            zf.extractall(curdir)
			
 
				+
			
 
				+    elif args.create is not None:
			
 
				+        zip_name = args.create.pop(0)
			
 
				+        files = args.create
			
 
				+
			
 
				+        def addToZip(zf, path, zippath):
			
 
				+            if os.path.isfile(path):
			
 
				+                zf.write(path, zippath, ZIP_DEFLATED)
			
 
				+            elif os.path.isdir(path):
			
 
				+                if zippath:
			
 
				+                    zf.write(path, zippath)
			
 
				+                for nm in sorted(os.listdir(path)):
			
 
				+                    addToZip(zf,
			
 
				+                             os.path.join(path, nm), os.path.join(zippath, nm))
			
 
				+            # else: ignore
			
 
				+
			
 
				+        with ZipFile(zip_name, 'w') as zf:
			
 
				+            for path in files:
			
 
				+                zippath = os.path.basename(path)
			
 
				+                if not zippath:
			
 
				+                    zippath = os.path.basename(os.path.dirname(path))
			
 
				+                if zippath in ('', os.curdir, os.pardir):
			
 
				+                    zippath = ''
			
 
				+                addToZip(zf, path, zippath)
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/otr/otr_interface.py
+++ b/otr/otr_interface.py
@@ -411,40 +411,6 @@ if __name__ == '__main__':
 
				     else:
			
 
				         port = 18000
			
 
				         using_gpu_index = 0
			
 
				-    _global._init()
			
 
				-    _global.update({"port": str(port)})
			
 
				-    globals().update({"port": str(port)})
			
 
				-
			
 
				-    # 日志格式设置
			
 
				-    # ip = get_intranet_ip()
			
 
				-    # logging.basicConfig(level=logging.INFO,
			
 
				-    #                     format='%(asctime)s - %(name)s - %(levelname)s - '
			
 
				-    #                            + ip + ' - ' + str(port) + ' - %(message)s')
			
 
				-    logging.info(get_platform())
			
 
				-    # 限制tensorflow显存
			
 
				-    # os.environ['CUDA_VISIBLE_DEVICES'] = str(using_gpu_index)
			
 
				-    # import tensorflow as tf
			
 
				-    # if get_platform() != "Windows":
			
 
				-    #     _version = tf.__version__
			
 
				-    #     logging.info(str(_version))
			
 
				-    #     memory_limit_scale = 0.3
			
 
				-    #     # tensorflow 1.x
			
 
				-    #     if str(_version)[0] == "1":
			
 
				-    #         logging.info("1.x " + str(_version))
			
 
				-    #         os.environ['CUDA_CACHE_MAXSIZE'] = str(2147483648)
			
 
				-    #         os.environ['CUDA_CACHE_DISABLE'] = str(0)
			
 
				-    #         gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=memory_limit_scale)
			
 
				-    #         sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
			
 
				-    #
			
 
				-    #     # tensorflow 2.x
			
 
				-    #     elif str(_version)[0] == "2":
			
 
				-    #         logging.info("2.x " + str(_version))
			
 
				-            # config = tf.compat.v1.ConfigProto()
			
 
				-            # config.gpu_options.per_process_gpu_memory_fraction = memory_limit_scale
			
 
				-            # config.gpu_options.allow_growth = True
			
 
				-            # sess = tf.compat.v1.Session(config=config)
			
 
				-
			
 
				-
			
 
				     # app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
			
 
				     app.run()
			
 
				     log("OTR running "+str(port))
			
--- a/otr/table_line.py
+++ b/otr/table_line.py
--- a/result.html
+++ b/result.html
		`@@ -0,0 +1 @@`
		`+kill -9 $(lsof -i:15010\|sed -n '2,$p'\|awk '{print $2}'\|tr '\n' ' ')`