fangjiasheng 2 жил өмнө
parent
commit
7741734a8c

+ 6 - 0
format_convert/convert.py

@@ -364,6 +364,12 @@ def _convert():
     {[-5], 0}: 整个转换过程超时
     {[-6], 0}: 阿里云UDF队列超时
     {[-7], 1}: 文件需密码,无法打开
+    {[-8], 0}: 调用现成接口报错
+    {[-9], 0}: 接口接收数据为空
+    {[-10], 0}: 长图分割报错
+    {[-11], 0}: 新接口idc、isr、atc报错
+    {[-12], 0}: 表格跨页连接报错
+    {[-13], 0}: pdf表格线处理报错
     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
     """
 

+ 33 - 10
format_convert/convert_doc.py

@@ -1,8 +1,9 @@
 import inspect
 import os
 import sys
+from bs4 import BeautifulSoup
 sys.path.append(os.path.dirname(__file__) + "/../")
-from format_convert.convert_tree import _Document
+from format_convert.convert_tree import _Document, _Sentence, _Page
 import logging
 import traceback
 from format_convert import get_memory_info
@@ -35,14 +36,31 @@ class DocConvert:
         self.unique_type_dir = unique_type_dir
 
     def convert(self):
-        # 调用office格式转换
-        file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
-        if judge_error_code(file_path):
-            self._doc.error_code = file_path
-            return
-        _docx = DocxConvert(file_path, self.unique_type_dir)
-        _docx.convert()
-        self._doc = _docx._doc
+        """Convert the .doc file at ``self.path`` and populate ``self._doc``.
+
+        The file is first read as text and parsed with BeautifulSoup. When
+        that succeeds the file is treated as HTML saved with a .doc extension:
+        its plain text becomes one sentence on one page. Otherwise the file is
+        converted to docx through the office interface and handed to
+        ``DocxConvert``; if the office interface returns an error code, that
+        code is stored in ``self._doc.error_code`` and the method returns.
+        """
+        # Special case first: the ".doc" file may actually be HTML text.
+        # NOTE(review): the file is decoded with the platform default
+        # encoding, so detection can differ between hosts - confirm the
+        # encoding these files are expected to use.
+        is_html_doc = False
+        text = None
+        try:
+            with open(self.path, 'r') as f:
+                html_str = f.read()
+            soup = BeautifulSoup(html_str, 'lxml')
+            text = soup.text
+            is_html_doc = True
+        except Exception:
+            # Presumably the usual outcome for a binary .doc file (it does not
+            # decode as text); fall through to the office conversion below.
+            # Only Exception is caught so KeyboardInterrupt and SystemExit
+            # still propagate.
+            pass
+
+        if is_html_doc:
+            self._page = _Page(None, 0)
+            _sen = _Sentence(text, (0, 0, 0, 0))
+            self._page.add_child(_sen)
+            self._doc.add_child(self._page)
+        else:
+            # Call the office format conversion.
+            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
+            if judge_error_code(file_path):
+                self._doc.error_code = file_path
+                return
+            _docx = DocxConvert(file_path, self.unique_type_dir)
+            _docx.convert()
+            self._doc = _docx._doc
 
     def get_html(self):
         try:
@@ -52,5 +70,10 @@ class DocConvert:
             self._doc.error_code = [-1]
         if self._doc.error_code is not None:
             return self._doc.error_code
-        print(self._doc.children)
+        # print(self._doc.children)
         return self._doc.get_html()
+
+
+if __name__ == '__main__':
+    # Runs only when this module is executed as a script: converts one
+    # hard-coded local sample file and prints what get_html() returns.
+    c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
+    print(c.get_html())

+ 18 - 2
format_convert/convert_docx.py

@@ -10,7 +10,7 @@ import xml
 import zipfile
 import docx
 from format_convert.convert_image import picture2text
-from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator
+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
 from format_convert.wrapt_timeout_decorator import timeout
 
 
@@ -325,6 +325,18 @@ class DocxConvert:
             return
         order_list, text_list = order_and_text_list
 
+        self._page = _Page(None, 0)
+
+        # 乱码返回文件格式错误
+        match1 = re.findall(get_garble_code(), ''.join(text_list))
+        if len(match1) > 10:
+            log("doc/docx garbled code!")
+            # self._doc.error_code = [-3]
+            _sen = _Sentence('文件乱码!', (0, 0, 0, 0))
+            self._page.add_child(_sen)
+            self._doc.add_child(self._page)
+            return
+
         # test
         # for i in range(len(text_list)):
         #     print(order_list[i], text_list[i])
@@ -338,7 +350,6 @@ class DocxConvert:
 
         image_list = self.get_images()
 
-        self._page = _Page(None, 0)
         order_y = 0
         doc_pr_cnt = 0
         for tag in order_list:
@@ -427,3 +438,8 @@ class DocxConvert:
         if self._doc.error_code is not None:
             return self._doc.error_code
         return self._doc.get_html()
+
+
+if __name__ == '__main__':
+    # Runs only when this module is executed as a script: converts one
+    # hard-coded local sample file and prints what get_html() returns.
+    c = DocxConvert("C:/Users/Administrator/Downloads/1631944542835.docx", "C:/Users/Administrator/Downloads/1/")
+    print(c.get_html())

+ 3 - 0
format_convert/convert_image.py

@@ -224,6 +224,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, u
             # for _textbox in list_text_boxes:
             #     print("==",_textbox.get_text())
             lt = LineTable()
+            # print('text_list', text_list)
+            # print('bbox_list', bbox_list)
+            # print('list_line', list_line)
             tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines, False)
 
             # 合并同一行textbox

+ 759 - 120
format_convert/convert_pdf.py

@@ -5,6 +5,9 @@ import logging
 import os
 import re
 import sys
+
+from bs4 import BeautifulSoup
+
 sys.path.append(os.path.dirname(__file__) + "/../")
 from pdfplumber import PDF
 from pdfplumber.table import TableFinder
@@ -12,6 +15,10 @@ from pdfplumber.page import Page as pdfPage
 from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table
 import time
 import pdfminer
+import math
+from scipy.stats import linregress
+from matplotlib import pyplot as plt
+from shapely.geometry import LineString, Point
 from format_convert import timeout_decorator
 from PIL import Image
 from format_convert.convert_image import image_process
@@ -26,9 +33,9 @@ from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
-    LTTextBoxVertical, LTLine
+    LTTextBoxVertical, LTLine, LTTextContainer
 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
-    get_logger, log, memory_decorator,draw_lines_plt
+    get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross
 import fitz
 from format_convert.wrapt_timeout_decorator import timeout
 
@@ -100,9 +107,9 @@ def pdf2Image(path, save_dir):
 def pdf_analyze(interpreter, page, device, page_no):
+    """Run ``interpreter.process_page(page)`` and return ``device.get_result()``.
+
+    ``page_no`` is used only in the log line that reports the elapsed time.
+    """
     log("into pdf_analyze")
     pdf_time = time.time()
-    print("pdf_analyze interpreter process...")
+    # print("pdf_analyze interpreter process...")
     interpreter.process_page(page)
-    print("pdf_analyze device get_result...")
+    # print("pdf_analyze device get_result...")
     layout = device.get_result()
     log("pdf2text page " + str(page_no) + " read time " + str(time.time() - pdf_time))
     return layout
@@ -389,7 +396,7 @@ def pdf2text(path, unique_type_dir):
                                 #         image_stream = ff.read()
                                 except Exception:
                                     log("pdf2text pdfminer read image in page " + str(page_no) +
-                                                 "  fail! use pymupdf read image...")
+                                        "  fail! use pymupdf read image...")
                                     # print(traceback.print_exc())
                                     image_text = page_info_dict.get(page_no)[0]
                                     if image_text is None:
@@ -476,7 +483,7 @@ def pdf2text(path, unique_type_dir):
         return [-3]
     except Exception as e:
         log("pdf2text error!")
-        print("pdf2text", traceback.print_exc())
+        traceback.print_exc()
         return [-1]
 
 
@@ -497,7 +504,7 @@ def get_single_pdf(path, page_no):
         raise e
     except Exception as e:
         log("get_single_pdf error! page " + str(page_no))
-        print("get_single_pdf", traceback.print_exc())
+        traceback.print_exc()
         raise e
 
 
@@ -578,7 +585,7 @@ def page_table_connect(has_table_dict):
     except Exception as e:
         # print("page_table_connect", e)
         log("page_table_connect error!")
-        print("page_table_connect", traceback.print_exc())
+        traceback.print_exc()
         return [-1], [-1]
 
 
@@ -589,7 +596,7 @@ def read_pdf(path, package_name, packages):
                         char_margin=0.3,
                         line_margin=0.01,
                         word_margin=0.01,
-                        boxes_flow=0.1,)
+                        boxes_flow=0.1, )
 
     if package_name == packages[0]:
         fp = open(path, 'rb')
@@ -668,7 +675,7 @@ class PDFConvert:
                                 char_margin=0.3,
                                 line_margin=0.01,
                                 word_margin=0.01,
-                                boxes_flow=0.1,)
+                                boxes_flow=0.1, )
             if package_name == self.packages[0]:
                 # fp = open(self.path, 'rb')
                 # parser = PDFParser(fp)
@@ -702,7 +709,7 @@ class PDFConvert:
                 self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
                 self.has_init_pdf[3] = 0
             else:
-                print("Only Support Packages", str(self.packages))
+                log("Only Support Packages " + str(self.packages))
                 raise Exception
         except Exception as e:
             log(package_name + " cannot open pdf!")
@@ -766,29 +773,401 @@ class PDFConvert:
             self._doc.add_child(self._page)
             page_no += 1
 
+    def clean_text(self, _text):
+        """Return ``_text`` with every whitespace character removed."""
+        # Raw string: "\s" in a normal literal is an invalid escape sequence.
+        return re.sub(r"\s", "", _text)
 
-    def clean_text(self,_text):
-
-        return re.sub("\s","",_text)
-
-
-    def get_text_lines(self,page,page_no):
+    def get_text_lines(self, page, page_no):
+        """Return the table edges pdfplumber finds on a page as LTLine objects.
+
+        A pdfplumber page is built from ``self.doc_pdfplumber`` and
+        ``self.doc_top`` is advanced by that page's height. Edges with a
+        positive ``linewidth`` are kept; an edge without a ``linewidth`` key
+        is also kept. When no edge has a positive ``linewidth`` every edge is
+        kept.
+
+        :param page: page object passed through to pdfplumber's Page
+        :param page_no: page number, passed to pdfplumber and used in the log
+        :return: list of LTLine
+        """
         lt_line_list = []
         page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
         self.doc_top += page_plumber.height
 
         table_finder = TableFinder(page_plumber)
+        all_width_zero = True
+        for _edge in table_finder.get_edges():
+            if _edge.get('linewidth') and _edge.get('linewidth') > 0:
+                all_width_zero = False
+                break
         for _edge in table_finder.get_edges():
-            lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
-                                       (float(_edge["x1"]), float(_edge["y1"]))))
-        log("pdf page %s has %s lines"%(str(page_no),str(len(lt_line_list))))
+            # print(_edge)
+            # A linewidth that is present but None is treated as 0 here, the
+            # same way the scan above treats it, instead of raising TypeError
+            # on the comparison.
+            if all_width_zero or (_edge.get('linewidth', 0.1) or 0) > 0:
+                lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
+                                           (float(_edge["x1"]), float(_edge["y1"]))))
+        log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
+        return lt_line_list
+
+    def get_page_lines(self, layout, page_no):
+        """Extract table ruling lines from a page layout.
+
+        The bounding boxes of the LTRect/LTCurve/LTLine elements in ``layout``
+        are filtered down to lines that meet at least one other line, snapped
+        to horizontal/vertical, merged, and returned as LTLine objects.
+
+        :param layout: page layout; iterated for its elements and read for
+            ``height`` and ``width``
+        :param page_no: page number, used only in the log message
+        :return: list of LTLine; empty when no usable lines are found
+        """
+        def _plot(_line_list, mode=1):
+            """Debug helper: draw the lines with matplotlib.
+
+            mode 1: items carry a ``bbox`` entry in ``__dict__``;
+            mode 2: items are ``[x0, y0, x1, y1]`` sequences.
+            """
+            for _line in _line_list:
+                if mode == 1:
+                    x0, y0, x1, y1 = _line.__dict__.get("bbox")
+                elif mode == 2:
+                    x0, y0, x1, y1 = _line
+                else:
+                    raise ValueError("unsupported plot mode: " + str(mode))
+                plt.plot([x0, x1], [y0, y1])
+            plt.show()
+            return
+
+        def is_cross(A, B, C, D):
+            """Return True when segment AB and segment CD meet.
+
+            Segments lying on the same vertical or horizontal line count when
+            their ranges overlap; otherwise the shapely intersection of the
+            two segments has to be a single point.
+            """
+            if A[0] == B[0] == C[0] == D[0]:
+                if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
+                        or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
+                    return True
+            if A[1] == B[1] == C[1] == D[1]:
+                if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
+                        or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
+                    return True
+
+            line1 = LineString([A, B])
+            line2 = LineString([C, D])
+
+            int_pt = line1.intersection(line2)
+            # Only a non-empty Point exposes x/y; testing the type replaces
+            # the former bare "except" around the attribute access.
+            return isinstance(int_pt, Point) and not int_pt.is_empty
+
+        def calculate_k(bbox):
+            """Return the slope of the bbox diagonal (0 when it is NaN)."""
+            x = [bbox[0], bbox[2]]
+            y = [bbox[1], bbox[3]]
+            slope, intercept, r_value, p_value, std_err = linregress(x, y)
+            # print('k', slope)
+            if math.isnan(slope):
+                slope = 0
+            return slope
+
+        def line_iou(line1, line2, axis=0):
+            """Overlap of two segments along ``axis`` divided by the shorter length.
+
+            Each line is ``[start_point, end_point]``. Returns 0. when the
+            shorter segment has zero length on that axis.
+            """
+            inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
+            # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
+            union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
+            if union in [0, 0.]:
+                iou = 0.
+            else:
+                iou = inter / union
+            return iou
+
+        def get_cross_line(_line_list, threshold=1, cross_times=0):
+            """Keep the lines that meet other lines.
+
+            Every line is lengthened by ``threshold`` at both ends (clamped to
+            the page) before testing. A lengthened line is kept as soon as its
+            crossing count reaches ``cross_times``.
+            """
+            _cross_line_list = []
+            for line1 in _line_list:
+                if line1 in _cross_line_list:
+                    continue
+                if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
+                    p1 = [max(0, line1[0] - threshold), line1[1]]
+                    p2 = [min(line1[2] + threshold, page_w), line1[3]]
+                else:
+                    p1 = [line1[0], max(0, line1[1] - threshold)]
+                    p2 = [line1[2], min(line1[3] + threshold, page_h)]
+                line1 = [p1[0], p1[1], p2[0], p2[1]]
+                _times = 0
+                for line2 in _line_list:
+                    if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
+                        p3 = [max(0, line2[0] - threshold), line2[1]]
+                        p4 = [min(line2[2] + threshold, page_w), line2[3]]
+                    else:
+                        p3 = [line2[0], max(0, line2[1] - threshold)]
+                        p4 = [line2[2], min(line2[3] + threshold, page_h)]
+                    line2 = [p3[0], p3[1], p4[0], p4[1]]
+                    if line1 == line2:
+                        continue
+                    if is_cross(p1, p2, p3, p4):
+                        _times += 1
+                        if _times >= cross_times:
+                            _cross_line_list += [line1]
+                            break
+            return _cross_line_list
+
+        def repair_bias_line(_line_list):
+            """Snap every line to exactly horizontal or exactly vertical.
+
+            A line wider than tall takes the smaller of its two y values for
+            both ends; any other line takes the smaller of its two x values.
+            """
+            temp_list = []
+            for line in _line_list:
+                x0, y0, x1, y1 = line
+                _y = min(y0, y1)
+                _x = min(x0, x1)
+                if abs(x0 - x1) > abs(y0 - y1):
+                    temp_list.append([x0, _y, x1, _y])
+                else:
+                    temp_list.append([_x, y0, _x, y1])
+            return temp_list
+
+        def split_cols(_lines, threshold):
+            """Sort ``_lines`` in place and group its vertical lines into columns.
+
+            A vertical line joins the current column while the column's
+            reference x lies within ``threshold`` of the line's x range.
+            """
+            _lines.sort(key=lambda x: (x[0], x[1]))
+            cols = []
+            col = []
+            current_w = _lines[0][0]
+            for line in _lines:
+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
+                    continue
+                if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold:
+                    col.append(line)
+                else:
+                    if col:
+                        cols.append(col)
+                    col = [line]
+                    current_w = line[0]
+            if col:
+                cols.append(col)
+            return cols
+
+        def repair_col_line(_straight_list, _bias_list, threshold=2, min_width=7):
+            """Extend the vertical columns of ``_straight_list`` using ``_bias_list``.
+
+            Vertical lines from ``_bias_list`` whose x lies within
+            ``threshold`` of an existing column are added on that column's x
+            (lengthened by 3 at each end). Of two columns closer than
+            ``min_width``, the one whose total length is less than a third of
+            the other's is removed. Returns [] when either input is empty.
+            """
+            if not _straight_list or not _bias_list:
+                log('add_col_bias_line empty ' + str(len(_straight_list)) + ' ' + str(len(_bias_list)))
+                return []
+
+            # Group the straight lines into columns.
+            cols = split_cols(_straight_list, threshold)
+
+            # Add column pieces taken from the bias lines. The parameter is
+            # used here rather than the enclosing function's variable.
+            new_list = []
+            for line in _bias_list:
+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
+                    continue
+                for col in cols:
+                    w = col[0][0]
+                    if w - threshold <= line[0] <= w + threshold or w - threshold <= line[2] <= w + threshold:
+                        new_list.append([w, line[1] - 3, w, line[3] + 3])
+            new_list += _straight_list
+
+            # De-duplicate, keeping first-seen order; tuple keys avoid the
+            # str()/eval() round trip.
+            new_list = [list(t) for t in dict.fromkeys(tuple(x) for x in new_list)]
+
+            # Group again, now including the added pieces.
+            cols = split_cols(new_list, threshold)
+
+            # Drop the much shorter one of two columns that are too close.
+            for col1 in cols:
+                for col2 in cols:
+                    if col1 == col2 or abs(col1[0][0] - col2[0][0]) > min_width:
+                        continue
+
+                    col1_len, col2_len = 0, 0
+                    for c in col1:
+                        col1_len += abs(c[1] - c[3])
+                    for c in col2:
+                        col2_len += abs(c[1] - c[3])
+                    if col1_len > col2_len * 3:
+                        for c in col2:
+                            if c in new_list:
+                                new_list.remove(c)
+                    if col2_len > col1_len * 3:
+                        for c in col1:
+                            if c in new_list:
+                                new_list.remove(c)
+            return new_list
+
+        def merge_line(_line_list, threshold=2):
+            """Merge touching vertical pieces per column and overlapping
+            horizontal pieces per row. Returns [] for an empty input."""
+            # Nothing to merge; without this guard the indexing below raises
+            # IndexError on an empty list.
+            if not _line_list:
+                return []
+
+            new_line_list = []
+            # Group into columns.
+            _line_list.sort(key=lambda x: (x[0], x[1]))
+            cols = []
+            col = [_line_list[0]]
+            current_w = _line_list[0][0]
+            for line in _line_list:
+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
+                    continue
+                if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
+                        and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
+                    col.append(line)
+                else:
+                    if col:
+                        cols.append(col)
+                    col = [line]
+                    current_w = line[0]
+            if col:
+                cols.append(col)
+
+            for col in cols:
+                temp_c = col[0]
+                col_w = col[0][0]
+                for i in range(len(col) - 1):
+                    c = col[i]
+                    next_c = col[i + 1]
+                    if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]):
+                        temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
+                                  max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
+                    else:
+                        new_line_list.append(temp_c)
+                        temp_c = next_c
+                if not new_line_list or new_line_list[-1] != temp_c:
+                    new_line_list.append(temp_c)
+
+            # Group into rows.
+            _line_list.sort(key=lambda x: (x[1], x[0]))
+            rows = []
+            row = []
+            current_h = _line_list[0][1]
+            for line in _line_list:
+                if abs(line[0] - line[2]) < abs(line[1] - line[3]):
+                    continue
+                if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
+                    row.append(line)
+                else:
+                    if row:
+                        rows.append(row)
+                    row = [line]
+                    current_h = line[1]
+            if row:
+                rows.append(row)
+
+            for row in rows:
+                temp_r = row[0]
+                row_h = row[0][1]
+                for i in range(len(row) - 1):
+                    r = row[i]
+                    next_r = row[i + 1]
+                    # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
+                    if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0):
+                        temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
+                                  max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
+                    else:
+                        new_line_list.append(temp_r)
+                        temp_r = next_r
+                if not new_line_list or new_line_list[-1] != temp_r:
+                    new_line_list.append(temp_r)
+            return new_line_list
+
+        def remove_outline_no_cross(_line_list):
+            """Remove the leftmost and rightmost vertical lines when both are
+            met by rows only near their ends while rows in their middle part
+            do not reach them. The list is modified in place and returned."""
+            row_list = []
+            col_list = []
+            for line in _line_list:
+                # Collect all rows.
+                if abs(line[0] - line[2]) > abs(line[1] - line[3]):
+                    row_list.append(line)
+                # Collect all columns.
+                if abs(line[0] - line[2]) < abs(line[1] - line[3]):
+                    col_list.append(line)
+
+            if not col_list:
+                return _line_list
+
+            # The two outer (left and right) border candidates.
+            col_list.sort(key=lambda x: (x[0], x[1]))
+            left_col = col_list[0]
+            right_col = col_list[-1]
+
+            # Count crossings near the ends versus in the middle part.
+            compare_list = []
+            for col in [left_col, right_col]:
+                add_h = abs(col[1]-col[3]) / 8
+                center_area = [col[1]+add_h, col[3]-add_h]
+                cross_cnt = 0
+                center_cross_cnt = 0
+                center_row_cnt = 0
+                for row in row_list:
+                    if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
+                        if center_area[0] <= row[1] <= center_area[1]:
+                            center_cross_cnt += 1
+                        else:
+                            cross_cnt += 1
+                    else:
+                        if center_area[0] <= row[1] <= center_area[1]:
+                            center_row_cnt += 1
+                compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
+
+            _flag = True
+            for c in compare_list:
+                if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
+                    continue
+                _flag = False
+            log('compare_list ' + str(compare_list))
+            if _flag and compare_list[0][1] == compare_list[1][1] \
+                    and compare_list[0][2] == compare_list[1][2]:
+                for col in [left_col, right_col]:
+                    if col in _line_list:
+                        _line_list.remove(col)
+            return _line_list
+
+        log('into get_page_lines')
+
+        page_h = layout.height
+        page_w = layout.width
+
+        line_list = []
+        bias_line_list = []
+        for element in layout:
+            # Only these three element types contribute a bbox.
+            if isinstance(element, (LTRect, LTCurve, LTLine)):
+                if element.height > 0.5 and element.width > 0.5:
+                    # print('element.height, element.width', element.height, element.width)
+                    k = calculate_k(element.bbox)
+                    # A diagonal slope between tan(30 deg) and tan(60 deg) is
+                    # discarded; other two-dimensional boxes become bias lines.
+                    if not 1.73 / 3 < abs(k) < 1.73:
+                        bias_line_list.append(element.bbox)
+                    continue
+                line_list.append(element.bbox)
+
+        if not line_list and not bias_line_list:
+            return []
+
+        # Decide whether the bias lines should be used to build the table.
+        if len(line_list) < 6 and len(bias_line_list) > len(line_list) * 2:
+            # print('use bias line')
+            # bias_line_list += add_col_bias_line(line_list, bias_line_list)
+            line_list = bias_line_list
+
+        # De-duplicate, keeping first-seen order (no str()/eval() round trip).
+        line_list = list(dict.fromkeys(tuple(x) for x in line_list))
+
+        # Keep lines that meet another line.
+        cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
+
+        if not cross_line_list:
+            return []
+
+        # Straighten slanted lines.
+        cross_line_list = repair_bias_line(cross_line_list)
+
+        # Repair vertical lines.
+        if bias_line_list:
+            cross_line_list = repair_col_line(cross_line_list, bias_line_list)
+
+        # Keep lines that meet another line (may leave nothing; merge_line
+        # and remove_outline_no_cross both accept an empty list).
+        cross_line_list = get_cross_line(cross_line_list, threshold=1, cross_times=1)
+
+        # Merge line pieces.
+        cross_line_list = merge_line(cross_line_list)
+
+        # Remove the outermost nested border.
+        cross_line_list = remove_outline_no_cross(cross_line_list)
+        # show
+        # print('len(cross_line_list)', len(cross_line_list))
+        # _plot(line_list, mode=2)
+        # _plot(cross_line_list, mode=2)
+
+        lt_line_list = []
+        for line in cross_line_list:
+            lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
+                                       (float(line[2]), float(line[3]))))
+        log("pdf page %s has %s lines" % (str(page_no), str(len(lt_line_list))))
         return lt_line_list
 
-    def recognize_text(self,layout,page_no,lt_text_list,lt_line_list):
+    def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
         list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
         self._page.in_table_objs = filter_objs
 
-        print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
+        # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
 
         for table in list_tables:
             _table = _Table(table["table"], table["bbox"])
@@ -804,7 +1183,7 @@ class PDFConvert:
         # pdf对象需反向排序
         self._page.is_reverse = True
 
-    def is_text_legal(self,lt_text_list,page_no):
+    def is_text_legal(self, lt_text_list, page_no):
         # 无法识别pdf字符编码,整页用ocr
         text_temp = ""
         for _t in lt_text_list:
@@ -819,6 +1198,19 @@ class PDFConvert:
                 _image = _Image(page_image[1], page_image[0])
                 self._page.add_child(_image)
             return False
+
+        match1 = re.findall(get_garble_code(), text_temp)
+        # match2 = re.search('[\u4e00-\u9fa5]', text_temp)
+        if len(match1) > 3 and len(text_temp) > 10:
+            log("pdf garbled code! try pymupdf... " + text_temp[:20])
+            page_image = self.get_page_image(page_no)
+            if judge_error_code(page_image):
+                self._page.error_code = page_image
+            else:
+                _image = _Image(page_image[1], page_image[0])
+                self._page.add_child(_image)
+            return False
+
         return True
 
     def convert_page(self, page, page_no):
@@ -852,7 +1244,7 @@ class PDFConvert:
                         lt_image_list.append(y)
                         # image_count += 1
         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
-        print("convert_pdf page", page_no)
+        log("convert_pdf page " + str(page_no))
         log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
 
         # 若只有文本且图片数为0,直接提取文字及表格
@@ -873,14 +1265,18 @@ class PDFConvert:
                     self._page.add_child(_image)
                 return
 
-
-            if not self.is_text_legal(lt_text_list,page_no):
+            if not self.is_text_legal(lt_text_list, page_no):
                 return
 
             try:
-                lt_line_list = self.get_text_lines(page,page_no)
-                self.recognize_text(layout,page_no,lt_text_list,lt_line_list)
-
+                lt_line_list = self.get_page_lines(layout, page_no)
+            except:
+                traceback.print_exc()
+                lt_line_list = []
+                self._page.error_code = [-13]
+            try:
+                # lt_line_list = self.get_text_lines(page,page_no)
+                self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
             except:
                 traceback.print_exc()
                 self._page.error_code = [-8]
@@ -902,7 +1298,7 @@ class PDFConvert:
             # 图表对象
             for image in lt_image_list:
                 try:
-                    print("pdf2text LTImage size", page_no, image.width, image.height)
+                    # print("pdf2text LTImage size", page_no, image.width, image.height)
                     image_stream = image.stream.get_data()
                     # 小的图忽略
                     if image.width <= 300 and image.height <= 300:
@@ -911,7 +1307,7 @@ class PDFConvert:
                     img_test = Image.open(io.BytesIO(image_stream))
                     # img_test.show()
                     if image.height >= 1000 and image.width >= 1000:
-                        print("pdf2text LTImage stream output size", img_test.size)
+                        # print("pdf2text LTImage stream output size", img_test.size)
                         page_image = self.get_page_image(page_no)
                         if judge_error_code(page_image):
                             self._page.error_code = page_image
@@ -932,19 +1328,25 @@ class PDFConvert:
                 except Exception:
                     log("pdf2text pdfminer read image in page " + str(page_no) +
                         "  fail! use pymupdf read image...")
-                    print(traceback.print_exc())
+                    traceback.print_exc()
             # pdf对象需反向排序
             self._page.is_reverse = True
             self.init_package("pdfplumber")
 
-            if not self.is_text_legal(lt_text_list,page_no):
+            if not self.is_text_legal(lt_text_list, page_no):
                 return
 
-            lt_line_list = self.get_text_lines(page,page_no)
-            self.recognize_text(layout,page_no,lt_text_list,lt_line_list)
+            # lt_line_list = self.get_text_lines(page, page_no)
+            try:
+                lt_line_list = self.get_page_lines(layout, page_no)
+            except:
+                traceback.print_exc()
+                lt_line_list = []
+                self._page.error_code = [-13]
+            self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
 
     def get_layout(self, page, page_no):
-        log("")
+        log("get_layout")
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
         if self._doc.error_code is not None:
@@ -1032,11 +1434,137 @@ class PDFConvert:
                 continue
             self._doc.add_child(self._page)
 
+    def connect_table(self, html_list):
+        """
+        Join tables that were split across consecutive pages.
+
+        Page k is connected to page k + 1 when
+        1. nothing but page-number characters follows the last table of page k,
+        2. page k + 1 starts with a table, and
+        3. the last row of the former table and the first row of the latter
+           table have the same number of <td> cells.
+        Inside a connected group the boundary between two tables is removed;
+        when the first cell of the continuing row is empty, that row is treated
+        as the remainder of the row above it and merged into it cell by cell.
+
+        :param html_list: list of html strings, one per page
+        :return: a one-element list holding the html of all pages concatenated
+                 (html_list itself when it is empty)
+        """
+        if not html_list:
+            return html_list
+
+        marker = '#@#@#'
+
+        def boundary_flags(page_html):
+            """Return (starts_with_table, ends_with_table) for one page."""
+            starts_with_table = page_html.startswith('<table')
+            ends_with_table = False
+            last_open = page_html.rfind('<table')
+            if last_open != -1:
+                last_close = page_html.rfind('</table>')
+                if last_close >= last_open:
+                    tail = re.sub('<div>|</div>', '', page_html[last_close + len('</table>'):])
+                    # The whole tail must consist of page-number characters.
+                    # A search for '[^...]*' always matches (possibly empty)
+                    # at position 0, so it accepted tails such as '1abc'.
+                    ends_with_table = re.fullmatch('[-/第页0-9]*', tail) is not None
+            return starts_with_table, ends_with_table
+
+        def boundary_col_counts(page_html):
+            """
+            Return [td count of the first row of the first table,
+                    td count of the last row of the last table];
+            an entry is None when the table or its rows are missing.
+            """
+            tables = BeautifulSoup(page_html, 'lxml').findAll('table')
+            if not tables:
+                return [None, None]
+            first_rows = tables[0].findAll('tr')
+            last_rows = tables[-1].findAll('tr')
+            first_cnt = len(first_rows[0].findAll('td')) if first_rows else None
+            last_cnt = len(last_rows[-1].findAll('td')) if last_rows else None
+            return [first_cnt, last_cnt]
+
+        # Condition 1 and 2: group consecutive pages by their boundaries.
+        # Every adjacent pair is tested on its own, so the page that ends one
+        # group can still open the next one.
+        flags = [boundary_flags(h) for h in html_list]
+        page_groups = []
+        current = [0]
+        for page_no in range(1, len(html_list)):
+            if flags[page_no - 1][1] and flags[page_no][0]:
+                current.append(page_no)
+            else:
+                page_groups.append(current)
+                current = [page_no]
+        page_groups.append(current)
+
+        # Condition 3: split a group wherever the column counts differ
+        checked_groups = []
+        for group in page_groups:
+            if len(group) == 1:
+                checked_groups.append(group)
+                continue
+            col_counts = [boundary_col_counts(html_list[page_no]) for page_no in group]
+            current = [group[0]]
+            for k in range(1, len(group)):
+                upper_cnt = col_counts[k - 1][1]
+                lower_cnt = col_counts[k][0]
+                if upper_cnt is not None and upper_cnt == lower_cnt:
+                    current.append(group[k])
+                else:
+                    checked_groups.append(current)
+                    current = [group[k]]
+            checked_groups.append(current)
+
+        # Join the tables of every group that passed all conditions
+        new_html_list = []
+        for group in checked_groups:
+            if len(group) == 1:
+                new_html_list.append(html_list[group[0]])
+                continue
+            new_html = ''.join(html_list[page_no] for page_no in group)
+            # Replace each table boundary with a marker row so that the rows on
+            # both sides of it can be located after parsing
+            new_html = re.sub('</table>([-/第页0-9]|<div>|</div>)*<table border="1">',
+                              '<tr><td>#@#@#</td></tr>', new_html)
+
+            soup = BeautifulSoup(new_html, 'lxml')
+            trs = soup.findAll('tr')
+            for i, tr in enumerate(trs):
+                if tr.get_text() != marker:
+                    continue
+                # Only merge when a row exists on both sides of the marker
+                if 0 < i < len(trs) - 1:
+                    td1 = trs[i - 1].findAll('td')
+                    td2 = trs[i + 1].findAll('td')
+                    # An empty first cell marks the lower row as the rest of the
+                    # upper row. Rows of different width are left untouched so
+                    # that no cell text is dropped.
+                    if td2 and len(td1) == len(td2) and td2[0].get_text() == '':
+                        for upper_td, lower_td in zip(td1, td2):
+                            upper_td.string = upper_td.get_text() + lower_td.get_text()
+                        trs[i + 1].decompose()
+                tr.decompose()
+            new_html_list.append(str(soup))
+
+        return [''.join(new_html_list)]
+
+
     def get_html(self):
+        """
+        Convert the pdf and return its html.
+
+        :return: a list holding the html with cross-page tables joined;
+                 the document's error code if the conversion failed;
+                 [-12] if joining the tables raised
+        """
         self.convert()
         if self._doc.error_code is not None:
             return self._doc.error_code
-        return self._doc.get_html()
+        html = self._doc.get_html(return_list=True)
+        # _Document.get_html can set and return an error code itself. Pass it
+        # through instead of handing it to connect_table, where it would raise
+        # and be reported as [-12].
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        # Join tables that continue across pages
+        try:
+            html = self.connect_table(html)
+        except Exception:
+            traceback.print_exc()
+            return [-12]
+        return html
 
     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
         # 删除过多重复字句,为水印
@@ -1075,7 +1603,7 @@ class PDFConvert:
             ratio = max_size / _img.shape[resize_axis]
             new_shape = [0, 0]
             new_shape[resize_axis] = max_size
-            new_shape[1-resize_axis] = int(_img.shape[1-resize_axis] * ratio)
+            new_shape[1 - resize_axis] = int(_img.shape[1 - resize_axis] * ratio)
             _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
             cv2.imwrite(img_path, _img)
 
@@ -1097,11 +1625,116 @@ class PDFConvert:
             return [-3]
 
 
+def get_text_font(pdf_path=r'C:\Users\Administrator\Desktop\test_pdf\error2-2.pdf', page_index=0):
+    """
+    Debug helper: print the font properties of every text span on one pdf
+    page, then print the words on that page that look underlined.
+
+    :param pdf_path: pdf file to inspect; the default is the sample file this
+                     helper was originally hard-wired to
+    :param page_index: 0-based index of the page to inspect
+    :return: None, everything is written to stdout
+    """
+
+    def flags_decomposer(flags):
+        """Make font flags human readable."""
+        names = []
+        if flags & 2 ** 0:
+            names.append("superscript")
+        if flags & 2 ** 1:
+            names.append("italic")
+        names.append("serifed" if flags & 2 ** 2 else "sans")
+        names.append("monospaced" if flags & 2 ** 3 else "proportional")
+        if flags & 2 ** 4:
+            names.append("bold")
+        return ", ".join(names)
+
+    def get_underlined_textLines(page):
+        """
+        Collect the underlined text on one pdf page.
+
+        :param page: a fitz page
+        :return: list of tuples, one per underlined run:
+                 [(underlined text, block number, line number), ...]
+        """
+        # Underline candidates: horizontal line segments, plus rectangles that
+        # are wider than tall and at most 2 high (underlines are mostly drawn
+        # as such flat rectangles)
+        lines = []
+        for path in page.get_drawings():
+            for item in path["items"]:
+                if item[0] == "l":  # an actual line
+                    p1, p2 = item[1:]
+                    if p1.y == p2.y:
+                        lines.append((p1, p2))
+                elif item[0] == "re":  # a rectangle: check if height is small
+                    r = item[1]
+                    if r.width > r.height and r.height <= 2:
+                        lines.append((r.tl, r.tr))  # take top left / right points
+
+        # The tallest text line of the page is the distance tolerance used below
+        blocks = page.get_text("dict", flags=11)["blocks"]
+        max_lineheight = 0
+        for b in blocks:
+            for text_line in b["lines"]:
+                height = fitz.Rect(text_line["bbox"]).height
+                if height > max_lineheight:
+                    max_lineheight = height
+
+        underlined_res = []
+        words = page.get_text("words")
+        # If underlined, the bottom left / right of a word should not be too
+        # far away from the left / right end of some line
+        for wdx, w in enumerate(words):  # w[4] is the actual word string
+            r = fitz.Rect(w[:4])  # first 4 items are the word bbox
+            for p1, p2 in lines:
+                if abs(r.bl - p1) > max_lineheight:
+                    continue  # this underline does not start at the word
+                if abs(r.br - p2) <= max_lineheight:
+                    # The underline also ends at this word: a single word
+                    print(f"Word '{w[4]}' is underlined! Its block-line number is {w[-3], w[-2]}")
+                    underlined_res.append((w[4], w[-3], w[-2]))
+                    break  # don't check more lines
+                # Otherwise the underline may span several words: walk right
+                # along the same text line until a word ends where it ends
+                phrase = None
+                for right_wdx in range(wdx + 1, len(words)):
+                    _next_w = words[right_wdx]
+                    if _next_w[-2] != w[-2]:  # left the current text line
+                        break
+                    if abs(fitz.Rect(_next_w[:4]).br - p2) <= max_lineheight:
+                        phrase = ' '.join(_one_word[4] for _one_word in words[wdx:right_wdx + 1])
+                        break
+                if phrase is not None:
+                    print(f"Word '{phrase}' is underlined! " +
+                          f"Its block-line number is {w[-3], w[-2]}")
+                    underlined_res.append((phrase, w[-3], w[-2]))
+                    # Stop scanning underlines for this word as well, so the
+                    # same run is not reported once per matching underline
+                    break
+        return underlined_res
+
+    doc_pymupdf = read_pymupdf(pdf_path)
+    page = doc_pymupdf[page_index]
+    blocks = page.get_text("dict", flags=11)["blocks"]
+    for b in blocks:  # iterate through the text blocks
+        for text_line in b["lines"]:  # iterate through the text lines
+            for s in text_line["spans"]:  # iterate through the text spans
+                print("")
+                font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
+                    s["font"],  # font name
+                    flags_decomposer(s["flags"]),  # readable font flags
+                    s["size"],  # font size
+                    s["color"],  # font color
+                )
+                print(s)
+                print("Text: '%s'" % s["text"])  # simple print of text
+                print(font_properties)
+
+    get_underlined_textLines(page)
+
+
 # 以下为现成pdf单页解析接口
 class ParseSentence:
 
-    def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
-        (x0,y0,x1,y1) = bbox
+    def __init__(self, bbox, fontname, fontsize, _text, _title, title_text, _pattern, title_degree, is_outline,
+                 outline_location, page_no):
+        (x0, y0, x1, y1) = bbox
         self.x0 = x0
         self.y0 = y0
         self.x1 = x1
@@ -1119,7 +1752,7 @@ class ParseSentence:
         self.page_no = page_no
 
     def __repr__(self):
-        return "%s,%s,%s,%d,%s"%(self.text,self.title,self.is_outline,self.outline_location,str(self.bbox))
+        return "%s,%s,%s,%d,%s" % (self.text, self.title, self.is_outline, self.outline_location, str(self.bbox))
 
 
 class ParseUtils:
@@ -1127,11 +1760,11 @@ class ParseUtils:
     @staticmethod
     def getFontinfo(_page):
         for _obj in _page._objs:
-            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
+            if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                 for textline in _obj._objs:
                     done = False
                     for lchar in textline._objs:
-                        if isinstance(lchar,(LTChar)):
+                        if isinstance(lchar, (LTChar)):
                             _obj.fontname = lchar.fontname
                             _obj.fontsize = lchar.size
                         done = True
@@ -1153,7 +1786,7 @@ class ParseUtils:
 
             _find = False
             for _ct in cluster_textbox:
-                if abs(_ct["y"]-_textbox.bbox[1]) < 5:
+                if abs(_ct["y"] - _textbox.bbox[1]) < 5:
                     _find = True
                     _ct["textbox"].append(_textbox)
             if not _find:
@@ -1167,14 +1800,14 @@ class ParseUtils:
 
             _linetext = _textboxs[0].get_text()
             for _i in range(1, len(_textboxs)):
-                if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[2])>60:
+                if abs(_textboxs[_i].bbox[0] - _textboxs[_i - 1].bbox[2]) > 60:
                     if _linetext[-1] not in (",", ",", "。", ".", "、", ";"):
                         _linetext += "=,="
                 _linetext += _textboxs[_i].get_text()
 
             _linetext = re.sub("[\s\r\n]", "", _linetext)
             _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
-                     _textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
+                     _textboxs[-1].bbox[2], _textboxs[-1].bbox[3])
 
             _title = None
             _pattern_groups = None
@@ -1192,7 +1825,7 @@ class ParseUtils:
                     title_text = _groups[0][1]
                     _pattern_groups = _groups
             if not _title:
-                _title = ParseUtils.rec_incenter(_bbox,page_bbox)
+                _title = ParseUtils.rec_incenter(_bbox, page_bbox)
 
             title_degree = 2
             if not _title:
@@ -1202,7 +1835,7 @@ class ParseUtils:
                 title_degree = int(_title.split("_")[1])
 
             # 页码
-            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$", _linetext) is not None:
+            if ParseUtils.rec_incenter(_bbox, page_bbox) and re.search("^\d+$", _linetext) is not None:
                 continue
 
             if _linetext == "" or re.search("^,+$", _linetext) is not None:
@@ -1216,7 +1849,10 @@ class ParseUtils:
                 _linetext = _search.group("text")
                 outline_location = int(_search.group("nums"))
 
-            list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))
+            list_sentences.append(
+                ParseSentence(_bbox, _textboxs[-1].__dict__.get("fontname"), _textboxs[-1].__dict__.get("fontsize"),
+                              _linetext, _title, title_text, _pattern_groups, title_degree, is_outline,
+                              outline_location, page_no))
 
         # for _sen in list_sentences:
         #     print(_sen.__dict__)
@@ -1224,133 +1860,136 @@ class ParseUtils:
         return list_sentences
 
     @staticmethod
-    def find_title_by_pattern(_text,_pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
-                                             "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
-                                             "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
-                                             "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]))|" \
-                                             "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]))|" \
-                                             "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]))|" \
-                                             "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\..、\s\-]))|" \
-                                             "(?P<title_15>^(?P<title_15_index_0_0>(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
-                                             "(?P<title_17>^(?P<title_17_index_0_0>(?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>)))|"
-                                             "(?P<title_19>^(?P<title_19_index_0_0>(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))|" \
+    def find_title_by_pattern(_text,
+                              _pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
+                                       "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
+                                       "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
+                                       "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]))|" \
+                                       "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]))|" \
+                                       "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]))|" \
+                                       "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\..、\s\-]))|" \
+                                       "(?P<title_15>^(?P<title_15_index_0_0>(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
+                                       "(?P<title_17>^(?P<title_17_index_0_0>(?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>)))|"
+                                       "(?P<title_19>^(?P<title_19_index_0_0>(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))|" \
                               ):
-        _se = re.search(_pattern,_text)
+        _se = re.search(_pattern, _text)
         groups = []
         if _se is not None:
             _gd = _se.groupdict()
-            for k,v in _gd.items():
+            for k, v in _gd.items():
                 if v is not None:
-                    groups.append((k,v))
+                    groups.append((k, v))
         if len(groups):
-            groups.sort(key=lambda x:x[0])
+            groups.sort(key=lambda x: x[0])
             return groups
         return None
 
     @staticmethod
-    def rec_incenter(o_bbox,p_bbox):
-        p_width = p_bbox[2]-p_bbox[0]
-        l_space = (o_bbox[0]-p_bbox[0])/p_width
-        r_space = (p_bbox[2]-o_bbox[2])/p_width
+    def rec_incenter(o_bbox, p_bbox):
+        p_width = p_bbox[2] - p_bbox[0]
+        l_space = (o_bbox[0] - p_bbox[0]) / p_width
+        r_space = (p_bbox[2] - o_bbox[2]) / p_width
 
-        if abs((l_space-r_space))<0.1 and l_space>0.2:
+        if abs((l_space - r_space)) < 0.1 and l_space > 0.2:
             return "title_2"
 
     @staticmethod
     def is_first_title(_title):
         if _title is None:
             return False
-        if re.search("^\d+$",_title) is not None:
-            if int(_title)==1:
+        if re.search("^\d+$", _title) is not None:
+            if int(_title) == 1:
                 return True
             return False
-        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
-            if _title=="一":
+        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
+            if _title == "一":
                 return True
             return False
-        if re.search("^[a-z]+$",_title) is not None:
-            if _title=="a":
+        if re.search("^[a-z]+$", _title) is not None:
+            if _title == "a":
                 return True
             return False
-        if re.search("^[A-Z]+$",_title) is not None:
-            if _title=="A":
+        if re.search("^[A-Z]+$", _title) is not None:
+            if _title == "A":
                 return True
             return False
-        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
-            if _title=="Ⅰ":
+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
+            if _title == "Ⅰ":
                 return True
             return False
         return False
 
     @staticmethod
     def get_next_title(_title):
-        if re.search("^\d+$",_title) is not None:
-            return str(int(_title)+1)
-        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
-            _next_title = ParseUtils.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
+        if re.search("^\d+$", _title) is not None:
+            return str(int(_title) + 1)
+        if re.search("^[一二三四五六七八九十百]+$", _title) is not None:
+            _next_title = ParseUtils.make_increase(['一', '二', '三', '四', '五', '六', '七', '八', '九', '十'],
+                                                   re.sub("[十百]", '', _title))
             _next_title = list(_next_title)
             _next_title.reverse()
-            if _next_title[-1]!="十":
-                if len(_next_title)>=2:
-                    _next_title.insert(-1,'十')
-            if len(_next_title)>=4:
-                _next_title.insert(-3,'百')
-            if _title[0]=="十":
-                if _next_title=="十":
-                    _next_title = ["二","十"]
-                _next_title.insert(0,"十")
+            if _next_title[-1] != "十":
+                if len(_next_title) >= 2:
+                    _next_title.insert(-1, '十')
+            if len(_next_title) >= 4:
+                _next_title.insert(-3, '百')
+            if _title[0] == "十":
+                if _next_title == "十":
+                    _next_title = ["二", "十"]
+                _next_title.insert(0, "十")
             _next_title = "".join(_next_title)
             return _next_title
-        if re.search("^[a-z]+$",_title) is not None:
-            _next_title = ParseUtils.make_increase([chr(i+ord('a')) for i in range(26)],_title)
+        if re.search("^[a-z]+$", _title) is not None:
+            _next_title = ParseUtils.make_increase([chr(i + ord('a')) for i in range(26)], _title)
             _next_title = list(_next_title)
             _next_title.reverse()
             return "".join(_next_title)
-        if re.search("^[A-Z]+$",_title) is not None:
-            _next_title = ParseUtils.make_increase([chr(i+ord('A')) for i in range(26)],_title)
+        if re.search("^[A-Z]+$", _title) is not None:
+            _next_title = ParseUtils.make_increase([chr(i + ord('A')) for i in range(26)], _title)
             _next_title = list(_next_title)
             _next_title.reverse()
             return "".join(_next_title)
-        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
-            _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$", _title) is not None:
+            _sort = ["Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ"]
             _index = _sort.index(_title)
-            if _index<len(_sort)-1:
-                return _sort[_index+1]
+            if _index < len(_sort) - 1:
+                return _sort[_index + 1]
             return None
 
-
     @staticmethod
-    def make_increase(_sort,_title,_add=1):
-        if len(_title)==0 and _add==0:
+    def make_increase(_sort, _title, _add=1):
+        if len(_title) == 0 and _add == 0:
             return ""
-        if len(_title)==0 and _add==1:
+        if len(_title) == 0 and _add == 1:
             return _sort[0]
         _index = _sort.index(_title[-1])
-        next_index = (_index+_add)%len(_sort)
+        next_index = (_index + _add) % len(_sort)
         next_chr = _sort[next_index]
-        if _index==len(_sort)-1:
+        if _index == len(_sort) - 1:
             _add = 1
         else:
             _add = 0
-        return next_chr+ParseUtils.make_increase(_sort,_title[:-1],_add)
-
-
-
+        return next_chr + ParseUtils.make_increase(_sort, _title[:-1], _add)
 
     @staticmethod
-    def rec_serial(_text,o_bbox,p_bbox,fontname,_pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
-                                                         "(?P<title_2>^\d+[\.、\s])|" \
-                                                         "(?P<title_3>^\d+\.\d+[\.、\s])|" \
-                                                         "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
-                                                         "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
-        #todo :recog the serial of the sentence
-
-
-
-        _se = re.search(_pattern,_text)
+    def rec_serial(_text, o_bbox, p_bbox, fontname, _pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
+                                                             "(?P<title_2>^\d+[\.、\s])|" \
+                                                             "(?P<title_3>^\d+\.\d+[\.、\s])|" \
+                                                             "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
+                                                             "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
+        # todo :recog the serial of the sentence
+
+        _se = re.search(_pattern, _text)
         if _se is not None:
             _gd = _se.groupdict()
-            for k,v in _gd.items():
+            for k, v in _gd.items():
                 if v is not None:
                     return k
         return None
+
+
+if __name__ == '__main__':
+    # Manual smoke run: converts one pdf from a hard-coded local path and
+    # discards the result
+    # get_text_font()
+    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
+
+    # print(b'\x10')

+ 14 - 5
format_convert/convert_test.py

@@ -6,6 +6,9 @@ import sys
 import time
 from glob import glob
 from multiprocessing import Process
+
+from bs4 import BeautifulSoup
+
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
 from format_convert.convert import to_html
@@ -21,10 +24,10 @@ def test_one(p, from_remote=False):
 
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
     if from_remote:
-        # _url = 'http://121.46.18.113:15010/convert'
+        _url = 'http://121.46.18.113:15010/convert'
         # _url = 'http://192.168.2.103:15010/convert'
         # _url = 'http://172.16.160.65:15010/convert'
-        _url = 'http://127.0.0.1:15010/convert'
+        # _url = 'http://127.0.0.1:15010/convert'
         result = json.loads(request_post(_url, data, time_out=10000))
         text_str = ""
         for t in result.get("result_html"):
@@ -58,9 +61,10 @@ if __name__ == '__main__':
         # file_path = "C:/Users/Administrator/Desktop/test_xls/merge_cell.xlsx"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-        # file_path = "C:/Users/Administrator/Downloads/神仙居旅游汽车租赁竞争性磋商文件(1).doc"
+        # file_path = "C:/Users/Administrator/Downloads/QQ图片20230616105216.jpg"
         # file_path = "C:/Users/Administrator/Desktop/test_xls/error2.xlsx"
-        file_path = "C:/Users/Administrator/Desktop/test_doc/error5.docx"
+        # file_path = "C:/Users/Administrator/Desktop/test_image/error9-2.png"
+        file_path = "C:/Users/Administrator/Desktop/test_pdf/直接读表格线error/error51.pdf"
     else:
         file_path = "1660296734009.pdf"
     test_one(file_path, from_remote=True)
@@ -87,4 +91,9 @@ if __name__ == '__main__':
     #     p_list.append(p)
     # for p in p_list:
     #     p.join()
-    # print("finish", time.time() - start_time)
+    # print("finish", time.time() - start_time)
+
+    # with open(file_path, 'r') as f:
+    #     t = f.read()
+    # soup = BeautifulSoup(t, 'lxml')
+    # print(soup.text)

+ 13 - 4
format_convert/convert_tree.py

@@ -20,11 +20,15 @@ class _Document:
         else:
             self.error_code = child.error_code
 
-    def get_html(self):
+    def get_html(self, return_list=False):
         if self.error_code is not None:
             return self.error_code
 
-        html_text = ""
+        if return_list:
+            html_text = []
+        else:
+            html_text = ""
+
         for child in self.children:
             # 先调用get_html才能更新error_code
             child_html_text = child.get_html()
@@ -32,8 +36,13 @@ class _Document:
                 self.error_code = child.error_code
                 return self.error_code
             else:
-                html_text += child_html_text
-        return [html_text]
+                if return_list:
+                    html_text += [child_html_text]
+                else:
+                    html_text += child_html_text
+        if not return_list:
+            html_text = [html_text]
+        return html_text
 
 
 class _Page:

+ 35 - 12
format_convert/convert_xls.py

@@ -1,8 +1,9 @@
 import inspect
 import os
 import sys
+from bs4 import BeautifulSoup
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from format_convert.convert_tree import _Document
+from format_convert.convert_tree import _Document, _Page, _Sentence
 import logging
 import traceback
 from format_convert import get_memory_info
@@ -38,14 +39,31 @@ class XlsConvert:
         self.unique_type_dir = unique_type_dir
 
     def convert(self):
+        """
+        Fill self._doc from the xls file at self.path.
+
+        The file is first read as text and parsed with BeautifulSoup; when
+        that succeeds its text becomes a single sentence on a single page.
+        When it raises, the file goes through the office interface to xlsx and
+        is converted by XlsxConvert; an error code from the interface is
+        stored in self._doc.error_code.
+        """
-        # 调用office格式转换
-        file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
-        if judge_error_code(file_path):
-            self._doc.error_code = file_path
-            return
-        _xlsx = XlsxConvert(file_path, self.unique_type_dir)
-        _xlsx.convert()
-        self._doc = _xlsx._doc
+        # Check for the special case first: an xls file that may actually be html text
+        is_html_xls = False
+        try:
+            # No encoding is passed, so the platform default decides whether the read succeeds
+            with open(self.path, 'r') as f:
+                html_str = f.read()
+            soup = BeautifulSoup(html_str, 'lxml')
+            text = soup.text
+            is_html_xls = True
+        except:
+            # NOTE(review): every exception is swallowed here, and is_html_xls is
+            # set whenever the read and parse do not raise; nothing checks that
+            # the content really is html. Confirm this holds for binary xls
+            # files under every encoding this runs with.
+            pass
+
+        if is_html_xls:
+            self._page = _Page(None, 0)
+            _sen = _Sentence(text, (0, 0, 0, 0))
+            self._page.add_child(_sen)
+            self._doc.add_child(self._page)
+        else:
+            # Call the office format conversion
+            file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
+            if judge_error_code(file_path):
+                self._doc.error_code = file_path
+                return
+            _xlsx = XlsxConvert(file_path, self.unique_type_dir)
+            _xlsx.convert()
+            self._doc = _xlsx._doc
 
     def get_html(self):
         try:
@@ -53,8 +71,13 @@ class XlsConvert:
         except:
             traceback.print_exc()
             self._doc.error_code = [-1]
-        print("xls ", self._doc)
+        # print("xls ", self._doc)
         if self._doc.error_code is not None:
             return self._doc.error_code
-        print(self._doc.children)
-        return self._doc.get_html()
+        # print(self._doc.children)
+        return self._doc.get_html()
+
+
+if __name__ == '__main__':
+    # Manual smoke run: converts one xls from a hard-coded local path and prints the result
+    c = XlsConvert("C:/Users/Administrator/Downloads/1683641686556.xls", "C:/Users/Administrator/Downloads/1")
+    print(c.get_html())

+ 15 - 3
format_convert/convert_zip.py

@@ -1,11 +1,13 @@
 import inspect
 import os
 import sys
+import uuid
+
 sys.path.append(os.path.dirname(__file__) + "/../")
 from format_convert.convert_tree import _Document, _Page, _Sentence
 import logging
 import traceback
-import zipfile
+import my_zipfile as zipfile
 from format_convert import get_memory_info
 from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, get_logger, log, \
     memory_decorator
@@ -126,14 +128,19 @@ class ZipConvert:
                 # 中文乱码,会导致zip解压失败,直接修改对象
                 try:
                     new_f = f.encode('cp437').decode('gbk')
+                    # print('1', new_f)
                 except:
                     new_f = f.encode('utf-8').decode('utf-8')
+                    # print('2', new_f)
                 if f != new_f:
+                    new_f = str(uuid.uuid1().hex) + '.' + f.split('.')[-1]
                     zip_file.NameToInfo[new_f] = zip_file.NameToInfo[f]
                     zip_file.NameToInfo[new_f].filename = new_f
                     zip_file.NameToInfo.pop(f)
+                    zip_file.NameToInfo[new_f].orig_filename = new_f
+                    # zip_file.NameToInfo[new_f].flag_bits = 2048
+                    zip_file.NameToInfo[new_f].has_changed_name = True
                 new_zip_list.append(new_f)
-
             new_zip_list.sort(key=lambda x: len(x))
             for f in new_zip_list:
                 file_list.append(zip_file.extract(f, path=self.zip_path))
@@ -198,4 +205,9 @@ class ZipConvert:
             self._doc.error_code = [-1]
         if self._doc.error_code is not None:
             return self._doc.error_code
-        return self._doc.get_html()
+        return self._doc.get_html()
+
+
+if __name__ == '__main__':
+    # Manual smoke test: build a ZipConvert for one local .zip file and run
+    # get_html(); the return value is discarded.
+    # NOTE(review): both paths are hard-coded to a developer machine.
+    c = ZipConvert("C:/Users/Administrator/Downloads/3775865878373065499.zip", "C:/Users/Administrator/Downloads/1")
+    c.get_html()

+ 1 - 1
format_convert/interface.yml

@@ -5,7 +5,7 @@ MASTER:
 #  local-102: 'http://192.168.2.102'
 #  local-103: 'http://192.168.2.103'
 #  local 'http://127.0.0.1'
-  ip: ['http://192.168.0.115']
+  ip: ['http://127.0.0.1']
 
   PATH:
     python: ['/data/anaconda3/envs/convert3/bin/python']

+ 6 - 0
format_convert/kill_all.sh

@@ -0,0 +1,6 @@
+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
+kill -9 $(lsof -i:17000|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
+kill -9 $(lsof -i:18000|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
+kill -9 $(lsof -i:18020|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
+kill -9 $(lsof -i:18040|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')
+kill -9 $(lsof -i:18060|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')

+ 1 - 0
format_convert/kill_main.sh

@@ -0,0 +1 @@
+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')

+ 102 - 12
format_convert/test_walk.py

@@ -1,17 +1,107 @@
+import copy
 import os
-file_list = []
-for root, dirs, files in os.walk("./", topdown=False):
-    for name in dirs:
-        file_list.append(os.path.join(root, name) + os.sep)
-    for name in files:
-        file_list.append(os.path.join(root, name))
-print(file_list)
+import random
+import re
+import sys
+import time
+from bs4 import BeautifulSoup
+from datetime import datetime
+from multiprocessing import Process
+import datetime as dt
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from format_convert.utils import file_lock
 
 
-s = set()
-s.update("1231asdb我深大")
-s.update("g6712")
+def run():
+
+    f = file_lock(os.path.abspath(os.path.dirname(__file__)) + '/19022.lock')
+    print("acquire file_lock! process " + str(os.getpid()))
+    for i in range(10):
+        print("process " + str(os.getpid()) + " " + str(i))
+        time.sleep(random.randint(0, 1))
+    f.close()
+
+
+def merge_table():
+    with open(r'C:\Users\Administrator\Desktop\2.html', 'r') as f:
+        html_str = f.read()
+    html_str_origin = copy.deepcopy(html_str)
+
+    try:
+        match1 = re.finditer('<table', html_str)
+        match2 = re.finditer('</table>', html_str)
+        table_index_list = []
+        for m1, m2 in zip(match1, match2):
+            table_index_list.append([m1.span()[0], m1.span()[1], m2.span()[0], m2.span()[1]])
+        print(table_index_list)
+
+        soup = BeautifulSoup(html_str)
+        tables = soup.find_all('table')
+        table_td_cnt_list = []
+        for table in tables:
+            tds = table.tr.find_all('td')
+            table_td_cnt_list.append(len(list(tds)))
+        print(table_td_cnt_list)
+
+        if len(table_index_list) == len(table_td_cnt_list):
+            merge_index_list = []
+            temp_index = []
+            for i in range(1, len(table_index_list)):
+                last_index = table_index_list[i-1]
+                index = table_index_list[i]
+                last_tds = table_td_cnt_list[i-1]
+                tds = table_td_cnt_list[i]
+                if index[0] - last_index[-1] == 0 and last_tds == tds:
+                    temp_index += [i-1, i]
+                    temp_index = list(set(temp_index))
+                else:
+                    if temp_index:
+                        merge_index_list.append(temp_index)
+                    temp_index = []
+            if temp_index:
+                merge_index_list.append(temp_index)
+            print(merge_index_list)
+
+            print('before len(html_str)', len(html_str))
+            for merge in merge_index_list:
+                start_index = table_index_list[merge[0]][0]
+                end_index = table_index_list[merge[-1]][-1]
+                table_replace = re.sub('<table[^>]*>|</table>', '', html_str[start_index:end_index])
+                table_replace = '<table border="1">' + table_replace + '</table>'
+                table_replace += ' '*(end_index-start_index-len(table_replace))
+                html_str = html_str[:start_index] + table_replace + html_str[end_index:]
+            print('after len(html_str)', len(html_str))
+
+            if len(html_str_origin) == len(html_str):
+                with open(r'C:\Users\Administrator\Desktop\3.html', 'w') as f:
+                    f.write(html_str)
+                return html_str
+            else:
+                return html_str_origin
+        else:
+            return html_str_origin
+    except:
+        return html_str_origin
+
+
+if __name__ == '__main__':
+    # Disabled experiment: start ten processes that each call run(), then
+    # wait for all of them to finish.
+    # process_list = []
+    # for j in range(10):
+    #     p1 = Process(target=run,)
+    #     p1.start()
+    #     process_list.append(p1)
+    #
+    # for p in process_list:
+    #     p.join()
+
+    # Scratch checks: str.join, day difference between two dates, and
+    # formatting a date shifted by ten days.
+    print('|'.join(['a', 'n']))
+    _t = datetime.strptime('2023-04-26', '%Y-%m-%d')
+    _t2 = datetime.strptime('2023-04-02', '%Y-%m-%d')
+    print(abs((_t2-_t).days))
+    print(datetime.strftime(_t + dt.timedelta(days=10), '%Y-%m-%d'))
+
+    # Disabled: run the table-merging experiment.
+    # merge_table()
+
+    print(datetime.now())
 
-print(len(s))
 
-print(len("".join(["sdas", "我是觉得", "111"])))

Файлын зөрүү хэтэрхий том тул дарагдсан байна
+ 191 - 399
format_convert/utils.py


+ 2 - 2
isr/pre_process.py

@@ -19,11 +19,11 @@ def count_red_pixel(image_np, cnt=1000):
     labels = measure.label(red_mask, connectivity=2)  # 8连通区域标记
     regions = measure.regionprops(labels)
     red_cnt = np.sum(red_mask != 0)
-    print("red_cnt regions", len(regions),red_cnt, time.time()-start_time)
+    # print("red_cnt regions", len(regions),red_cnt, time.time()-start_time)
     if regions and len(regions)>0:
         _max_area = max([r.bbox_area for r in regions])
         if _max_area>100:
-            print("red_cnt max_area", _max_area, time.time()-start_time)
+            # print("red_cnt max_area", _max_area, time.time()-start_time)
             return True
     return False
 

+ 2183 - 0
my_zipfile.py

@@ -0,0 +1,2183 @@
+"""
+Read and write ZIP files.
+
+XXX references to utf-8 need further investigation.
+"""
+import io
+import os
+import importlib.util
+import sys
+import time
+import stat
+import shutil
+import struct
+import binascii
+import threading
+
+try:
+    import zlib # We may need its compression method
+    crc32 = zlib.crc32
+except ImportError:
+    zlib = None
+    crc32 = binascii.crc32
+
+try:
+    import bz2 # We may need its compression method
+except ImportError:
+    bz2 = None
+
+try:
+    import lzma # We may need its compression method
+except ImportError:
+    lzma = None
+
+__all__ = ["BadZipFile", "BadZipfile", "error",
+           "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA",
+           "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"]
+
+class BadZipFile(Exception):
+    """Raised in this module for corrupt or unsupported archive data.
+
+    Examples in this file: multi-disk archives and malformed extra fields.
+    """
+    pass
+
+
+class LargeZipFile(Exception):
+    """
+    Raised when writing a zipfile, the zipfile requires ZIP64 extensions
+    and those extensions are disabled.
+    """
+
+error = BadZipfile = BadZipFile      # Pre-3.2 compatibility names
+
+
+ZIP64_LIMIT = (1 << 31) - 1
+ZIP_FILECOUNT_LIMIT = (1 << 16) - 1
+ZIP_MAX_COMMENT = (1 << 16) - 1
+
+# constants for Zip file compression methods
+ZIP_STORED = 0
+ZIP_DEFLATED = 8
+ZIP_BZIP2 = 12
+ZIP_LZMA = 14
+# Other ZIP compression methods not supported
+
+DEFAULT_VERSION = 20
+ZIP64_VERSION = 45
+BZIP2_VERSION = 46
+LZMA_VERSION = 63
+# we recognize (but not necessarily support) all features up to that version
+MAX_EXTRACT_VERSION = 63
+
+# Below are some formats and associated data for reading/writing headers using
+# the struct module.  The names and structures of headers/records are those used
+# in the PKWARE description of the ZIP file format:
+#     http://www.pkware.com/documents/casestudies/APPNOTE.TXT
+# (URL valid as of January 2008)
+
+# The "end of central directory" structure, magic number, size, and indices
+# (section V.I in the format document)
+structEndArchive = b"<4s4H2LH"
+stringEndArchive = b"PK\005\006"
+sizeEndCentDir = struct.calcsize(structEndArchive)
+
+_ECD_SIGNATURE = 0
+_ECD_DISK_NUMBER = 1
+_ECD_DISK_START = 2
+_ECD_ENTRIES_THIS_DISK = 3
+_ECD_ENTRIES_TOTAL = 4
+_ECD_SIZE = 5
+_ECD_OFFSET = 6
+_ECD_COMMENT_SIZE = 7
+# These last two indices are not part of the structure as defined in the
+# spec, but they are used internally by this module as a convenience
+_ECD_COMMENT = 8
+_ECD_LOCATION = 9
+
+# The "central directory" structure, magic number, size, and indices
+# of entries in the structure (section V.F in the format document)
+structCentralDir = "<4s4B4HL2L5H2L"
+stringCentralDir = b"PK\001\002"
+sizeCentralDir = struct.calcsize(structCentralDir)
+
+# indexes of entries in the central directory structure
+_CD_SIGNATURE = 0
+_CD_CREATE_VERSION = 1
+_CD_CREATE_SYSTEM = 2
+_CD_EXTRACT_VERSION = 3
+_CD_EXTRACT_SYSTEM = 4
+_CD_FLAG_BITS = 5
+_CD_COMPRESS_TYPE = 6
+_CD_TIME = 7
+_CD_DATE = 8
+_CD_CRC = 9
+_CD_COMPRESSED_SIZE = 10
+_CD_UNCOMPRESSED_SIZE = 11
+_CD_FILENAME_LENGTH = 12
+_CD_EXTRA_FIELD_LENGTH = 13
+_CD_COMMENT_LENGTH = 14
+_CD_DISK_NUMBER_START = 15
+_CD_INTERNAL_FILE_ATTRIBUTES = 16
+_CD_EXTERNAL_FILE_ATTRIBUTES = 17
+_CD_LOCAL_HEADER_OFFSET = 18
+
+# The "local file header" structure, magic number, size, and indices
+# (section V.A in the format document)
+structFileHeader = "<4s2B4HL2L2H"
+stringFileHeader = b"PK\003\004"
+sizeFileHeader = struct.calcsize(structFileHeader)
+
+_FH_SIGNATURE = 0
+_FH_EXTRACT_VERSION = 1
+_FH_EXTRACT_SYSTEM = 2
+_FH_GENERAL_PURPOSE_FLAG_BITS = 3
+_FH_COMPRESSION_METHOD = 4
+_FH_LAST_MOD_TIME = 5
+_FH_LAST_MOD_DATE = 6
+_FH_CRC = 7
+_FH_COMPRESSED_SIZE = 8
+_FH_UNCOMPRESSED_SIZE = 9
+_FH_FILENAME_LENGTH = 10
+_FH_EXTRA_FIELD_LENGTH = 11
+
+# The "Zip64 end of central directory locator" structure, magic number, and size
+structEndArchive64Locator = "<4sLQL"
+stringEndArchive64Locator = b"PK\x06\x07"
+sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator)
+
+# The "Zip64 end of central directory" record, magic number, size, and indices
+# (section V.G in the format document)
+structEndArchive64 = "<4sQ2H2L4Q"
+stringEndArchive64 = b"PK\x06\x06"
+sizeEndCentDir64 = struct.calcsize(structEndArchive64)
+
+_CD64_SIGNATURE = 0
+_CD64_DIRECTORY_RECSIZE = 1
+_CD64_CREATE_VERSION = 2
+_CD64_EXTRACT_VERSION = 3
+_CD64_DISK_NUMBER = 4
+_CD64_DISK_NUMBER_START = 5
+_CD64_NUMBER_ENTRIES_THIS_DISK = 6
+_CD64_NUMBER_ENTRIES_TOTAL = 7
+_CD64_DIRECTORY_SIZE = 8
+_CD64_OFFSET_START_CENTDIR = 9
+
+_DD_SIGNATURE = 0x08074b50
+
+_EXTRA_FIELD_STRUCT = struct.Struct('<HH')
+
+def _strip_extra(extra, xids):
+    # Remove Extra Fields with specified IDs.
+    unpack = _EXTRA_FIELD_STRUCT.unpack
+    modified = False
+    buffer = []
+    start = i = 0
+    while i + 4 <= len(extra):
+        xid, xlen = unpack(extra[i : i + 4])
+        j = i + 4 + xlen
+        if xid in xids:
+            if i != start:
+                buffer.append(extra[start : i])
+            start = j
+            modified = True
+        i = j
+    if not modified:
+        return extra
+    return b''.join(buffer)
+
+def _check_zipfile(fp):
+    try:
+        if _EndRecData(fp):
+            return True         # file has correct magic number
+    except OSError:
+        pass
+    return False
+
+def is_zipfile(filename):
+    """Quickly see if a file is a ZIP file by checking the magic number.
+
+    The filename argument may be a file or file-like object too.
+    """
+    result = False
+    try:
+        if hasattr(filename, "read"):
+            result = _check_zipfile(fp=filename)
+        else:
+            with open(filename, "rb") as fp:
+                result = _check_zipfile(fp)
+    except OSError:
+        pass
+    return result
+
+def _EndRecData64(fpin, offset, endrec):
+    """
+    Read the ZIP64 end-of-archive records and use that to update endrec
+
+    fpin is the open archive.  offset is the position of the classic
+    end-of-central-directory record measured from the end of the file (it
+    is passed to seek() with whence=2).  endrec is the mutable list built
+    by _EndRecData.
+
+    Returns endrec: updated in place when a ZIP64 record is found, and
+    unchanged when the locator or the record is missing, short, or has the
+    wrong signature.  Raises BadZipFile for multi-disk archives.
+    """
+    try:
+        # The ZIP64 locator sits immediately before the classic record.
+        fpin.seek(offset - sizeEndCentDir64Locator, 2)
+    except OSError:
+        # If the seek fails, the file is not large enough to contain a ZIP64
+        # end-of-archive record, so just return the end record we were given.
+        return endrec
+
+    data = fpin.read(sizeEndCentDir64Locator)
+    if len(data) != sizeEndCentDir64Locator:
+        return endrec
+    sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
+    if sig != stringEndArchive64Locator:
+        return endrec
+
+    if diskno != 0 or disks > 1:
+        raise BadZipFile("zipfiles that span multiple disks are not supported")
+
+    # Assume no 'zip64 extensible data'
+    # (so the ZIP64 record is located directly before the locator).
+    fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2)
+    data = fpin.read(sizeEndCentDir64)
+    if len(data) != sizeEndCentDir64:
+        return endrec
+    sig, sz, create_version, read_version, disk_num, disk_dir, \
+        dircount, dircount2, dirsize, diroffset = \
+        struct.unpack(structEndArchive64, data)
+    if sig != stringEndArchive64:
+        return endrec
+
+    # Update the original endrec using data from the ZIP64 record
+    endrec[_ECD_SIGNATURE] = sig
+    endrec[_ECD_DISK_NUMBER] = disk_num
+    endrec[_ECD_DISK_START] = disk_dir
+    endrec[_ECD_ENTRIES_THIS_DISK] = dircount
+    endrec[_ECD_ENTRIES_TOTAL] = dircount2
+    endrec[_ECD_SIZE] = dirsize
+    endrec[_ECD_OFFSET] = diroffset
+    return endrec
+
+
+def _EndRecData(fpin):
+    """Return data from the "End of Central Directory" record, or None.
+
+    The data is a list of the nine items in the ZIP "End of central dir"
+    record followed by a tenth item, the file seek offset of this record.
+
+    fpin must be a seekable binary file object.  None is returned when the
+    file is too small to seek back over the record, when no record
+    signature is found, or when the record found is truncated.  The list
+    is passed through _EndRecData64, which may overwrite entries with
+    ZIP64 values.
+    """
+
+    # Determine file size
+    fpin.seek(0, 2)
+    filesize = fpin.tell()
+
+    # Check to see if this is ZIP file with no archive comment (the
+    # "end of central directory" structure should be the last item in the
+    # file if this is the case).
+    try:
+        fpin.seek(-sizeEndCentDir, 2)
+    except OSError:
+        return None
+    data = fpin.read()
+    # The trailing two zero bytes are the comment-length field (last "H" of
+    # structEndArchive), i.e. "no comment".
+    if (len(data) == sizeEndCentDir and
+        data[0:4] == stringEndArchive and
+        data[-2:] == b"\000\000"):
+        # the signature is correct and there's no comment, unpack structure
+        endrec = struct.unpack(structEndArchive, data)
+        endrec=list(endrec)
+
+        # Append a blank comment and record start offset
+        endrec.append(b"")
+        endrec.append(filesize - sizeEndCentDir)
+
+        # Try to read the "Zip64 end of central directory" structure
+        return _EndRecData64(fpin, -sizeEndCentDir, endrec)
+
+    # Either this is not a ZIP file, or it is a ZIP file with an archive
+    # comment.  Search the end of the file for the "end of central directory"
+    # record signature. The comment is the last item in the ZIP file and may be
+    # up to 64K long.  It is assumed that the "end of central directory" magic
+    # number does not appear in the comment.
+    maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0)
+    fpin.seek(maxCommentStart, 0)
+    data = fpin.read()
+    # rfind: take the last occurrence of the signature in the tail.
+    start = data.rfind(stringEndArchive)
+    if start >= 0:
+        # found the magic number; attempt to unpack and interpret
+        recData = data[start:start+sizeEndCentDir]
+        if len(recData) != sizeEndCentDir:
+            # Zip file is corrupted.
+            return None
+        endrec = list(struct.unpack(structEndArchive, recData))
+        commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file
+        comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize]
+        endrec.append(comment)
+        endrec.append(maxCommentStart + start)
+
+        # Try to read the "Zip64 end of central directory" structure
+        # (the offset argument is negative: relative to the end of the file).
+        return _EndRecData64(fpin, maxCommentStart + start - filesize,
+                             endrec)
+
+    # Unable to find a valid end of central directory structure
+    return None
+
+
+class ZipInfo (object):
+    """Class with attributes describing each file in the ZIP archive."""
+
+    __slots__ = (
+        'orig_filename',
+        'filename',
+        'date_time',
+        'compress_type',
+        '_compresslevel',
+        'comment',
+        'extra',
+        'create_system',
+        'create_version',
+        'extract_version',
+        'reserved',
+        'flag_bits',
+        'volume',
+        'internal_attr',
+        'external_attr',
+        'header_offset',
+        'CRC',
+        'compress_size',
+        'file_size',
+        '_raw_time',
+        'has_changed_name',
+    )
+
+    def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0), has_changed_name=False):
+        self.orig_filename = filename   # Original file name in archive
+
+        # Terminate the file name at the first null byte.  Null bytes in file
+        # names are used as tricks by viruses in archives.
+        null_byte = filename.find(chr(0))
+        if null_byte >= 0:
+            filename = filename[0:null_byte]
+        # This is used to ensure paths in generated ZIP files always use
+        # forward slashes as the directory separator, as required by the
+        # ZIP format specification.
+        if os.sep != "/" and os.sep in filename:
+            filename = filename.replace(os.sep, "/")
+
+        self.filename = filename        # Normalized file name
+        self.date_time = date_time      # year, month, day, hour, min, sec
+
+        if date_time[0] < 1980:
+            raise ValueError('ZIP does not support timestamps before 1980')
+
+        # Standard values:
+        self.compress_type = ZIP_STORED # Type of compression for the file
+        self._compresslevel = None      # Level for the compressor
+        self.comment = b""              # Comment for each file
+        self.extra = b""                # ZIP extra data
+        if sys.platform == 'win32':
+            self.create_system = 0          # System which created ZIP archive
+        else:
+            # Assume everything else is unix-y
+            self.create_system = 3          # System which created ZIP archive
+        self.create_version = DEFAULT_VERSION  # Version which created ZIP archive
+        self.extract_version = DEFAULT_VERSION # Version needed to extract archive
+        self.reserved = 0               # Must be zero
+        self.flag_bits = 0              # ZIP flag bits
+        self.volume = 0                 # Volume number of file header
+        self.internal_attr = 0          # Internal attributes
+        self.external_attr = 0          # External file attributes
+        # Other attributes are set by class ZipFile:
+        # header_offset         Byte offset to the file header
+        # CRC                   CRC-32 of the uncompressed file
+        # compress_size         Size of the compressed file
+        # file_size             Size of the uncompressed file
+        self.has_changed_name = has_changed_name
+
+    def __repr__(self):
+        result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)]
+        if self.compress_type != ZIP_STORED:
+            result.append(' compress_type=%s' %
+                          compressor_names.get(self.compress_type,
+                                               self.compress_type))
+        hi = self.external_attr >> 16
+        lo = self.external_attr & 0xFFFF
+        if hi:
+            result.append(' filemode=%r' % stat.filemode(hi))
+        if lo:
+            result.append(' external_attr=%#x' % lo)
+        isdir = self.is_dir()
+        if not isdir or self.file_size:
+            result.append(' file_size=%r' % self.file_size)
+        if ((not isdir or self.compress_size) and
+            (self.compress_type != ZIP_STORED or
+             self.file_size != self.compress_size)):
+            result.append(' compress_size=%r' % self.compress_size)
+        result.append('>')
+        return ''.join(result)
+
+    def FileHeader(self, zip64=None):
+        """Return the per-file header as a bytes object."""
+        dt = self.date_time
+        dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
+        dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
+        if self.flag_bits & 0x08:
+            # Set these to zero because we write them after the file data
+            CRC = compress_size = file_size = 0
+        else:
+            CRC = self.CRC
+            compress_size = self.compress_size
+            file_size = self.file_size
+
+        extra = self.extra
+
+        min_version = 0
+        if zip64 is None:
+            zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT
+        if zip64:
+            fmt = '<HHQQ'
+            extra = extra + struct.pack(fmt,
+                                        1, struct.calcsize(fmt)-4, file_size, compress_size)
+        if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:
+            if not zip64:
+                raise LargeZipFile("Filesize would require ZIP64 extensions")
+            # File is larger than what fits into a 4 byte integer,
+            # fall back to the ZIP64 extension
+            file_size = 0xffffffff
+            compress_size = 0xffffffff
+            min_version = ZIP64_VERSION
+
+        if self.compress_type == ZIP_BZIP2:
+            min_version = max(BZIP2_VERSION, min_version)
+        elif self.compress_type == ZIP_LZMA:
+            min_version = max(LZMA_VERSION, min_version)
+
+        self.extract_version = max(min_version, self.extract_version)
+        self.create_version = max(min_version, self.create_version)
+        filename, flag_bits = self._encodeFilenameFlags()
+        header = struct.pack(structFileHeader, stringFileHeader,
+                             self.extract_version, self.reserved, flag_bits,
+                             self.compress_type, dostime, dosdate, CRC,
+                             compress_size, file_size,
+                             len(filename), len(extra))
+        return header + filename + extra
+
+    def _encodeFilenameFlags(self):
+        try:
+            return self.filename.encode('ascii'), self.flag_bits
+        except UnicodeEncodeError:
+            return self.filename.encode('utf-8'), self.flag_bits | 0x800
+
+    def _decodeExtra(self):
+        # Try to decode the extra field.
+        extra = self.extra
+        unpack = struct.unpack
+        while len(extra) >= 4:
+            tp, ln = unpack('<HH', extra[:4])
+            if ln+4 > len(extra):
+                raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln))
+            if tp == 0x0001:
+                if ln >= 24:
+                    counts = unpack('<QQQ', extra[4:28])
+                elif ln == 16:
+                    counts = unpack('<QQ', extra[4:20])
+                elif ln == 8:
+                    counts = unpack('<Q', extra[4:12])
+                elif ln == 0:
+                    counts = ()
+                else:
+                    raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln))
+
+                idx = 0
+
+                # ZIP64 extension (large files and/or large archives)
+                if self.file_size in (0xffffffffffffffff, 0xffffffff):
+                    if len(counts) <= idx:
+                        raise BadZipFile(
+                            "Corrupt zip64 extra field. File size not found."
+                        )
+                    self.file_size = counts[idx]
+                    idx += 1
+
+                if self.compress_size == 0xFFFFFFFF:
+                    if len(counts) <= idx:
+                        raise BadZipFile(
+                            "Corrupt zip64 extra field. Compress size not found."
+                        )
+                    self.compress_size = counts[idx]
+                    idx += 1
+
+                if self.header_offset == 0xffffffff:
+                    if len(counts) <= idx:
+                        raise BadZipFile(
+                            "Corrupt zip64 extra field. Header offset not found."
+                        )
+                    old = self.header_offset
+                    self.header_offset = counts[idx]
+                    idx+=1
+
+            extra = extra[ln+4:]
+
+    @classmethod
+    def from_file(cls, filename, arcname=None):
+        """Construct an appropriate ZipInfo for a file on the filesystem.
+
+        filename should be the path to a file or directory on the filesystem.
+
+        arcname is the name which it will have within the archive (by default,
+        this will be the same as filename, but without a drive letter and with
+        leading path separators removed).
+        """
+        if isinstance(filename, os.PathLike):
+            filename = os.fspath(filename)
+        st = os.stat(filename)
+        isdir = stat.S_ISDIR(st.st_mode)
+        mtime = time.localtime(st.st_mtime)
+        date_time = mtime[0:6]
+        # Create ZipInfo instance to store file information
+        if arcname is None:
+            arcname = filename
+        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
+        while arcname[0] in (os.sep, os.altsep):
+            arcname = arcname[1:]
+        if isdir:
+            arcname += '/'
+        zinfo = cls(arcname, date_time)
+        zinfo.external_attr = (st.st_mode & 0xFFFF) << 16  # Unix attributes
+        if isdir:
+            zinfo.file_size = 0
+            zinfo.external_attr |= 0x10  # MS-DOS directory flag
+        else:
+            zinfo.file_size = st.st_size
+
+        return zinfo
+
+    def is_dir(self):
+        """Return True if this archive member is a directory."""
+        return self.filename[-1] == '/'
+
+
+# ZIP encryption uses the CRC32 one-byte primitive for scrambling some
+# internal keys. We noticed that a direct implementation is faster than
+# relying on binascii.crc32().
+
+_crctable = None
+def _gen_crc(crc):
+    for j in range(8):
+        if crc & 1:
+            crc = (crc >> 1) ^ 0xEDB88320
+        else:
+            crc >>= 1
+    return crc
+
+# ZIP supports a password-based form of encryption. Even though known
+# plaintext attacks have been found against it, it is still useful
+# to be able to get data out of such a file.
+#
+# Usage:
+#     zd = _ZipDecrypter(mypwd)
+#     plain_bytes = zd(cypher_bytes)
+
+def _ZipDecrypter(pwd):
+    key0 = 305419896
+    key1 = 591751049
+    key2 = 878082192
+
+    global _crctable
+    if _crctable is None:
+        _crctable = list(map(_gen_crc, range(256)))
+    crctable = _crctable
+
+    def crc32(ch, crc):
+        """Compute the CRC32 primitive on one byte."""
+        return (crc >> 8) ^ crctable[(crc ^ ch) & 0xFF]
+
+    def update_keys(c):
+        nonlocal key0, key1, key2
+        key0 = crc32(c, key0)
+        key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF
+        key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF
+        key2 = crc32(key1 >> 24, key2)
+
+    for p in pwd:
+        update_keys(p)
+
+    def decrypter(data):
+        """Decrypt a bytes object."""
+        result = bytearray()
+        append = result.append
+        for c in data:
+            k = key2 | 2
+            c ^= ((k * (k^1)) >> 8) & 0xFF
+            update_keys(c)
+            append(c)
+        return bytes(result)
+
+    return decrypter
+
+
+class LZMACompressor:
+
+    def __init__(self):
+        self._comp = None
+
+    def _init(self):
+        props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1})
+        self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[
+            lzma._decode_filter_properties(lzma.FILTER_LZMA1, props)
+        ])
+        return struct.pack('<BBH', 9, 4, len(props)) + props
+
+    def compress(self, data):
+        if self._comp is None:
+            return self._init() + self._comp.compress(data)
+        return self._comp.compress(data)
+
+    def flush(self):
+        if self._comp is None:
+            return self._init() + self._comp.flush()
+        return self._comp.flush()
+
+
+class LZMADecompressor:
+
+    def __init__(self):
+        self._decomp = None
+        self._unconsumed = b''
+        self.eof = False
+
+    def decompress(self, data):
+        if self._decomp is None:
+            self._unconsumed += data
+            if len(self._unconsumed) <= 4:
+                return b''
+            psize, = struct.unpack('<H', self._unconsumed[2:4])
+            if len(self._unconsumed) <= 4 + psize:
+                return b''
+
+            self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[
+                lzma._decode_filter_properties(lzma.FILTER_LZMA1,
+                                               self._unconsumed[4:4 + psize])
+            ])
+            data = self._unconsumed[4 + psize:]
+            del self._unconsumed
+
+        result = self._decomp.decompress(data)
+        self.eof = self._decomp.eof
+        return result
+
+
+compressor_names = {
+    0: 'store',
+    1: 'shrink',
+    2: 'reduce',
+    3: 'reduce',
+    4: 'reduce',
+    5: 'reduce',
+    6: 'implode',
+    7: 'tokenize',
+    8: 'deflate',
+    9: 'deflate64',
+    10: 'implode',
+    12: 'bzip2',
+    14: 'lzma',
+    18: 'terse',
+    19: 'lz77',
+    97: 'wavpack',
+    98: 'ppmd',
+}
+
+def _check_compression(compression):
+    if compression == ZIP_STORED:
+        pass
+    elif compression == ZIP_DEFLATED:
+        if not zlib:
+            raise RuntimeError(
+                "Compression requires the (missing) zlib module")
+    elif compression == ZIP_BZIP2:
+        if not bz2:
+            raise RuntimeError(
+                "Compression requires the (missing) bz2 module")
+    elif compression == ZIP_LZMA:
+        if not lzma:
+            raise RuntimeError(
+                "Compression requires the (missing) lzma module")
+    else:
+        raise NotImplementedError("That compression method is not supported")
+
+
+def _get_compressor(compress_type, compresslevel=None):
+    if compress_type == ZIP_DEFLATED:
+        if compresslevel is not None:
+            return zlib.compressobj(compresslevel, zlib.DEFLATED, -15)
+        return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15)
+    elif compress_type == ZIP_BZIP2:
+        if compresslevel is not None:
+            return bz2.BZ2Compressor(compresslevel)
+        return bz2.BZ2Compressor()
+    # compresslevel is ignored for ZIP_LZMA
+    elif compress_type == ZIP_LZMA:
+        return LZMACompressor()
+    else:
+        return None
+
+
+def _get_decompressor(compress_type):
+    """Return a fresh decompressor object for *compress_type*.
+
+    Returns None for ZIP_STORED. Raises NotImplementedError for any method
+    other than the four handled here; the message includes the method's
+    name when compressor_names has an entry for it.
+    """
+    if compress_type == ZIP_STORED:
+        return None
+    if compress_type == ZIP_DEFLATED:
+        return zlib.decompressobj(-15)
+    if compress_type == ZIP_BZIP2:
+        return bz2.BZ2Decompressor()
+    if compress_type == ZIP_LZMA:
+        return LZMADecompressor()
+
+    method_name = compressor_names.get(compress_type)
+    if not method_name:
+        raise NotImplementedError("compression type %d" % (compress_type,))
+    raise NotImplementedError("compression type %d (%s)" % (compress_type, method_name))
+
+
+class _SharedFile:
+    """Reader view onto a file object that several users share.
+
+    Each instance tracks its own position and, while holding the shared
+    lock, seeks the underlying file there before every read. Reading and
+    seeking are refused while the ``writing`` callable reports True.
+    """
+
+    def __init__(self, file, pos, close, lock, writing):
+        self._file, self._pos = file, pos
+        self._close, self._lock = close, lock
+        self._writing = writing
+        # Expose the underlying file's own bound methods directly.
+        self.seekable = file.seekable
+        self.tell = file.tell
+
+    def seek(self, offset, whence=0):
+        """Seek the underlying file and return the resulting position."""
+        with self._lock:
+            if self._writing():
+                raise ValueError("Can't reposition in the ZIP file while "
+                        "there is an open writing handle on it. "
+                        "Close the writing handle before trying to read.")
+            shared = self._file
+            shared.seek(offset, whence)
+            self._pos = shared.tell()
+            return self._pos
+
+    def read(self, n=-1):
+        """Read up to n bytes starting at this view's own position."""
+        with self._lock:
+            if self._writing():
+                raise ValueError("Can't read from the ZIP file while there "
+                        "is an open writing handle on it. "
+                        "Close the writing handle before trying to read.")
+            shared = self._file
+            shared.seek(self._pos)
+            chunk = shared.read(n)
+            self._pos = shared.tell()
+            return chunk
+
+    def close(self):
+        """Hand the file to the close callback once; later calls do nothing."""
+        fileobj = self._file
+        if fileobj is None:
+            return
+        self._file = None
+        self._close(fileobj)
+
+# Provide the tell method for unseekable stream
+class _Tellable:
+    """Wrap a stream and report a position via tell().
+
+    tell() returns the running total of the counts returned by the wrapped
+    stream's write(); flush() and close() are forwarded unchanged.
+    """
+
+    def __init__(self, fp):
+        self.offset = 0
+        self.fp = fp
+
+    def write(self, data):
+        written = self.fp.write(data)
+        self.offset += written
+        return written
+
+    def tell(self):
+        return self.offset
+
+    def flush(self):
+        self.fp.flush()
+
+    def close(self):
+        self.fp.close()
+
+
+class ZipExtFile(io.BufferedIOBase):
+    """File-like object for reading an archive member.
+       Is returned by ZipFile.open().
+    """
+
+    # Max size supported by decompressor.
+    # The parentheses are required: "-" binds tighter than "<<", so the
+    # unparenthesized "1 << 31 - 1" evaluates to 1 << 30 instead of 2**31 - 1.
+    MAX_N = (1 << 31) - 1
+
+    # Read from compressed files in 4k blocks.
+    MIN_READ_SIZE = 4096
+
+    # Chunk size to read during seek
+    MAX_SEEK_READ = 1 << 24
+
+    def __init__(self, fileobj, mode, zipinfo, pwd=None,
+                 close_fileobj=False):
+        """Set up a reader over one member's data.
+
+        fileobj is read from its current position onwards; zipinfo supplies
+        the compression method, the compressed and uncompressed sizes and,
+        when present, the expected CRC. When pwd is given, the 12-byte
+        encryption header is consumed and checked, and RuntimeError is
+        raised on a mismatch. close_fileobj makes close() also close fileobj.
+        """
+        self._fileobj = fileobj
+        self._pwd = pwd
+        self._close_fileobj = close_fileobj
+
+        self._compress_type = zipinfo.compress_type
+        self._compress_left = zipinfo.compress_size
+        self._left = zipinfo.file_size
+
+        self._decompressor = _get_decompressor(self._compress_type)
+
+        self._eof = False
+        self._readbuffer = b''
+        self._offset = 0
+
+        self.newlines = None
+
+        self.mode = mode
+        self.name = zipinfo.filename
+
+        if hasattr(zipinfo, 'CRC'):
+            self._expected_crc = zipinfo.CRC
+            self._running_crc = crc32(b'')
+        else:
+            self._expected_crc = None
+
+        self._seekable = False
+        try:
+            if fileobj.seekable():
+                self._orig_compress_start = fileobj.tell()
+                self._orig_compress_size = zipinfo.compress_size
+                self._orig_file_size = zipinfo.file_size
+                # NOTE: when zipinfo has no CRC, _running_crc was never set,
+                # so this line raises AttributeError; the handler below
+                # swallows it and the object stays non-seekable.
+                self._orig_start_crc = self._running_crc
+                self._seekable = True
+        except AttributeError:
+            pass
+
+        self._decrypter = None
+        if pwd:
+            if zipinfo.flag_bits & 0x8:
+                # compare against the file type from extended local headers
+                check_byte = (zipinfo._raw_time >> 8) & 0xff
+            else:
+                # compare against the CRC otherwise
+                check_byte = (zipinfo.CRC >> 24) & 0xff
+            h = self._init_decrypter()
+            if h != check_byte:
+                raise RuntimeError("Bad password for file %r" % zipinfo.orig_filename)
+
+
+    def _init_decrypter(self):
+        """Create the decrypter, consume the 12-byte encryption header and
+        return its last decrypted byte (the password check byte)."""
+        self._decrypter = _ZipDecrypter(self._pwd)
+        # The first 12 bytes in the cypher stream is an encryption header
+        #  used to strengthen the algorithm. The first 11 bytes are
+        #  completely random, while the 12th contains the MSB of the CRC,
+        #  or the MSB of the file time depending on the header type
+        #  and is used to check the correctness of the password.
+        header = self._fileobj.read(12)
+        self._compress_left -= 12
+        return self._decrypter(header)[11]
+
+    def __repr__(self):
+        result = ['<%s.%s' % (self.__class__.__module__,
+                              self.__class__.__qualname__)]
+        if not self.closed:
+            result.append(' name=%r mode=%r' % (self.name, self.mode))
+            if self._compress_type != ZIP_STORED:
+                result.append(' compress_type=%s' %
+                              compressor_names.get(self._compress_type,
+                                                   self._compress_type))
+        else:
+            result.append(' [closed]')
+        result.append('>')
+        return ''.join(result)
+
+    def readline(self, limit=-1):
+        """Read and return a line from the stream.
+
+        If limit is specified, at most limit bytes will be read.
+        """
+
+        if limit < 0:
+            # Shortcut common case - newline found in buffer.
+            i = self._readbuffer.find(b'\n', self._offset) + 1
+            if i > 0:
+                line = self._readbuffer[self._offset: i]
+                self._offset = i
+                return line
+
+        return io.BufferedIOBase.readline(self, limit)
+
+    def peek(self, n=1):
+        """Returns buffered bytes without advancing the position."""
+        if n > len(self._readbuffer) - self._offset:
+            chunk = self.read(n)
+            # Put the bytes just read back in front of the buffered data so
+            # the logical position is unchanged.
+            if len(chunk) > self._offset:
+                self._readbuffer = chunk + self._readbuffer[self._offset:]
+                self._offset = 0
+            else:
+                self._offset -= len(chunk)
+
+        # Return up to 512 bytes to reduce allocation overhead for tight loops.
+        return self._readbuffer[self._offset: self._offset + 512]
+
+    def readable(self):
+        return True
+
+    def read(self, n=-1):
+        """Read and return up to n bytes.
+        If the argument is omitted, None, or negative, data is read and returned until EOF is reached.
+        """
+        if n is None or n < 0:
+            buf = self._readbuffer[self._offset:]
+            self._readbuffer = b''
+            self._offset = 0
+            while not self._eof:
+                buf += self._read1(self.MAX_N)
+            return buf
+
+        end = n + self._offset
+        if end < len(self._readbuffer):
+            buf = self._readbuffer[self._offset:end]
+            self._offset = end
+            return buf
+
+        n = end - len(self._readbuffer)
+        buf = self._readbuffer[self._offset:]
+        self._readbuffer = b''
+        self._offset = 0
+        while n > 0 and not self._eof:
+            data = self._read1(n)
+            if n < len(data):
+                # More was produced than requested: keep the surplus buffered.
+                self._readbuffer = data
+                self._offset = n
+                buf += data[:n]
+                break
+            buf += data
+            n -= len(data)
+        return buf
+
+    def _update_crc(self, newdata):
+        # Update the CRC using the given data.
+        if self._expected_crc is None:
+            # No need to compute the CRC if we don't have a reference value
+            return
+        self._running_crc = crc32(newdata, self._running_crc)
+        # Check the CRC if we're at the end of the file
+        if self._eof and self._running_crc != self._expected_crc:
+            raise BadZipFile("Bad CRC-32 for file %r" % self.name)
+
+    def read1(self, n):
+        """Read up to n bytes with at most one read() system call."""
+
+        if n is None or n < 0:
+            buf = self._readbuffer[self._offset:]
+            self._readbuffer = b''
+            self._offset = 0
+            while not self._eof:
+                data = self._read1(self.MAX_N)
+                if data:
+                    buf += data
+                    break
+            return buf
+
+        end = n + self._offset
+        if end < len(self._readbuffer):
+            buf = self._readbuffer[self._offset:end]
+            self._offset = end
+            return buf
+
+        n = end - len(self._readbuffer)
+        buf = self._readbuffer[self._offset:]
+        self._readbuffer = b''
+        self._offset = 0
+        if n > 0:
+            while not self._eof:
+                data = self._read1(n)
+                if n < len(data):
+                    self._readbuffer = data
+                    self._offset = n
+                    buf += data[:n]
+                    break
+                if data:
+                    buf += data
+                    break
+        return buf
+
+    def _read1(self, n):
+        # Read up to n compressed bytes with at most one read() system call,
+        # decrypt and decompress them.
+        if self._eof or n <= 0:
+            return b''
+
+        # Read from file.
+        if self._compress_type == ZIP_DEFLATED:
+            ## Handle unconsumed data.
+            data = self._decompressor.unconsumed_tail
+            if n > len(data):
+                data += self._read2(n - len(data))
+        else:
+            data = self._read2(n)
+
+        if self._compress_type == ZIP_STORED:
+            self._eof = self._compress_left <= 0
+        elif self._compress_type == ZIP_DEFLATED:
+            n = max(n, self.MIN_READ_SIZE)
+            data = self._decompressor.decompress(data, n)
+            self._eof = (self._decompressor.eof or
+                         self._compress_left <= 0 and
+                         not self._decompressor.unconsumed_tail)
+            if self._eof:
+                data += self._decompressor.flush()
+        else:
+            data = self._decompressor.decompress(data)
+            self._eof = self._decompressor.eof or self._compress_left <= 0
+
+        # Never hand out more than the declared uncompressed size.
+        data = data[:self._left]
+        self._left -= len(data)
+        if self._left <= 0:
+            self._eof = True
+        self._update_crc(data)
+        return data
+
+    def _read2(self, n):
+        """Read between MIN_READ_SIZE and the remaining compressed size from
+        the file, decrypting when a decrypter is set.
+
+        Returns b'' when no compressed data is left; raises EOFError when the
+        file yields nothing although compressed data is still expected.
+        """
+        if self._compress_left <= 0:
+            return b''
+
+        n = max(n, self.MIN_READ_SIZE)
+        n = min(n, self._compress_left)
+
+        data = self._fileobj.read(n)
+        self._compress_left -= len(data)
+        if not data:
+            raise EOFError
+
+        if self._decrypter is not None:
+            data = self._decrypter(data)
+        return data
+
+    def close(self):
+        """Close this reader, closing the underlying file first when
+        close_fileobj was requested."""
+        try:
+            if self._close_fileobj:
+                self._fileobj.close()
+        finally:
+            super().close()
+
+    def seekable(self):
+        return self._seekable
+
+    def seek(self, offset, whence=0):
+        """Move to a position in the decompressed data and return it.
+
+        The target is clamped to [0, uncompressed size]. A target inside the
+        current read buffer only moves the buffer index. A target before
+        the current position restarts decompression from the member's start,
+        and any remaining distance is covered by reading and discarding.
+
+        Raises io.UnsupportedOperation when the underlying stream is not
+        seekable and ValueError for an unknown whence.
+        """
+        if not self._seekable:
+            raise io.UnsupportedOperation("underlying stream is not seekable")
+        curr_pos = self.tell()
+        if whence == 0: # Seek from start of file
+            new_pos = offset
+        elif whence == 1: # Seek from current position
+            new_pos = curr_pos + offset
+        elif whence == 2: # Seek from EOF
+            new_pos = self._orig_file_size + offset
+        else:
+            raise ValueError("whence must be os.SEEK_SET (0), "
+                             "os.SEEK_CUR (1), or os.SEEK_END (2)")
+
+        if new_pos > self._orig_file_size:
+            new_pos = self._orig_file_size
+
+        if new_pos < 0:
+            new_pos = 0
+
+        read_offset = new_pos - curr_pos
+        buff_offset = read_offset + self._offset
+
+        if buff_offset >= 0 and buff_offset < len(self._readbuffer):
+            # Just move the _offset index if the new position is in the _readbuffer
+            self._offset = buff_offset
+            read_offset = 0
+        elif read_offset < 0:
+            # Position is before the current position. Reset the ZipExtFile
+            self._fileobj.seek(self._orig_compress_start)
+            self._running_crc = self._orig_start_crc
+            self._compress_left = self._orig_compress_size
+            self._left = self._orig_file_size
+            self._readbuffer = b''
+            self._offset = 0
+            self._decompressor = _get_decompressor(self._compress_type)
+            self._eof = False
+            read_offset = new_pos
+            if self._decrypter is not None:
+                self._init_decrypter()
+
+        while read_offset > 0:
+            read_len = min(self.MAX_SEEK_READ, read_offset)
+            self.read(read_len)
+            read_offset -= read_len
+
+        return self.tell()
+
+    def tell(self):
+        """Return the current position within the decompressed data."""
+        if not self._seekable:
+            raise io.UnsupportedOperation("underlying stream is not seekable")
+        filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset
+        return filepos
+
+
+class _ZipWriteFile(io.BufferedIOBase):
+    """Writable file-like object for a single archive member.
+
+    Data passed to write() is CRC'd, optionally compressed and written to
+    the owning ZipFile's file object. close() flushes the compressor, writes
+    the final sizes and CRC, and registers the member with the ZipFile.
+    """
+
+    def __init__(self, zf, zinfo, zip64):
+        self._zinfo = zinfo
+        self._zip64 = zip64
+        self._zipfile = zf
+        # None when the member's compress_type needs no compressor.
+        self._compressor = _get_compressor(zinfo.compress_type,
+                                           zinfo._compresslevel)
+        self._file_size = 0
+        self._compress_size = 0
+        self._crc = 0
+
+    @property
+    def _fileobj(self):
+        # Always go through the ZipFile so the current fp is used.
+        return self._zipfile.fp
+
+    def writable(self):
+        return True
+
+    def write(self, data):
+        """Write data to the member and return the number of input bytes.
+
+        Raises ValueError when this object is already closed.
+        """
+        if self.closed:
+            raise ValueError('I/O operation on closed file.')
+        nbytes = len(data)
+        self._file_size += nbytes
+        # The CRC covers the uncompressed data.
+        self._crc = crc32(data, self._crc)
+        if self._compressor:
+            data = self._compressor.compress(data)
+            self._compress_size += len(data)
+        self._fileobj.write(data)
+        return nbytes
+
+    def close(self):
+        """Finish the member: flush, write sizes/CRC, register it.
+
+        Does nothing when already closed. The ZipFile's _writing flag is
+        cleared even when finishing fails.
+
+        Raises RuntimeError when zip64 was not enabled for this member but
+        a size ended up above ZIP64_LIMIT (only checked on the path that
+        rewrites the local header).
+        """
+        if self.closed:
+            return
+        try:
+            super().close()
+            # Flush any data from the compressor, and update header info
+            if self._compressor:
+                buf = self._compressor.flush()
+                self._compress_size += len(buf)
+                self._fileobj.write(buf)
+                self._zinfo.compress_size = self._compress_size
+            else:
+                self._zinfo.compress_size = self._file_size
+            self._zinfo.CRC = self._crc
+            self._zinfo.file_size = self._file_size
+
+            # Write updated header info
+            if self._zinfo.flag_bits & 0x08:
+                # Write CRC and file sizes after the file data
+                fmt = '<LLQQ' if self._zip64 else '<LLLL'
+                self._fileobj.write(struct.pack(fmt, _DD_SIGNATURE, self._zinfo.CRC,
+                    self._zinfo.compress_size, self._zinfo.file_size))
+                self._zipfile.start_dir = self._fileobj.tell()
+            else:
+                if not self._zip64:
+                    if self._file_size > ZIP64_LIMIT:
+                        raise RuntimeError(
+                            'File size unexpectedly exceeded ZIP64 limit')
+                    if self._compress_size > ZIP64_LIMIT:
+                        raise RuntimeError(
+                            'Compressed size unexpectedly exceeded ZIP64 limit')
+                # Seek backwards and write file header (which will now include
+                # correct CRC and file sizes)
+
+                # Preserve current position in file
+                self._zipfile.start_dir = self._fileobj.tell()
+                self._fileobj.seek(self._zinfo.header_offset)
+                self._fileobj.write(self._zinfo.FileHeader(self._zip64))
+                self._fileobj.seek(self._zipfile.start_dir)
+
+            # Successfully written: Add file to our caches
+            self._zipfile.filelist.append(self._zinfo)
+            self._zipfile.NameToInfo[self._zinfo.filename] = self._zinfo
+        finally:
+            self._zipfile._writing = False
+
+
+
+class ZipFile:
+    """ Class with methods to open, read, write, close, list zip files.
+
+    z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True,
+                compresslevel=None)
+
+    file: Either the path to the file, or a file-like object.
+          If it is a path, the file will be opened and closed by ZipFile.
+    mode: The mode can be either read 'r', write 'w', exclusive create 'x',
+          or append 'a'.
+    compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib),
+                 ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma).
+    allowZip64: if True ZipFile will create files with ZIP64 extensions when
+                needed, otherwise it will raise an exception when this would
+                be necessary.
+    compresslevel: None (default for the given compression type) or an integer
+                   specifying the level to pass to the compressor.
+                   When using ZIP_STORED or ZIP_LZMA this keyword has no effect.
+                   When using ZIP_DEFLATED integers 0 through 9 are accepted.
+                   When using ZIP_BZIP2 integers 1 through 9 are accepted.
+
+    """
+
+    fp = None                   # Set here since __del__ checks it
+    _windows_illegal_name_trans_table = None
+
+    def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
+                 compresslevel=None, has_changed_name=False):
+        """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
+        or append 'a'.
+
+        has_changed_name is stored on the instance and forwarded to every
+        ZipInfo created while reading the central directory; open() skips
+        its directory-vs-header file name check for members whose ZipInfo
+        has it set.
+
+        Raises ValueError for an unknown mode, and whatever
+        _check_compression() raises for an unusable compression method.
+        """
+        if mode not in ('r', 'w', 'x', 'a'):
+            raise ValueError("ZipFile requires mode 'r', 'w', 'x', or 'a'")
+
+        _check_compression(compression)
+
+        self._allowZip64 = allowZip64
+        self._didModify = False
+        self.debug = 0  # Level of printing: 0 through 3
+        self.NameToInfo = {}    # Find file info given name
+        self.filelist = []      # List of ZipInfo instances for archive
+        self.compression = compression  # Method of compression
+        self.compresslevel = compresslevel
+        self.mode = mode
+        self.pwd = None
+        self._comment = b''
+        self.has_changed_name = has_changed_name
+
+        # Check if we were passed a file-like object
+        if isinstance(file, os.PathLike):
+            file = os.fspath(file)
+        if isinstance(file, str):
+            # No, it's a filename
+            self._filePassed = 0
+            self.filename = file
+            # Maps the ZipFile mode to the first file mode to try, and each
+            # file mode to the fallback tried when opening with it raises
+            # OSError ('r+b' -> 'w+b' -> 'wb', 'x+b' -> 'xb').
+            modeDict = {'r' : 'rb', 'w': 'w+b', 'x': 'x+b', 'a' : 'r+b',
+                        'r+b': 'w+b', 'w+b': 'wb', 'x+b': 'xb'}
+            filemode = modeDict[mode]
+            while True:
+                try:
+                    self.fp = io.open(file, filemode)
+                except OSError:
+                    if filemode in modeDict:
+                        filemode = modeDict[filemode]
+                        continue
+                    # No fallback left: propagate the original error.
+                    raise
+                break
+        else:
+            self._filePassed = 1
+            self.fp = file
+            self.filename = getattr(file, 'name', None)
+        self._fileRefCnt = 1
+        self._lock = threading.RLock()
+        self._seekable = True
+        self._writing = False
+
+        try:
+            if mode == 'r':
+                self._RealGetContents()
+            elif mode in ('w', 'x'):
+                # set the modified flag so central directory gets written
+                # even if no files are added to the archive
+                self._didModify = True
+                try:
+                    self.start_dir = self.fp.tell()
+                except (AttributeError, OSError):
+                    # No usable tell(): wrap the stream so positions can be
+                    # tracked by counting written bytes.
+                    self.fp = _Tellable(self.fp)
+                    self.start_dir = 0
+                    self._seekable = False
+                else:
+                    # Some file-like objects can provide tell() but not seek()
+                    try:
+                        self.fp.seek(self.start_dir)
+                    except (AttributeError, OSError):
+                        self._seekable = False
+            elif mode == 'a':
+                try:
+                    # See if file is a zip file
+                    self._RealGetContents()
+                    # seek to start of directory and overwrite
+                    self.fp.seek(self.start_dir)
+                except BadZipFile:
+                    # file is not a zip file, just append
+                    self.fp.seek(0, 2)
+
+                    # set the modified flag so central directory gets written
+                    # even if no files are added to the archive
+                    self._didModify = True
+                    self.start_dir = self.fp.tell()
+            else:
+                raise ValueError("Mode must be 'r', 'w', 'x', or 'a'")
+        except:
+            # Release the file on any failure, then re-raise unchanged.
+            fp = self.fp
+            self.fp = None
+            self._fpclose(fp)
+            raise
+
+    def __enter__(self):
+        """Return this ZipFile itself for use in a ``with`` statement."""
+        return self
+
+    def __exit__(self, type, value, traceback):
+        """Close the archive on leaving the ``with`` block.
+
+        Returns None, so an exception raised inside the block propagates.
+        """
+        self.close()
+
+    def __repr__(self):
+        """Describe the archive: its file or filename and mode while open,
+        or '[closed]' once fp is None."""
+        cls = self.__class__
+        parts = ['<%s.%s' % (cls.__module__,
+                             cls.__qualname__)]
+        if self.fp is None:
+            parts.append(' [closed]')
+        else:
+            if self._filePassed:
+                parts.append(' file=%r' % self.fp)
+            elif self.filename is not None:
+                parts.append(' filename=%r' % self.filename)
+            parts.append(' mode=%r' % self.mode)
+        parts.append('>')
+        return ''.join(parts)
+
+    def _RealGetContents(self):
+        """Read in the table of contents for the ZIP file.
+
+        Fills self.filelist and self.NameToInfo with one ZipInfo per central
+        directory entry and sets self.start_dir and self._comment.
+
+        Raises BadZipFile when the end-of-central-directory record cannot be
+        read or a directory entry is truncated or has a bad signature, and
+        NotImplementedError when an entry needs a newer extract version than
+        MAX_EXTRACT_VERSION.
+        """
+        fp = self.fp
+        try:
+            endrec = _EndRecData(fp)
+        except OSError:
+            raise BadZipFile("File is not a zip file")
+        if not endrec:
+            raise BadZipFile("File is not a zip file")
+        if self.debug > 1:
+            print(endrec)
+        size_cd = endrec[_ECD_SIZE]             # bytes in central directory
+        offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
+        self._comment = endrec[_ECD_COMMENT]    # archive comment
+
+        # "concat" is zero, unless zip was concatenated to another file
+        concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
+        if endrec[_ECD_SIGNATURE] == stringEndArchive64:
+            # If Zip64 extension structures are present, account for them
+            concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
+
+        if self.debug > 2:
+            inferred = concat + offset_cd
+            print("given, inferred, offset", offset_cd, inferred, concat)
+        # self.start_dir:  Position of start of central directory
+        self.start_dir = offset_cd + concat
+        fp.seek(self.start_dir, 0)
+        data = fp.read(size_cd)
+        # Parse the directory from memory; fp now refers to this buffer.
+        fp = io.BytesIO(data)
+        total = 0
+        while total < size_cd:
+            centdir = fp.read(sizeCentralDir)
+            if len(centdir) != sizeCentralDir:
+                raise BadZipFile("Truncated central directory")
+            centdir = struct.unpack(structCentralDir, centdir)
+            if centdir[_CD_SIGNATURE] != stringCentralDir:
+                raise BadZipFile("Bad magic number for central directory")
+            if self.debug > 2:
+                print(centdir)
+            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
+            # Index 5 of the unpacked record is assigned to x.flag_bits below.
+            flags = centdir[5]
+            if flags & 0x800:
+                # UTF-8 file names extension
+                filename = filename.decode('utf-8')
+            else:
+                # Historical ZIP filename encoding
+                filename = filename.decode('cp437')
+                # filename = filename.decode('utf-8')
+            # Create ZipInfo instance to store file information
+            x = ZipInfo(filename, has_changed_name=self.has_changed_name)
+            x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
+            x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
+            x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
+            (x.create_version, x.create_system, x.extract_version, x.reserved,
+             x.flag_bits, x.compress_type, t, d,
+             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
+            if x.extract_version > MAX_EXTRACT_VERSION:
+                raise NotImplementedError("zip file version %.1f" %
+                                          (x.extract_version / 10))
+            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
+            # Convert date/time code to (year, month, day, hour, min, sec)
+            x._raw_time = t
+            x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
+                            t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
+
+            x._decodeExtra()
+            # Shift by "concat" so the offset is relative to the actual file.
+            x.header_offset = x.header_offset + concat
+            self.filelist.append(x)
+            self.NameToInfo[x.filename] = x
+
+            # update total bytes read from central directory
+            total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
+                     + centdir[_CD_EXTRA_FIELD_LENGTH]
+                     + centdir[_CD_COMMENT_LENGTH])
+
+            if self.debug > 2:
+                print("total", total)
+
+
+    def namelist(self):
+        """Return a new list with the filename of every archive member,
+        in filelist order."""
+        members = self.filelist
+        return [member.filename for member in members]
+
+    def infolist(self):
+        """Return a list of class ZipInfo instances for files in the
+        archive.
+
+        The returned object is the archive's own ``filelist``, not a copy.
+        """
+        return self.filelist
+
+    def printdir(self, file=None):
+        """Print a header line and one line per member (name, modification
+        time, uncompressed size) to *file*, as accepted by print()."""
+        heading = "%-46s %19s %12s" % ("File Name", "Modified    ", "Size")
+        print(heading, file=file)
+        for member in self.filelist:
+            stamp = "%d-%02d-%02d %02d:%02d:%02d" % member.date_time[:6]
+            row = "%-46s %s %12d" % (member.filename, stamp, member.file_size)
+            print(row, file=file)
+
+    def testzip(self):
+        """Read every member in full so its CRC gets checked.
+
+        Returns the filename of the first member whose read raises
+        BadZipFile, or None when all members read cleanly.
+        """
+        chunk_size = 2 ** 20
+        for member in self.filelist:
+            member_name = member.filename
+            try:
+                # Read by chunks, to avoid an OverflowError or a
+                # MemoryError with very large embedded files.
+                with self.open(member_name, "r") as stream:
+                    while stream.read(chunk_size):     # Check CRC-32
+                        pass
+            except BadZipFile:
+                return member_name
+        return None
+
+    def getinfo(self, name):
+        """Return the ZipInfo stored for *name*.
+
+        Raises KeyError when the archive has no member with that name.
+        """
+        found = self.NameToInfo.get(name)
+        if found is not None:
+            return found
+        raise KeyError(
+            'There is no item named %r in the archive' % name)
+
+    def setpassword(self, pwd):
+        """Set the default password used for encrypted members.
+
+        Any falsy value clears the password. Raises TypeError when a
+        non-empty pwd is not bytes.
+        """
+        if pwd and not isinstance(pwd, bytes):
+            raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
+        self.pwd = pwd if pwd else None
+
+    @property
+    def comment(self):
+        """The archive comment, as bytes."""
+        return self._comment
+
+    @comment.setter
+    def comment(self, comment):
+        # Setting the comment marks the archive as modified. A comment longer
+        # than ZIP_MAX_COMMENT is truncated with a warning, not rejected.
+        if not isinstance(comment, bytes):
+            raise TypeError("comment: expected bytes, got %s" % type(comment).__name__)
+        too_long = len(comment) > ZIP_MAX_COMMENT
+        if too_long:
+            import warnings
+            warnings.warn('Archive comment is too long; truncating to %d bytes'
+                          % ZIP_MAX_COMMENT, stacklevel=2)
+            comment = comment[:ZIP_MAX_COMMENT]
+        self._didModify = True
+        self._comment = comment
+
+    def read(self, name, pwd=None):
+        """Return the complete contents of member *name* as bytes.
+
+        name and pwd are passed to open() unchanged.
+        """
+        with self.open(name, "r", pwd) as member:
+            payload = member.read()
+        return payload
+
+    def open(self, name, mode="r", pwd=None, *, force_zip64=False):
+        """Return file-like object for 'name'.
+
+        name is a string for the file name within the ZIP file, or a ZipInfo
+        object.
+
+        mode should be 'r' to read a file already in the ZIP file, or 'w' to
+        write to a file newly added to the archive.
+
+        pwd is the password to decrypt files (only used for reading).
+
+        When writing, if the file size is not known in advance but may exceed
+        2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large
+        files.  If the size is known in advance, it is best to pass a ZipInfo
+        instance for name, with zinfo.file_size set.
+
+        When reading, the file name stored in the member's local header is
+        compared with the central directory name unless the member's ZipInfo
+        has has_changed_name set.
+
+        Raises ValueError for a bad mode, for pwd combined with mode 'w', for
+        a closed archive, or while a write handle is open; TypeError when pwd
+        is not bytes; BadZipFile for a truncated or inconsistent local
+        header; NotImplementedError for patched data or strong encryption;
+        RuntimeError when an encrypted member has no password available.
+        """
+        if mode not in {"r", "w"}:
+            raise ValueError('open() requires mode "r" or "w"')
+        if pwd and not isinstance(pwd, bytes):
+            raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
+        if pwd and (mode == "w"):
+            raise ValueError("pwd is only supported for reading files")
+        if not self.fp:
+            raise ValueError(
+                "Attempt to use ZIP archive that was already closed")
+
+        # Make sure we have an info object
+        if isinstance(name, ZipInfo):
+            # 'name' is already an info object
+            zinfo = name
+        elif mode == 'w':
+            zinfo = ZipInfo(name)
+            zinfo.compress_type = self.compression
+            zinfo._compresslevel = self.compresslevel
+        else:
+            # Get info object for name
+            zinfo = self.getinfo(name)
+
+        if mode == 'w':
+            return self._open_to_write(zinfo, force_zip64=force_zip64)
+
+        if self._writing:
+            raise ValueError("Can't read from the ZIP file while there "
+                    "is an open writing handle on it. "
+                    "Close the writing handle before trying to read.")
+
+        # Open for reading:
+        self._fileRefCnt += 1
+        zef_file = _SharedFile(self.fp, zinfo.header_offset,
+                               self._fpclose, self._lock, lambda: self._writing)
+        try:
+            # Skip the file header:
+            fheader = zef_file.read(sizeFileHeader)
+            if len(fheader) != sizeFileHeader:
+                raise BadZipFile("Truncated file header")
+            fheader = struct.unpack(structFileHeader, fheader)
+            if fheader[_FH_SIGNATURE] != stringFileHeader:
+                raise BadZipFile("Bad magic number for file header")
+
+            fname = zef_file.read(fheader[_FH_FILENAME_LENGTH])
+            if fheader[_FH_EXTRA_FIELD_LENGTH]:
+                zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
+
+            if zinfo.flag_bits & 0x20:
+                # Zip 2.7: compressed patched data
+                raise NotImplementedError("compressed patched data (flag bit 5)")
+
+            if zinfo.flag_bits & 0x40:
+                # strong encryption
+                raise NotImplementedError("strong encryption (flag bit 6)")
+
+            if zinfo.flag_bits & 0x800:
+                # UTF-8 filename
+                fname_str = fname.decode("utf-8")
+            else:
+                fname_str = fname.decode("cp437")
+
+            # Members flagged has_changed_name skip the directory-vs-header
+            # name consistency check.
+            if not zinfo.has_changed_name:
+                if fname_str != zinfo.orig_filename:
+                    raise BadZipFile(
+                        'File name in directory %r and header %r differ.'
+                        % (zinfo.orig_filename, fname))
+
+            # check for encrypted flag & handle password
+            is_encrypted = zinfo.flag_bits & 0x1
+            if is_encrypted:
+                if not pwd:
+                    pwd = self.pwd
+                if not pwd:
+                    raise RuntimeError("File %r is encrypted, password "
+                                       "required for extraction" % name)
+            else:
+                pwd = None
+
+            return ZipExtFile(zef_file, mode, zinfo, pwd, True)
+        except:
+            # Release the shared file handle on any failure, then re-raise.
+            zef_file.close()
+            raise
+
+    def _open_to_write(self, zinfo, force_zip64=False):
+        """Write the local header for `zinfo` and return a handle for its data.
+
+        Resets the size, CRC and flag fields on `zinfo`, positions the
+        underlying file at `self.start_dir` when it is seekable, writes the
+        local file header and marks the archive as having an open writing
+        handle.
+
+        Raises ValueError when `force_zip64` is requested although the
+        archive was opened with ZIP64 disallowed, or when another writing
+        handle is already open.
+
+        Returns a `_ZipWriteFile` built from this archive, `zinfo` and the
+        computed zip64 flag.
+        """
+        if force_zip64 and not self._allowZip64:
+            raise ValueError(
+                "force_zip64 is True, but allowZip64 was False when opening "
+                "the ZIP file."
+            )
+        if self._writing:
+            raise ValueError("Can't write to the ZIP file while there is "
+                             "another write handle open on it. "
+                             "Close the first handle before opening another.")
+
+        # Sizes and CRC are overwritten with correct data after processing the file
+        if not hasattr(zinfo, 'file_size'):
+            zinfo.file_size = 0
+        zinfo.compress_size = 0
+        zinfo.CRC = 0
+
+        # Flags are rebuilt from scratch; anything previously set is discarded.
+        zinfo.flag_bits = 0x00
+        if zinfo.compress_type == ZIP_LZMA:
+            # Compressed data includes an end-of-stream (EOS) marker
+            zinfo.flag_bits |= 0x02
+        if not self._seekable:
+            # NOTE(review): 0x08 is presumably the ZIP "data descriptor" flag,
+            # needed because the header cannot be rewritten later on a
+            # non-seekable stream -- confirm against the ZIP specification.
+            zinfo.flag_bits |= 0x08
+
+        # Only fill in permissions when the caller did not provide any.
+        if not zinfo.external_attr:
+            zinfo.external_attr = 0o600 << 16  # permissions: ?rw-------
+
+        # Compressed size can be larger than uncompressed size
+        zip64 = self._allowZip64 and \
+                (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT)
+
+        # The new member starts at self.start_dir when the file is seekable;
+        # otherwise it starts wherever the stream currently is.
+        if self._seekable:
+            self.fp.seek(self.start_dir)
+        zinfo.header_offset = self.fp.tell()
+
+        self._writecheck(zinfo)
+        self._didModify = True
+
+        self.fp.write(zinfo.FileHeader(zip64))
+
+        # Set only after the header was written successfully.
+        self._writing = True
+        return _ZipWriteFile(self, zinfo, zip64)
+
+    def extract(self, member, path=None, pwd=None):
+        """Extract a member from the archive to the current working directory,
+           using its full name. Its file information is extracted as accurately
+           as possible. `member' may be a filename or a ZipInfo object. You can
+           specify a different directory using `path'.
+        """
+        if path is None:
+            path = os.getcwd()
+        else:
+            path = os.fspath(path)
+
+        return self._extract_member(member, path, pwd)
+
+    def extractall(self, path=None, members=None, pwd=None):
+        """Extract all members from the archive to the current working
+           directory. `path' specifies a different directory to extract to.
+           `members' is optional and must be a subset of the list returned
+           by namelist().
+        """
+        if members is None:
+            members = self.namelist()
+
+        if path is None:
+            path = os.getcwd()
+        else:
+            path = os.fspath(path)
+
+        for zipinfo in members:
+            self._extract_member(zipinfo, path, pwd)
+
+    @classmethod
+    def _sanitize_windows_name(cls, arcname, pathsep):
+        """Replace bad characters and remove trailing dots from parts."""
+        table = cls._windows_illegal_name_trans_table
+        if not table:
+            illegal = ':<>|"?*'
+            table = str.maketrans(illegal, '_' * len(illegal))
+            cls._windows_illegal_name_trans_table = table
+        arcname = arcname.translate(table)
+        # remove trailing dots
+        arcname = (x.rstrip('.') for x in arcname.split(pathsep))
+        # rejoin, removing empty parts.
+        arcname = pathsep.join(x for x in arcname if x)
+        return arcname
+
+    def _extract_member(self, member, targetpath, pwd):
+        """Extract the ZipInfo object 'member' to a physical
+           file on the path targetpath.
+        """
+        if not isinstance(member, ZipInfo):
+            member = self.getinfo(member)
+
+        # build the destination pathname, replacing
+        # forward slashes to platform specific separators.
+        arcname = member.filename.replace('/', os.path.sep)
+
+        if os.path.altsep:
+            arcname = arcname.replace(os.path.altsep, os.path.sep)
+        # interpret absolute pathname as relative, remove drive letter or
+        # UNC path, redundant separators, "." and ".." components.
+        arcname = os.path.splitdrive(arcname)[1]
+        invalid_path_parts = ('', os.path.curdir, os.path.pardir)
+        arcname = os.path.sep.join(x for x in arcname.split(os.path.sep)
+                                   if x not in invalid_path_parts)
+        if os.path.sep == '\\':
+            # filter illegal characters on Windows
+            arcname = self._sanitize_windows_name(arcname, os.path.sep)
+
+        targetpath = os.path.join(targetpath, arcname)
+        targetpath = os.path.normpath(targetpath)
+
+        # Create all upper directories if necessary.
+        upperdirs = os.path.dirname(targetpath)
+        if upperdirs and not os.path.exists(upperdirs):
+            os.makedirs(upperdirs)
+
+        if member.is_dir():
+            if not os.path.isdir(targetpath):
+                os.mkdir(targetpath)
+            return targetpath
+
+        with self.open(member, pwd=pwd) as source, \
+             open(targetpath, "wb") as target:
+            shutil.copyfileobj(source, target)
+
+        return targetpath
+
+    def _writecheck(self, zinfo):
+        """Check for errors before writing a file to the archive."""
+        if zinfo.filename in self.NameToInfo:
+            import warnings
+            warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3)
+        if self.mode not in ('w', 'x', 'a'):
+            raise ValueError("write() requires mode 'w', 'x', or 'a'")
+        if not self.fp:
+            raise ValueError(
+                "Attempt to write ZIP archive that was already closed")
+        _check_compression(zinfo.compress_type)
+        if not self._allowZip64:
+            requires_zip64 = None
+            if len(self.filelist) >= ZIP_FILECOUNT_LIMIT:
+                requires_zip64 = "Files count"
+            elif zinfo.file_size > ZIP64_LIMIT:
+                requires_zip64 = "Filesize"
+            elif zinfo.header_offset > ZIP64_LIMIT:
+                requires_zip64 = "Zipfile size"
+            if requires_zip64:
+                raise LargeZipFile(requires_zip64 +
+                                   " would require ZIP64 extensions")
+
+    def write(self, filename, arcname=None,
+              compress_type=None, compresslevel=None):
+        """Put the bytes from filename into the archive under the name
+        arcname."""
+        if not self.fp:
+            raise ValueError(
+                "Attempt to write to ZIP archive that was already closed")
+        if self._writing:
+            raise ValueError(
+                "Can't write to ZIP archive while an open writing handle exists"
+            )
+
+        zinfo = ZipInfo.from_file(filename, arcname)
+
+        if zinfo.is_dir():
+            zinfo.compress_size = 0
+            zinfo.CRC = 0
+        else:
+            if compress_type is not None:
+                zinfo.compress_type = compress_type
+            else:
+                zinfo.compress_type = self.compression
+
+            if compresslevel is not None:
+                zinfo._compresslevel = compresslevel
+            else:
+                zinfo._compresslevel = self.compresslevel
+
+        if zinfo.is_dir():
+            with self._lock:
+                if self._seekable:
+                    self.fp.seek(self.start_dir)
+                zinfo.header_offset = self.fp.tell()  # Start of header bytes
+                if zinfo.compress_type == ZIP_LZMA:
+                # Compressed data includes an end-of-stream (EOS) marker
+                    zinfo.flag_bits |= 0x02
+
+                self._writecheck(zinfo)
+                self._didModify = True
+
+                self.filelist.append(zinfo)
+                self.NameToInfo[zinfo.filename] = zinfo
+                self.fp.write(zinfo.FileHeader(False))
+                self.start_dir = self.fp.tell()
+        else:
+            with open(filename, "rb") as src, self.open(zinfo, 'w') as dest:
+                shutil.copyfileobj(src, dest, 1024*8)
+
+    def writestr(self, zinfo_or_arcname, data,
+                 compress_type=None, compresslevel=None):
+        """Write a file into the archive.  The contents is 'data', which
+        may be either a 'str' or a 'bytes' instance; if it is a 'str',
+        it is encoded as UTF-8 first.
+        'zinfo_or_arcname' is either a ZipInfo instance or
+        the name of the file in the archive."""
+        if isinstance(data, str):
+            data = data.encode("utf-8")
+        if not isinstance(zinfo_or_arcname, ZipInfo):
+            zinfo = ZipInfo(filename=zinfo_or_arcname,
+                            date_time=time.localtime(time.time())[:6])
+            zinfo.compress_type = self.compression
+            zinfo._compresslevel = self.compresslevel
+            if zinfo.filename[-1] == '/':
+                zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
+                zinfo.external_attr |= 0x10           # MS-DOS directory flag
+            else:
+                zinfo.external_attr = 0o600 << 16     # ?rw-------
+        else:
+            zinfo = zinfo_or_arcname
+
+        if not self.fp:
+            raise ValueError(
+                "Attempt to write to ZIP archive that was already closed")
+        if self._writing:
+            raise ValueError(
+                "Can't write to ZIP archive while an open writing handle exists."
+            )
+
+        if compress_type is not None:
+            zinfo.compress_type = compress_type
+
+        if compresslevel is not None:
+            zinfo._compresslevel = compresslevel
+
+        zinfo.file_size = len(data)            # Uncompressed size
+        with self._lock:
+            with self.open(zinfo, mode='w') as dest:
+                dest.write(data)
+
+    def __del__(self):
+        """Call the "close()" method in case the user forgot."""
+        self.close()
+
+    def close(self):
+        """Close the file, and for mode 'w', 'x' and 'a' write the ending
+        records."""
+        if self.fp is None:
+            return
+
+        if self._writing:
+            raise ValueError("Can't close the ZIP file while there is "
+                             "an open writing handle on it. "
+                             "Close the writing handle before closing the zip.")
+
+        try:
+            if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records
+                with self._lock:
+                    if self._seekable:
+                        self.fp.seek(self.start_dir)
+                    self._write_end_record()
+        finally:
+            fp = self.fp
+            self.fp = None
+            self._fpclose(fp)
+
+    def _write_end_record(self):
+        """Write the central directory and the end-of-archive record(s).
+
+        Emits one central-directory entry per member in `self.filelist` at
+        the current file position, then (when limits are exceeded) the ZIP64
+        end record and locator, then the regular end record followed by the
+        archive comment, and finally flushes the file.
+
+        Raises LargeZipFile when ZIP64 end records are required but the
+        archive was opened with ZIP64 disallowed.
+        """
+        for zinfo in self.filelist:         # write central directory
+            # Pack the (year, month, day, hour, minute, second) tuple into
+            # 16-bit date and time fields; seconds are stored halved.
+            dt = zinfo.date_time
+            dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
+            dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
+            # `extra` collects the real values of every field that does not
+            # fit; the fixed-size field then carries 0xffffffff instead.
+            extra = []
+            if zinfo.file_size > ZIP64_LIMIT \
+               or zinfo.compress_size > ZIP64_LIMIT:
+                extra.append(zinfo.file_size)
+                extra.append(zinfo.compress_size)
+                file_size = 0xffffffff
+                compress_size = 0xffffffff
+            else:
+                file_size = zinfo.file_size
+                compress_size = zinfo.compress_size
+
+            if zinfo.header_offset > ZIP64_LIMIT:
+                extra.append(zinfo.header_offset)
+                header_offset = 0xffffffff
+            else:
+                header_offset = zinfo.header_offset
+
+            extra_data = zinfo.extra
+            min_version = 0
+            if extra:
+                # Append a ZIP64 field to the extra's
+                # NOTE(review): _strip_extra presumably removes an existing
+                # extra field with header id 1 before the new one is
+                # prepended -- confirm against its definition.
+                extra_data = _strip_extra(extra_data, (1,))
+                # Layout: header id 1, payload size, then one 64-bit value
+                # per collected field.
+                extra_data = struct.pack(
+                    '<HH' + 'Q'*len(extra),
+                    1, 8*len(extra), *extra) + extra_data
+
+                min_version = ZIP64_VERSION
+
+            # Some compression methods raise the minimum version as well.
+            if zinfo.compress_type == ZIP_BZIP2:
+                min_version = max(BZIP2_VERSION, min_version)
+            elif zinfo.compress_type == ZIP_LZMA:
+                min_version = max(LZMA_VERSION, min_version)
+
+            extract_version = max(min_version, zinfo.extract_version)
+            create_version = max(min_version, zinfo.create_version)
+            try:
+                filename, flag_bits = zinfo._encodeFilenameFlags()
+                centdir = struct.pack(structCentralDir,
+                                      stringCentralDir, create_version,
+                                      zinfo.create_system, extract_version, zinfo.reserved,
+                                      flag_bits, zinfo.compress_type, dostime, dosdate,
+                                      zinfo.CRC, compress_size, file_size,
+                                      len(filename), len(extra_data), len(zinfo.comment),
+                                      0, zinfo.internal_attr, zinfo.external_attr,
+                                      header_offset)
+            except DeprecationWarning:
+                # Diagnostic aid: dump the values that were being packed to
+                # stderr, then let the exception propagate.
+                print((structCentralDir, stringCentralDir, create_version,
+                       zinfo.create_system, extract_version, zinfo.reserved,
+                       zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
+                       zinfo.CRC, compress_size, file_size,
+                       len(zinfo.filename), len(extra_data), len(zinfo.comment),
+                       0, zinfo.internal_attr, zinfo.external_attr,
+                       header_offset), file=sys.stderr)
+                raise
+            # Entry layout: fixed record, file name, extra data, comment.
+            self.fp.write(centdir)
+            self.fp.write(filename)
+            self.fp.write(extra_data)
+            self.fp.write(zinfo.comment)
+
+        # pos2 marks the end of the central directory just written.
+        pos2 = self.fp.tell()
+        # Write end-of-zip-archive record
+        centDirCount = len(self.filelist)
+        centDirSize = pos2 - self.start_dir
+        centDirOffset = self.start_dir
+        requires_zip64 = None
+        if centDirCount > ZIP_FILECOUNT_LIMIT:
+            requires_zip64 = "Files count"
+        elif centDirOffset > ZIP64_LIMIT:
+            requires_zip64 = "Central directory offset"
+        elif centDirSize > ZIP64_LIMIT:
+            requires_zip64 = "Central directory size"
+        if requires_zip64:
+            # Need to write the ZIP64 end-of-archive records
+            if not self._allowZip64:
+                raise LargeZipFile(requires_zip64 +
+                                   " would require ZIP64 extensions")
+            zip64endrec = struct.pack(
+                structEndArchive64, stringEndArchive64,
+                44, 45, 45, 0, 0, centDirCount, centDirCount,
+                centDirSize, centDirOffset)
+            self.fp.write(zip64endrec)
+
+            # The locator carries pos2, the position where the ZIP64 end
+            # record above was written.
+            zip64locrec = struct.pack(
+                structEndArchive64Locator,
+                stringEndArchive64Locator, 0, pos2, 1)
+            self.fp.write(zip64locrec)
+            # Clamp the values so they fit the fixed-size fields of the
+            # regular end record written below.
+            centDirCount = min(centDirCount, 0xFFFF)
+            centDirSize = min(centDirSize, 0xFFFFFFFF)
+            centDirOffset = min(centDirOffset, 0xFFFFFFFF)
+
+        endrec = struct.pack(structEndArchive, stringEndArchive,
+                             0, 0, centDirCount, centDirCount,
+                             centDirSize, centDirOffset, len(self._comment))
+        self.fp.write(endrec)
+        self.fp.write(self._comment)
+        self.fp.flush()
+
+    def _fpclose(self, fp):
+        assert self._fileRefCnt > 0
+        self._fileRefCnt -= 1
+        if not self._fileRefCnt and not self._filePassed:
+            fp.close()
+
+
+class PyZipFile(ZipFile):
+    """Class to create ZIP archives with Python library files and packages."""
+
+    def __init__(self, file, mode="r", compression=ZIP_STORED,
+                 allowZip64=True, optimize=-1):
+        """Open the archive through ZipFile.__init__ and remember `optimize`.
+
+        `optimize` selects which compiled file _get_codename() picks:
+        -1 uses whatever up-to-date compiled file is present, 0, 1 and 2
+        request that specific optimization level.
+        """
+        ZipFile.__init__(self, file, mode=mode, compression=compression,
+                         allowZip64=allowZip64)
+        self._optimize = optimize
+
+    def writepy(self, pathname, basename="", filterfunc=None):
+        """Add all files from "pathname" to the ZIP archive.
+
+        If pathname is a package directory, search the directory and
+        all package subdirectories recursively for all *.py and enter
+        the modules into the archive.  If pathname is a plain
+        directory, listdir *.py and enter all modules.  Else, pathname
+        must be a Python *.py file and the module will be put into the
+        archive.  Added modules are always module.pyc.
+        This method will compile the module.py into module.pyc if
+        necessary.
+        If filterfunc(pathname) is given, it is called with every argument.
+        When it is False, the file or directory is skipped.
+
+        Raises RuntimeError when pathname is a file whose name does not end
+        in ".py".
+        """
+        pathname = os.fspath(pathname)
+        if filterfunc and not filterfunc(pathname):
+            if self.debug:
+                label = 'path' if os.path.isdir(pathname) else 'file'
+                print('%s %r skipped by filterfunc' % (label, pathname))
+            return
+        dir, name = os.path.split(pathname)
+        if os.path.isdir(pathname):
+            initname = os.path.join(pathname, "__init__.py")
+            if os.path.isfile(initname):
+                # This is a package directory, add it
+                if basename:
+                    basename = "%s/%s" % (basename, name)
+                else:
+                    basename = name
+                if self.debug:
+                    print("Adding package in", pathname, "as", basename)
+                # [0:-3] drops the ".py" suffix; _get_codename wants the
+                # module path without extension.
+                fname, arcname = self._get_codename(initname[0:-3], basename)
+                if self.debug:
+                    print("Adding", arcname)
+                self.write(fname, arcname)
+                dirlist = sorted(os.listdir(pathname))
+                # __init__.py was already added above.
+                dirlist.remove("__init__.py")
+                # Add all *.py files and package subdirectories
+                for filename in dirlist:
+                    path = os.path.join(pathname, filename)
+                    root, ext = os.path.splitext(filename)
+                    if os.path.isdir(path):
+                        if os.path.isfile(os.path.join(path, "__init__.py")):
+                            # This is a package directory, add it
+                            self.writepy(path, basename,
+                                         filterfunc=filterfunc)  # Recursive call
+                    elif ext == ".py":
+                        if filterfunc and not filterfunc(path):
+                            if self.debug:
+                                print('file %r skipped by filterfunc' % path)
+                            continue
+                        fname, arcname = self._get_codename(path[0:-3],
+                                                            basename)
+                        if self.debug:
+                            print("Adding", arcname)
+                        self.write(fname, arcname)
+            else:
+                # This is NOT a package directory, add its files at top level
+                if self.debug:
+                    print("Adding files from directory", pathname)
+                for filename in sorted(os.listdir(pathname)):
+                    path = os.path.join(pathname, filename)
+                    root, ext = os.path.splitext(filename)
+                    if ext == ".py":
+                        if filterfunc and not filterfunc(path):
+                            if self.debug:
+                                print('file %r skipped by filterfunc' % path)
+                            continue
+                        fname, arcname = self._get_codename(path[0:-3],
+                                                            basename)
+                        if self.debug:
+                            print("Adding", arcname)
+                        self.write(fname, arcname)
+        else:
+            # A single file: it must be a .py source.
+            if pathname[-3:] != ".py":
+                raise RuntimeError(
+                    'Files added with writepy() must end with ".py"')
+            fname, arcname = self._get_codename(pathname[0:-3], basename)
+            if self.debug:
+                print("Adding file", arcname)
+            self.write(fname, arcname)
+
+    def _get_codename(self, pathname, basename):
+        """Return (filename, archivename) for the path.
+
+        Given a module name path, return the correct file path and
+        archive name, compiling if necessary.  For example, given
+        /python/lib/string, return (/python/lib/string.pyc, string).
+
+        When compilation fails, the ".py" source itself is returned as both
+        the file to read and the name to archive. Raises ValueError when
+        self._optimize is not one of -1, 0, 1 or 2.
+        """
+        def _compile(file, optimize=-1):
+            # Compile `file`; on a compile error print its message and
+            # return False, otherwise return True.
+            import py_compile
+            if self.debug:
+                print("Compiling", file)
+            try:
+                py_compile.compile(file, doraise=True, optimize=optimize)
+            except py_compile.PyCompileError as err:
+                print(err.msg)
+                return False
+            return True
+
+        file_py  = pathname + ".py"
+        file_pyc = pathname + ".pyc"
+        # Cache paths for optimization levels "none", 1 and 2.
+        pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='')
+        pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1)
+        pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2)
+        if self._optimize == -1:
+            # legacy mode: use whatever file is present
+            # A candidate counts only when it is at least as new as the
+            # source; candidates are tried in the order written below.
+            if (os.path.isfile(file_pyc) and
+                  os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime):
+                # Use .pyc file.
+                arcname = fname = file_pyc
+            elif (os.path.isfile(pycache_opt0) and
+                  os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime):
+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
+                # file name in the archive.
+                fname = pycache_opt0
+                arcname = file_pyc
+            elif (os.path.isfile(pycache_opt1) and
+                  os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime):
+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
+                # file name in the archive.
+                fname = pycache_opt1
+                arcname = file_pyc
+            elif (os.path.isfile(pycache_opt2) and
+                  os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime):
+                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
+                # file name in the archive.
+                fname = pycache_opt2
+                arcname = file_pyc
+            else:
+                # Compile py into PEP 3147 pyc file.
+                if _compile(file_py):
+                    # The compile ran at the interpreter's own optimization
+                    # level, so pick the matching cache file.
+                    if sys.flags.optimize == 0:
+                        fname = pycache_opt0
+                    elif sys.flags.optimize == 1:
+                        fname = pycache_opt1
+                    else:
+                        fname = pycache_opt2
+                    arcname = file_pyc
+                else:
+                    fname = arcname = file_py
+        else:
+            # new mode: use given optimization level
+            if self._optimize == 0:
+                fname = pycache_opt0
+                arcname = file_pyc
+            else:
+                arcname = file_pyc
+                if self._optimize == 1:
+                    fname = pycache_opt1
+                elif self._optimize == 2:
+                    fname = pycache_opt2
+                else:
+                    msg = "invalid value for 'optimize': {!r}".format(self._optimize)
+                    raise ValueError(msg)
+            # Recompile when the requested cache file is missing or stale.
+            if not (os.path.isfile(fname) and
+                    os.stat(fname).st_mtime >= os.stat(file_py).st_mtime):
+                if not _compile(file_py, optimize=self._optimize):
+                    fname = arcname = file_py
+        # Only the final path component is kept, optionally prefixed with
+        # the package path in `basename`.
+        archivename = os.path.split(arcname)[1]
+        if basename:
+            archivename = "%s/%s" % (basename, archivename)
+        return (fname, archivename)
+
+
+def main(args=None):
+    import argparse
+
+    description = 'A simple command-line interface for zipfile module.'
+    parser = argparse.ArgumentParser(description=description)
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument('-l', '--list', metavar='<zipfile>',
+                       help='Show listing of a zipfile')
+    group.add_argument('-e', '--extract', nargs=2,
+                       metavar=('<zipfile>', '<output_dir>'),
+                       help='Extract zipfile into target dir')
+    group.add_argument('-c', '--create', nargs='+',
+                       metavar=('<name>', '<file>'),
+                       help='Create zipfile from sources')
+    group.add_argument('-t', '--test', metavar='<zipfile>',
+                       help='Test if a zipfile is valid')
+    args = parser.parse_args(args)
+
+    if args.test is not None:
+        src = args.test
+        with ZipFile(src, 'r') as zf:
+            badfile = zf.testzip()
+        if badfile:
+            print("The following enclosed file is corrupted: {!r}".format(badfile))
+        print("Done testing")
+
+    elif args.list is not None:
+        src = args.list
+        with ZipFile(src, 'r') as zf:
+            zf.printdir()
+
+    elif args.extract is not None:
+        src, curdir = args.extract
+        with ZipFile(src, 'r') as zf:
+            zf.extractall(curdir)
+
+    elif args.create is not None:
+        zip_name = args.create.pop(0)
+        files = args.create
+
+        def addToZip(zf, path, zippath):
+            if os.path.isfile(path):
+                zf.write(path, zippath, ZIP_DEFLATED)
+            elif os.path.isdir(path):
+                if zippath:
+                    zf.write(path, zippath)
+                for nm in sorted(os.listdir(path)):
+                    addToZip(zf,
+                             os.path.join(path, nm), os.path.join(zippath, nm))
+            # else: ignore
+
+        with ZipFile(zip_name, 'w') as zf:
+            for path in files:
+                zippath = os.path.basename(path)
+                if not zippath:
+                    zippath = os.path.basename(os.path.dirname(path))
+                if zippath in ('', os.curdir, os.pardir):
+                    zippath = ''
+                addToZip(zf, path, zippath)
+
+# Run the command-line interface when this module is executed as a script.
+if __name__ == "__main__":
+    main()

+ 0 - 34
otr/otr_interface.py

@@ -411,40 +411,6 @@ if __name__ == '__main__':
     else:
         port = 18000
         using_gpu_index = 0
-    _global._init()
-    _global.update({"port": str(port)})
-    globals().update({"port": str(port)})
-
-    # 日志格式设置
-    # ip = get_intranet_ip()
-    # logging.basicConfig(level=logging.INFO,
-    #                     format='%(asctime)s - %(name)s - %(levelname)s - '
-    #                            + ip + ' - ' + str(port) + ' - %(message)s')
-    logging.info(get_platform())
-    # 限制tensorflow显存
-    # os.environ['CUDA_VISIBLE_DEVICES'] = str(using_gpu_index)
-    # import tensorflow as tf
-    # if get_platform() != "Windows":
-    #     _version = tf.__version__
-    #     logging.info(str(_version))
-    #     memory_limit_scale = 0.3
-    #     # tensorflow 1.x
-    #     if str(_version)[0] == "1":
-    #         logging.info("1.x " + str(_version))
-    #         os.environ['CUDA_CACHE_MAXSIZE'] = str(2147483648)
-    #         os.environ['CUDA_CACHE_DISABLE'] = str(0)
-    #         gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=memory_limit_scale)
-    #         sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
-    #
-    #     # tensorflow 2.x
-    #     elif str(_version)[0] == "2":
-    #         logging.info("2.x " + str(_version))
-            # config = tf.compat.v1.ConfigProto()
-            # config.gpu_options.per_process_gpu_memory_fraction = memory_limit_scale
-            # config.gpu_options.allow_growth = True
-            # sess = tf.compat.v1.Session(config=config)
-
-
     # app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
     app.run()
     log("OTR running "+str(port))

Файлын зөрүү хэтэрхий том тул дарагдсан байна
+ 223 - 208
otr/table_line.py


Файлын зөрүү хэтэрхий том тул дарагдсан байна
+ 0 - 0
result.html


Энэ ялгаанд хэт олон файл өөрчлөгдсөн тул зарим файлыг харуулаагүй болно