4 лет назад · c0f71aa05b
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@
 
															 /wiki_128_word_embedding_new.env
														
 
															 /yep_homework.py
														
 
															 /format_convert/temp/
														
 
															+/format_convert/files/
														
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -2673,7 +2673,7 @@ if __name__ == '__main__':
 
															         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
														
 
															     else:
														
 
															         file_path = "1.doc"
														
 
															-    file_path = "files/1629873875150.png"
														
 
															+    file_path = "files/error3.pdf"
														
 
															     with open(file_path, "rb") as f:
														
 
															         file_bytes = f.read()
														
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -101,14 +101,16 @@ def image_preprocess(image_np, image_path, use_ocr=True):
 
															                     list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
														
 
															                 from format_convert.convert_tree import TextBox
														
 
															                 list_text_boxes = []
														
 
															+                print("=============1")
														
 
															                 for i in range(len(bbox_list)):
														
 
															                     bbox = bbox_list[i]
														
 
															                     b_text = text_list[i]
														
 
															-                    list_text_boxes.append(TextBox([bbox[3][0], bbox[3][1],
														
 
															-                                                    bbox[1][0], bbox[1][1]], b_text))
														
 
															+                    print("text:",b_text,"bbox:",bbox)
														
 
															+                    list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
														
 
															+                                                    bbox[2][0], bbox[2][1]], b_text))
														
 
															                 lt = LineTable()
														
 
															-                tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines)
														
 
															+                tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines,False)
														
 
															                 text = [tables, obj_in_table]
														
 
															                 column_list = []
														
 
															             except:
														
--- a/format_convert/pdfparser.py
+++ b/format_convert/pdfparser.py
@@ -0,0 +1,485 @@
 
															+#coding:utf8
														
 
															+
														
 
															+from pdfminer.pdfparser import PDFParser
														
 
															+from pdfminer.pdfdocument import PDFDocument
														
 
															+from pdfminer.pdfpage import PDFPage
														
 
															+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
														
 
															+from pdfminer.pdfinterp import PDFResourceManager
														
 
															+from pdfminer.pdfinterp import PDFPageInterpreter
														
 
															+from pdfminer.pdfdevice import PDFDevice
														
 
															+from pdfminer.layout import *
														
 
															+from pdfminer.converter import PDFPageAggregator
														
 
															+
														
 
															+import logging
														
 
															+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
 
															+
														
 
															+class ParseDocument():
														
 
															+
														
 
															+    def __init__(self,filepath):
														
 
															+        self.filename = filepath
														
 
															+        self.childs = []
														
 
															+
														
 
															+        # Open a PDF file.
														
 
															+        fp = open(filepath, 'rb')
														
 
															+        # Create a PDF parser object associated with the file object.
														
 
															+        parser = PDFParser(fp)
														
 
															+        # Create a PDF document object that stores the document structure.
														
 
															+        # Supply the password for initialization.
														
 
															+        document = PDFDocument(parser)
														
 
															+        # Check if the document allows text extraction. If not, abort.
														
 
															+        if not document.is_extractable:
														
 
															+            raise PDFTextExtractionNotAllowed
														
 
															+        # Create a PDF resource manager object that stores shared resources.
														
 
															+        rsrcmgr = PDFResourceManager()
														
 
															+        # Create a PDF device object.
														
 
															+        laparams = LAParams(line_overlap=0.1,
														
 
															+                            char_margin=0.1,
														
 
															+                            line_margin=0.1,
														
 
															+                            word_margin=0.1,
														
 
															+                            boxes_flow=0.5,)
														
 
															+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
														
 
															+        # Create a PDF interpreter object.
														
 
															+        interpreter = PDFPageInterpreter(rsrcmgr, device)
														
 
															+        # Process each page contained in the document.
														
 
															+        page_no = 0
														
 
															+        for page in PDFPage.create_pages(document):
														
 
															+            interpreter.process_page(page)
														
 
															+            ltpage = device.get_result()
														
 
															+
														
 
															+            page_no += 1
														
 
															+            logging.info("recognize page:%d"%page_no)
														
 
															+            self.childs.append(self.recognize(ltpage))
														
 
															+            # print(ltpage.__dict__)
														
 
															+            # ParsePage(ltpage).recognize_rect(ltpage)
														
 
															+            return
														
 
															+
														
 
															+
														
 
															+    def recognize(self,_page):
														
 
															+        _page = ParsePage(_page)
														
 
															+        return _page
														
 
															+
														
 
															+
														
 
															+
														
 
															+class ParsePage():
														
 
															+
														
 
															+    def __init__(self,_page):
														
 
															+
														
 
															+        self.childs = []
														
 
															+        self.list_tables = []
														
 
															+        self.list_sentences = []
														
 
															+
														
 
															+        self.getFontinfo(_page)
														
 
															+        filter_objs = self.recognize_table(_page)
														
 
															+        self.recognize_sentences(_page,filter_objs)
														
 
															+
														
 
															+    def recognize_table(self,_page,line_margin=0.2):
														
 
															+
														
 
															+        list_rects = []
														
 
															+
														
 
															+        list_textbox = []
														
 
															+        for _obj in _page._objs:
														
 
															+            if isinstance(_obj,(LTRect)):
														
 
															+                list_rects.append(_obj)
														
 
															+            elif isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
														
 
															+                list_textbox.append(_obj)
														
 
															+        #
														
 
															+        #clusters_rects = []
														
 
															+        # #根据y0聚类
														
 
															+        # list_rects.sort(key=lambda x:x.bbox[1])
														
 
															+        # for _rect in list_rects:
														
 
															+        #     _y0 = _rect.bbox[1]
														
 
															+        #     _find = False
														
 
															+        #     for l_cr in clusters_rects:
														
 
															+        #         if abs(l_cr[0].bbox[1]-_y0)<2:
														
 
															+        #             _find = True
														
 
															+        #             l_cr.append(_rect)
														
 
															+        #             break
														
 
															+        #     if not _find:
														
 
															+        #         clusters_rects.append([_rect])
														
 
															+        #
														
 
															+        # clusters_rects.sort(key=lambda x:x[0].bbox[1])
														
 
															+        # for l_cr in clusters_rects:
														
 
															+        #     l_cr.sort(key=lambda x:x.bbox[0])
														
 
															+        #
														
 
															+        # table_index = [0]
														
 
															+        # for i in range(1,len(clusters_rects)):
														
 
															+        #     if abs(clusters_rects[i][0].bbox[1]-clusters_rects[i-1][0].bbox[3])>line_margin:
														
 
															+        #         table_index.append(i)
														
 
															+        # table_index.append(len(clusters_rects))
														
 
															+        #
														
 
															+        # print("11111111111111111111111")
														
 
															+        # print(clusters_rects)
														
 
															+        # print("22222222222222222222222")
														
 
															+        #
														
 
															+        # in_objs = set()
														
 
															+        # for i in range(1,len(table_index)):
														
 
															+        #     _begin = table_index[i-1]
														
 
															+        #     _end = table_index[i]
														
 
															+        #     _ta = self.rect2table(list_textbox,clusters_rects[_begin:_end],in_objs)
														
 
															+        #     if _ta:
														
 
															+        #         self.list_tables.append(_ta)
														
 
															+
														
 
															+        in_objs = set()
														
 
															+        list_l_rect = self.recognize_rect(_page)
														
 
															+        for l_rect in list_l_rect:
														
 
															+            _ta = self.rect2table(list_textbox,l_rect,in_objs)
														
 
															+            if _ta:
														
 
															+                self.list_tables.append(_ta)
														
 
															+        return in_objs
														
 
															+
														
 
															+
														
 
															+
														
 
															+    def recognize_crosspoints(self,list_line):
														
 
															+        from matplotlib import pyplot as plt
														
 
															+        list_crosspoints = []
														
 
															+        print("lines num",len(list_line))
														
 
															+
														
 
															+        plt.figure()
														
 
															+        for _line in list_line:
														
 
															+            x0,y0,x1,y1 = _line.bbox
														
 
															+            plt.plot([x0,x1],[y0,y1])
														
 
															+
														
 
															+
														
 
															+        for _i in range(len(list_line)):
														
 
															+            for _j in range(len(list_line)):
														
 
															+                line1 = list_line[_i].bbox
														
 
															+                line2 = list_line[_j].bbox
														
 
															+                exists,point = self.cross_point(line1,line2)
														
 
															+                if exists:
														
 
															+                    list_crosspoints.append(point)
														
 
															+
														
 
															+        # plt.figure()
														
 
															+        # for _line in list_line:
														
 
															+        #     x0,y0,x1,y1 = _line.bbox
														
 
															+        #     plt.plot([x0,x1],[y0,y1])
														
 
															+        # for point in list_crosspoints:
														
 
															+        #     plt.scatter(point.get("point")[0],point.get("point")[1])
														
 
															+        # plt.show()
														
 
															+
														
 
															+        # print(list_crosspoints)
														
 
															+        # print("points num",len(list_crosspoints))
														
 
															+        return list_crosspoints
														
 
															+
														
 
															+
														
 
															+    def recognize_rect(self,_page):
														
 
															+
														
 
															+        list_line = []
														
 
															+        for _obj in _page._objs:
														
 
															+            if isinstance(_obj,(LTLine)):
														
 
															+                list_line.append(_obj)
														
 
															+        list_crosspoints = self.recognize_crosspoints(list_line)
														
 
															+
														
 
															+        #聚类
														
 
															+        cluster_crosspoints = []
														
 
															+        for _point in list_crosspoints:
														
 
															+            cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
														
 
															+        while 1:
														
 
															+            _find = False
														
 
															+            new_cluster_crosspoints = []
														
 
															+            for l_point in cluster_crosspoints:
														
 
															+                _flag = False
														
 
															+                for l_n_point in new_cluster_crosspoints:
														
 
															+                    line1 = l_point.get("lines")
														
 
															+                    line2 = l_n_point.get("lines")
														
 
															+                    if len(line1&line2)>0:
														
 
															+                        _find = True
														
 
															+                        _flag = True
														
 
															+                        l_n_point["lines"] = line1.union(line2)
														
 
															+                        l_n_point["points"].extend(l_point["points"])
														
 
															+                if not _flag:
														
 
															+                    new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
														
 
															+            cluster_crosspoints = new_cluster_crosspoints
														
 
															+            if not _find:
														
 
															+                break
														
 
															+        # print(len(cluster_crosspoints))
														
 
															+
														
 
															+        list_l_rect = []
														
 
															+        for table_crosspoint in cluster_crosspoints:
														
 
															+            list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
														
 
															+            list_l_rect.append(list_rect)
														
 
															+
														
 
															+        return list_l_rect
														
 
															+
														
 
															+
														
 
															+
														
 
															+    def crosspoint2rect(self,list_crosspoint,margin=4):
														
 
															+
														
 
															+        dict_line_points = {}
														
 
															+        for _point in list_crosspoint:
														
 
															+            lines = list(_point.get("lines"))
														
 
															+            for _line in lines:
														
 
															+                if _line not in dict_line_points:
														
 
															+                    dict_line_points[_line] = {"direct":None,"points":[]}
														
 
															+                dict_line_points[_line]["points"].append(_point)
														
 
															+
														
 
															+        #排序
														
 
															+        for k,v in dict_line_points.items():
														
 
															+
														
 
															+            list_x = []
														
 
															+            list_y = []
														
 
															+            for _p in v["points"]:
														
 
															+                list_x.append(_p.get("point")[0])
														
 
															+                list_y.append(_p.get("point")[1])
														
 
															+            if max(list_x)-min(list_x)>max(list_y)-min(list_y):
														
 
															+                v.get("points").sort(key=lambda x:x.get("point")[0])
														
 
															+                v["direct"] = "row"
														
 
															+            else:
														
 
															+                v.get("points").sort(key=lambda x:x.get("point")[1])
														
 
															+                v["direct"] = "column"
														
 
															+
														
 
															+
														
 
															+        list_rect = []
														
 
															+        for _point in list_crosspoint:
														
 
															+            if _point["buttom"]>=margin and _point["right"]>=margin:
														
 
															+                lines = list(_point.get("lines"))
														
 
															+                _line = lines[0]
														
 
															+                if dict_line_points[_line]["direct"]=="column":
														
 
															+                    _line = lines[1]
														
 
															+                next_point = None
														
 
															+                for p1 in  dict_line_points[_line]["points"]:
														
 
															+                    if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
														
 
															+                        next_point = p1
														
 
															+                        break
														
 
															+                if not next_point:
														
 
															+                    continue
														
 
															+                lines = list(next_point.get("lines"))
														
 
															+                _line = lines[0]
														
 
															+                if dict_line_points[_line]["direct"]=="row":
														
 
															+                    _line = lines[1]
														
 
															+                final_point = None
														
 
															+                for p1 in dict_line_points[_line]["points"]:
														
 
															+                    if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
														
 
															+                        final_point = p1
														
 
															+                        break
														
 
															+                if not final_point:
														
 
															+                    continue
														
 
															+                _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
														
 
															+                list_rect.append(_r)
														
 
															+
														
 
															+        return list_rect
														
 
															+
														
 
															+
														
 
															+
														
 
															+    def cross_point(self,line1, line2,segment=True,margin=2):
														
 
															+        point_is_exist = False
														
 
															+        x = y = 0
														
 
															+        x1,y1,x2,y2 = line1
														
 
															+        x3,y3,x4,y4 = line2
														
 
															+
														
 
															+        if (x2 - x1) == 0:
														
 
															+            k1 = None
														
 
															+            b1 = 0
														
 
															+        else:
														
 
															+            k1 = (y2 - y1) * 1.0 / (x2 - x1)  # 计算k1,由于点均为整数，需要进行浮点数转化
														
 
															+            b1 = y1 * 1.0 - x1 * k1 * 1.0  # 整型转浮点型是关键
														
 
															+
														
 
															+        if (x4 - x3) == 0:  # L2直线斜率不存在
														
 
															+            k2 = None
														
 
															+            b2 = 0
														
 
															+        else:
														
 
															+            k2 = (y4 - y3) * 1.0 / (x4 - x3)  # 斜率存在
														
 
															+            b2 = y3 * 1.0 - x3 * k2 * 1.0
														
 
															+
														
 
															+        if k1 is None:
														
 
															+            if not k2 is None:
														
 
															+                x = x1
														
 
															+                y = k2 * x1 + b2
														
 
															+                point_is_exist = True
														
 
															+        elif k2 is None:
														
 
															+            x = x3
														
 
															+            y = k1 * x3 + b1
														
 
															+        elif not k2 == k1:
														
 
															+            x = (b2 - b1) * 1.0 / (k1 - k2)
														
 
															+            y = k1 * x * 1.0 + b1 * 1.0
														
 
															+            point_is_exist = True
														
 
															+
														
 
															+        left = 0
														
 
															+        right = 0
														
 
															+        top = 0
														
 
															+        buttom = 0
														
 
															+        if point_is_exist:
														
 
															+            if segment:
														
 
															+                if x>=(min(x1,x2)-margin) and x<=(max(x1,x2)+margin) and y>=(min(y1,y2)-margin) and y<=(max(y1,y2)+margin):
														
 
															+                    if x>=(min(x3,x4)-margin) and x<=(max(x3,x4)+margin) and y>=(min(y3,y4)-margin) and y<=(max(y3,y4)+margin):
														
 
															+                        point_is_exist = True
														
 
															+                        left = abs(min(x1,x3)-x)
														
 
															+                        right = abs(max(x2,x4)-x)
														
 
															+                        top = abs(min(y1,y3)-y)
														
 
															+                        buttom = abs(max(y2,y4)-y)
														
 
															+                    else:
														
 
															+                        point_is_exist = False
														
 
															+                else:
														
 
															+                    point_is_exist = False
														
 
															+        line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1,y1,x2,y2)
														
 
															+        line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3,y3,x4,y4)
														
 
															+        return point_is_exist, {"point":[x, y],"left":left,"right":right,"top":top,"buttom":buttom,"lines":set([line1_key,line2_key])}
														
 
															+
														
 
															+    def rect2table(self,list_textbox,list_rect,in_objs,margin=0.2,fixspan=True):
														
 
															+        _table = []
														
 
															+        set_x = set()
														
 
															+        set_y = set()
														
 
															+
														
 
															+        clusters_rects = []
														
 
															+        #根据y1聚类
														
 
															+        list_rect.sort(key=lambda x:x.bbox[3])
														
 
															+        for _rect in list_rect:
														
 
															+            _y0 = _rect.bbox[3]
														
 
															+            _find = False
														
 
															+            for l_cr in clusters_rects:
														
 
															+                if abs(l_cr[0].bbox[3]-_y0)<2:
														
 
															+                    _find = True
														
 
															+                    l_cr.append(_rect)
														
 
															+                    break
														
 
															+            if not _find:
														
 
															+                clusters_rects.append([_rect])
														
 
															+
														
 
															+        clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=True)
														
 
															+        for l_cr in clusters_rects:
														
 
															+            l_cr.sort(key=lambda x:x.bbox[0])
														
 
															+
														
 
															+        for _line in clusters_rects:
														
 
															+            for _rect in _line:
														
 
															+                (x0,y0,x1,y1) = _rect.bbox
														
 
															+                set_x.add(x0)
														
 
															+                set_x.add(x1)
														
 
															+                set_y.add(y0)
														
 
															+                set_y.add(y1)
														
 
															+        if len(set_x)==0 or len(set_y)==0:
														
 
															+            return
														
 
															+        list_x = list(set_x)
														
 
															+        list_y = list(set_y)
														
 
															+
														
 
															+        list_x.sort(key=lambda x:x)
														
 
															+        list_y.sort(key=lambda x:x,reverse=True)
														
 
															+        for _line in clusters_rects:
														
 
															+            table_line = []
														
 
															+            for _rect in _line:
														
 
															+                (x0,y0,x1,y1) = _rect.bbox
														
 
															+                _cell = {"bbox":(x0,y0,x1,y1),"rect":_rect,"rowspan":self.getspan(list_y,y0,y1,margin),"columnspan":self.getspan(list_x,x0,x1,margin),"text":""}
														
 
															+                table_line.append(_cell)
														
 
															+            _table.append(table_line)
														
 
															+
														
 
															+        for textbox in list_textbox:
														
 
															+            (x0,y0,x1,y1) = textbox.bbox
														
 
															+            _text = textbox.get_text()
														
 
															+            _find = False
														
 
															+            for table_line in _table:
														
 
															+                for _cell in table_line:
														
 
															+                    if self.inbox(textbox.bbox,_cell["bbox"]):
														
 
															+                        _cell["text"]+= _text
														
 
															+                        in_objs.add(textbox)
														
 
															+                        _find = True
														
 
															+                        break
														
 
															+                if _find:
														
 
															+                    break
														
 
															+
														
 
															+
														
 
															+        if fixspan:
														
 
															+            for _line in _table:
														
 
															+                for c_i in range(len(_line)):
														
 
															+                    _cell = _line[c_i]
														
 
															+                    if _cell.get("columnspan")>1:
														
 
															+                        _cospan = _cell.get("columnspan")
														
 
															+                        _cell["columnspan"] = 1
														
 
															+                        for i in range(1,_cospan):
														
 
															+                            _line.insert(c_i)
														
 
															+            for l_i in range(len(_table)):
														
 
															+                _line = _table[l_i]
														
 
															+                for c_i in range(len(_line)):
														
 
															+                    _cell = _line[c_i]
														
 
															+                    if _cell.get("rowspan")>1:
														
 
															+                        _rospan = _cell.get("rowspan")
														
 
															+                        _cell["rowspan"] = 1
														
 
															+                        for i in range(1,_rospan):
														
 
															+                            _table[l_i+i].insert(c_i,_cell)
														
 
															+
														
 
															+        # print("=======")
														
 
															+        # for _line in _table:
														
 
															+        #     for _cell in _line:
														
 
															+        #         print("[%s]"%_cell.get("text")[:10].replace("\n",''),end="\t\t")
														
 
															+        #     print("\n")
														
 
															+        # print("===========")
														
 
															+
														
 
															+        table_bbox = (_table[0][0].get("bbox")[0],_table[0][0].get("bbox")[1],_table[-1][-1].get("bbox")[2],_table[-1][-1].get("bbox")[3])
														
 
															+
														
 
															+        ta = ParseTable(table_bbox,_table)
														
 
															+        return ta
														
 
															+
														
 
															+    def inbox(self,bbox0,bbox_g):
														
 
															+        # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
														
 
															+        #     return 1
														
 
															+        if self.getIOU(bbox0,bbox_g)>0.5:
														
 
															+            return 1
														
 
															+        return 0
														
 
															+
														
 
															+    def getIOU(self,bbox0,bbox1):
														
 
															+        width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
														
 
															+        height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
														
 
															+        if width<0 and height<0:
														
 
															+            return abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
														
 
															+        return 0
														
 
															+
														
 
															+    def getspan(self,_list,x0,x1,margin):
														
 
															+        _count = 0
														
 
															+        (x0,x1) = (min(x0,x1),max(x0,x1))
														
 
															+        for _x in _list:
														
 
															+            if _x>=(x0-margin) and _x<=(x1+margin):
														
 
															+                _count += 1
														
 
															+        return _count-1
														
 
															+
														
 
															+    def getFontinfo(self,_page):
														
 
															+        for _obj in _page._objs:
														
 
															+            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
														
 
															+                for textline in _obj._objs:
														
 
															+                    done = False
														
 
															+                    for lchar in textline._objs:
														
 
															+                        if isinstance(lchar,(LTChar)):
														
 
															+                            _obj.fontname = lchar.fontname
														
 
															+                            _obj.fontsize = lchar.size
														
 
															+                        done = True
														
 
															+                        break
														
 
															+                    if done:
														
 
															+                        break
														
 
															+
														
 
															+
														
 
															+    def recognize_sentences(self,_page,filter_objs):
														
 
															+        for _obj in _page._objs:
														
 
															+            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
														
 
															+                if _obj in filter_objs:
														
 
															+                    continue
														
 
															+                self.list_sentences.append(ParseSentence(_obj.bbox,_obj.__dict__.get("fontname"),_obj.__dict__.get("fontsize"),_obj.get_text()))
														
 
															+
														
 
															+class ParseRect():
														
 
															+
														
 
															+    def __init__(self,bbox):
														
 
															+        self.bbox = bbox
														
 
															+
														
 
															+class ParseTable():
														
 
															+
														
 
															+    def __init__(self,bbox,list_table):
														
 
															+        self.table = list_table
														
 
															+        self.bbox = bbox
														
 
															+
														
 
															+
														
 
															+
														
 
															+class ParseSentence():
														
 
															+
														
 
															+    def __init__(self,bbox,fontname,fontsize,_text):
														
 
															+        (x0,y0,x1,y1) = bbox
														
 
															+        self.x0 = x0
														
 
															+        self.y0 = y0
														
 
															+        self.x1 = x1
														
 
															+        self.y1 = y1
														
 
															+        self.box = bbox
														
 
															+        self.fontname = fontname
														
 
															+        self.fontsize = fontsize
														
 
															+        self.text = _text
														
 
															+
														
 
															+
														
 
															+    def rec_serial(self):
														
 
															+        #todo :recog the serial of the sentence
														
 
															+        pass
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    document = ParseDocument('8a9494757a859f17017e8aa443360235.pdf')
														
 
															+
														
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -504,7 +504,7 @@ def slash_replace(_str, reverse=False):
 
															 class LineTable():
														
 
															-    def recognize_table(self, list_textbox, list_line):
														
 
															+    def recognize_table(self,list_textbox, list_line,sourceP_LB=True):
														
 
															         self.list_line = list_line
														
 
															         self.list_crosspoints = self.recognize_crosspoints(list_line)
														
@@ -539,7 +539,7 @@ class LineTable():
 
															         in_objs = set()
														
 
															         list_tables = []
														
 
															         for l_rect in list_l_rect:
														
 
															-            _ta = self.rect2table(list_textbox,l_rect,in_objs)
														
 
															+            _ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
														
 
															             if _ta:
														
 
															                 list_tables.append(_ta)
														
 
															         self._plot(list_line, list_textbox)
														
@@ -876,7 +876,7 @@ class LineTable():
 
															         ta = {"bbox":table_bbox,"table":_table}
														
 
															         return ta
														
 
															-    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True):
														
 
															+    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True,sourceP_LB=True):
														
 
															         _table = []
														
 
															         set_x = set()
														
 
															         set_y = set()
														
@@ -896,7 +896,7 @@ class LineTable():
 
															                 clusters_rects.append([_rect])
														
 
															         print("clusters_rects", len(clusters_rects))
														
 
															-        clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=True)
														
 
															+        clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=sourceP_LB)
														
 
															         for l_cr in clusters_rects:
														
 
															             l_cr.sort(key=lambda x:x.bbox[0])
														
@@ -914,7 +914,7 @@ class LineTable():
 
															         list_y = list(set_y)
														
 
															         list_x.sort(key=lambda x:x)
														
 
															-        list_y.sort(key=lambda x:x,reverse=True)
														
 
															+        list_y.sort(key=lambda x:x,reverse=sourceP_LB)
														
 
															         pop_x = []
														
 
															         for i in range(len(list_x)-1):
														
@@ -951,7 +951,7 @@ class LineTable():
 
															             _table.append(table_line)
														
 
															         list_textbox.sort(key=lambda x:x.bbox[0])
														
 
															-        list_textbox.sort(key=lambda x:x.bbox[3],reverse=True)
														
 
															+        list_textbox.sort(key=lambda x:x.bbox[3],reverse=sourceP_LB)
														
 
															         for textbox in list_textbox:
														
 
															             (x0,y0,x1,y1) = textbox.bbox
														
 
															             _text = textbox.get_text()
														
--- a/result.html
+++ b/result.html
@@ -1,38 +1,38 @@
 
															 <!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><table border="1">
														
 
															 <tr>
														
 
															-<td colspan=1 rowspan=1>件件详见附件详见附详见附品牌单价规格型号数量司72.3500中标金额（万元）</td>
														
 
															-<td colspan=1 rowspan=1>购置赁有限公司机械设备租华池县卓泰供应商名称名称四、主要标的信息设备租赁有限公华池县卓泰机械供应商名称三、中标（成交）信息二、项目名称HCZC2021-0001、项目编号</td>
														
 
															-<td colspan=1 rowspan=1>六、代理服务收费标准及金额：设项目配套设备庄肉牛养殖场建货物类远镇张川村供应商联系地址</td>
														
 
															-<td colspan=1 rowspan=1>王正刚、段海龙、李鑫、刘翠平、张武峰详见附件华池县柔远镇李甘肃省庆阳市华池县柔</td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															+<td colspan=1 rowspan=1>货物类</td>
														
 
															+<td colspan=1 rowspan=1>货物类</td>
														
 
															+<td colspan=1 rowspan=1>货物类</td>
														
 
															+<td colspan=1 rowspan=1>货物类</td>
														
 
															+<td colspan=1 rowspan=1>货物类</td>
														
 
															+<td colspan=1 rowspan=1>货物类</td>
														
 
															 </tr>
														
 
															 <tr>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1>五、评审专家（单一来源采购人员）名单：</td>
														
 
															+<td colspan=1 rowspan=1>供应商名称</td>
														
 
															+<td colspan=1 rowspan=1>名称</td>
														
 
															+<td colspan=1 rowspan=1>品牌</td>
														
 
															+<td colspan=1 rowspan=1>单价数量</td>
														
 
															+<td colspan=1 rowspan=1>单价数量</td>
														
 
															+<td colspan=1 rowspan=1>规格型号</td>
														
 
															 </tr>
														
 
															 <tr>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															+<td colspan=1 rowspan=1>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</td>
														
 
															+<td colspan=1 rowspan=1>详见附件</td>
														
 
															+<td colspan=1 rowspan=1>详见附件</td>
														
 
															+<td colspan=1 rowspan=1>详见附件</td>
														
 
															+<td colspan=1 rowspan=1>详见附件</td>
														
 
															 </tr>
														
 
															 </table>
														
 
															 <table border="1">
														
 
															 <tr>
														
 
															-<td colspan=1 rowspan=1>件件详见附件详见附件详见附详见附品牌单价规格型号名称数量货物类远镇张川村72.3500供应商联系地址中标金额（万元）</td>
														
 
															-<td colspan=1 rowspan=1>赁有限公司机械设备租华池县卓泰供应商名称司供应商名称二、项目名称、项目编号</td>
														
 
															-<td colspan=1 rowspan=1>六、代理服务收费标准及金额：王正刚、段海龙、李鑫、刘翠平、张武峰购置设项目配套设备庄肉牛养殖场建华池县柔远镇李四、主要标的信息设备租赁有限公华池县卓泰机械三、中标（成交）信息HCZC2021-0001</td>
														
 
															+<td colspan=1 rowspan=1>供应商名称</td>
														
 
															+<td colspan=1 rowspan=1>供应商联系地址</td>
														
 
															+<td colspan=1 rowspan=1>中标金额（万元）</td>
														
 
															 </tr>
														
 
															 <tr>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															-<td colspan=1 rowspan=1></td>
														
 
															+<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
														
 
															+<td colspan=1 rowspan=1>甘肃省庆阳市华池县柔远镇张川村</td>
														
 
															+<td colspan=1 rowspan=1>72.3500</td>
														
 
															 </tr>
														
 
															 </table>
														
 
															 <div>收费标准：无</div>