Эх сурвалжийг харах

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	format_convert/convert.py
#	format_convert/convert_pdf.py
#	format_convert/utils.py
#	result.html
luojiehua 3 жил өмнө
parent
commit
a344f956ed

+ 0 - 2
format_convert/convert_image.py

@@ -66,8 +66,6 @@ def image_process(image_np, image_path, use_ocr=True):
                                                 bbox[2][0], bbox[2][1]], b_text))
             lt = LineTable()
             tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines, False)
-            text = [tables, obj_in_table]
-            column_list = []
 
             obj_list = []
             for table in tables:

+ 49 - 28
format_convert/convert_pdf.py

@@ -633,6 +633,9 @@ class PDFConvert:
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
         if self._doc.error_code is not None:
+            self._doc.error_code = None
+            # pdfminer读不了直接转成图片识别
+            self.get_all_page_image()
             return
 
         # 判断是否能读pdf
@@ -711,7 +714,8 @@ class PDFConvert:
                     self._page.add_child(_table)
 
                 list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
-                                                                layout.bbox, page_no)
+                                                                layout.bbox, page_no,
+                                                                sourceP_LB=False)
                 for sentence in list_sentences:
                     _sen = _Sentence(sentence.text, sentence.bbox)
                     # _sen.x = sentence.x0
@@ -851,6 +855,31 @@ class PDFConvert:
                 traceback.print_exc()
                 return [-3]
 
+    def get_all_page_image(self):
+        if self.has_init_pdf[1] == 0:
+            self.init_package("PyMuPDF")
+        if self._doc.error_code is not None:
+            return
+
+        page_count = self.doc_pymupdf.page_count
+        for page_no in range(page_count):
+            # 限制pdf页数,只取前10页后10页
+            if page_count > 20:
+                if 10 <= page_no < page_count - 10:
+                    continue
+
+            self._page = _Page(None, page_no)
+            page_image = self.get_page_image(page_no)
+            if judge_error_code(page_image):
+                self._page.error_code = page_image
+            else:
+                _image = _Image(page_image[1], page_image[0])
+                self._page.add_child(_image)
+            # 报错继续读后面页面
+            if self._doc.error_code is None and self._page.error_code is not None:
+                continue
+            self._doc.add_child(self._page)
+
     def get_html(self):
         self.convert()
         if self._doc.error_code is not None:
@@ -997,10 +1026,11 @@ class ParseUtils:
                         break
 
     @staticmethod
-    def recognize_sentences(list_textbox,filter_objs,page_bbox,page_no,remove_space=True,source_LB=True):
+    def recognize_sentences(list_textbox, filter_objs, page_bbox, page_no,
+                            remove_space=True, sourceP_LB=True):
 
-        list_textbox.sort(key=lambda x:x.bbox[0])
-        list_textbox.sort(key=lambda x:x.bbox[3],reverse=source_LB)
+        list_textbox.sort(key=lambda x: x.bbox[0])
+        list_textbox.sort(key=lambda x: x.bbox[3], reverse=sourceP_LB)
 
         cluster_textbox = []
         for _textbox in list_textbox:
@@ -1009,32 +1039,28 @@ class ParseUtils:
 
             _find = False
             for _ct in cluster_textbox:
-                if abs(_ct["y"]-_textbox.bbox[1])<5:
+                if abs(_ct["y"]-_textbox.bbox[1]) < 5:
                     _find = True
                     _ct["textbox"].append(_textbox)
             if not _find:
-                cluster_textbox.append({"y":_textbox.bbox[1],"textbox":[_textbox]})
+                cluster_textbox.append({"y": _textbox.bbox[1], "textbox": [_textbox]})
 
-        cluster_textbox.sort(key=lambda x:x["y"],reverse=source_LB)
+        cluster_textbox.sort(key=lambda x: x["y"], reverse=sourceP_LB)
         list_sentences = []
         for _line in cluster_textbox:
             _textboxs = _line["textbox"]
-            _textboxs.sort(key=lambda x:x.bbox[0])
-
-
+            _textboxs.sort(key=lambda x: x.bbox[0])
 
             _linetext = _textboxs[0].get_text()
-            for _i in range(1,len(_textboxs)):
+            for _i in range(1, len(_textboxs)):
                 if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[0])>30:
-                    if _linetext[-1] not in (",",",","。",".","、",";"):
+                    if _linetext[-1] not in (",", ",", "。", ".", "、", ";"):
                         _linetext += "=,="
                 _linetext += _textboxs[_i].get_text()
 
-
-
-
-            _linetext = re.sub("[\s\r\n]","",_linetext)
-            _bbox = (_textboxs[0].bbox[0],_textboxs[0].bbox[1],_textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
+            _linetext = re.sub("[\s\r\n]", "", _linetext)
+            _bbox = (_textboxs[0].bbox[0], _textboxs[0].bbox[1],
+                     _textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
 
             _title = None
             _pattern_groups = None
@@ -1054,33 +1080,28 @@ class ParseUtils:
             if not _title:
                 _title = ParseUtils.rec_incenter(_bbox,page_bbox)
 
-
             title_degree = 2
             if not _title:
-                _linetext = _linetext.replace("=,=",",")
+                _linetext = _linetext.replace("=,=", ",")
             else:
-                _linetext = _linetext.replace("=,=","")
+                _linetext = _linetext.replace("=,=", "")
                 title_degree = int(_title.split("_")[1])
 
-
-            #页码
-            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$",_linetext) is not None:
+            # 页码
+            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$", _linetext) is not None:
                 continue
 
-            if _linetext=="" or re.search("^,+$",_linetext) is not None:
+            if _linetext == "" or re.search("^,+$", _linetext) is not None:
                 continue
 
-
             is_outline = False
             outline_location = -1
-            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$",_linetext)
+            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$", _linetext)
             if _search is not None:
                 is_outline = True
                 _linetext = _search.group("text")
                 outline_location = int(_search.group("nums"))
 
-
-
             list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))
 
         # for _sen in list_sentences:

+ 1 - 3
format_convert/convert_tree.py

@@ -27,7 +27,6 @@ class _Document:
         for child in self.children:
             # 先调用get_html才能更新error_code
             child_html_text = child.get_html()
-            print("Document", self.error_code, child.error_code, type(child), child.page_no)
             if child.error_code is not None:
                 self.error_code = child.error_code
                 return self.error_code
@@ -61,7 +60,6 @@ class _Page:
         for child in self.children:
             # 先调用get_html才能更新error_code
             child_html_text = child.get_html()
-            print("Page", self.error_code, child.error_code, type(child))
             if child.error_code is not None:
                 self.error_code = child.error_code
                 return ""
@@ -105,7 +103,6 @@ class _Image:
         for child in self.children:
             # 先调用get_html才能更新error_code
             child_html_text = child.get_html()
-            print("Image", self.error_code, child.error_code, type(child))
             if child.error_code is not None:
                 self.error_code = child.error_code
                 return ""
@@ -163,6 +160,7 @@ class _Sentence:
     def get_html(self):
         if self.error_code is not None:
             return ""
+        print("_Sentence", self.content, self.bbox)
         if self.is_html:
             return self.content
         else:

+ 18 - 150
format_convert/utils.py

@@ -43,7 +43,7 @@ def add_div(text):
     text = re.sub("\n", "</div>\n<div>", text)
     # text += "</div>"
     if text[-5:] == "<div>":
-        print("add_div has cut", text[-30:])
+        # print("add_div has cut", text[-30:])
         text = text[:-5]
     return text
 
@@ -603,25 +603,11 @@ class LineTable:
                 list_tables.append(_ta)
         return list_tables,in_objs,list_l_rect
 
-    def recognize_crosspoints(self, list_line,fixLine=True):
-
+    def recognize_crosspoints(self, list_line):
+        from matplotlib import pyplot as plt
         list_crosspoints = []
         # print("lines num",len(list_line))
 
-        def getMaxPoints(list_x,margin=5,reverse=False):
-            clust_x = []
-            for _x in list_x:
-                _find = False
-                for cx in clust_x:
-                    if abs(cx[0]-_x)<margin:
-                        _find = True
-                        cx.append(_x)
-                        break
-                if not _find:
-                    clust_x.append([_x])
-            clust_x.sort(key=lambda x:x,reverse=reverse)
-            return clust_x[0][0],len(clust_x[0])
-
         for _i in range(len(list_line)):
             for _j in range(len(list_line)):
                 line1 = list_line[_i].__dict__.get("bbox")
@@ -630,109 +616,6 @@ class LineTable:
                 if exists:
                     list_crosspoints.append(point)
 
-
-        if fixLine:
-            #聚类
-            cluster_crosspoints = []
-            for _point in list_crosspoints:
-                cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
-            while 1:
-                _find = False
-                new_cluster_crosspoints = []
-                for l_point in cluster_crosspoints:
-                    _flag = False
-                    for l_n_point in new_cluster_crosspoints:
-                        line1 = l_point.get("lines")
-                        line2 = l_n_point.get("lines")
-                        if len(line1&line2)>0:
-                            _find = True
-                            _flag = True
-                            l_n_point["lines"] = line1.union(line2)
-                            l_n_point["points"].extend(l_point["points"])
-
-                    if not _flag:
-                        new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
-                cluster_crosspoints = new_cluster_crosspoints
-                if not _find:
-                    break
-
-            list_crosspoints = []
-
-            for list_cp in cluster_crosspoints:
-                points = list_cp.get("points")
-
-                l_lines = []
-                for p in points:
-                    l_lines.extend(p.get("p_lines"))
-                l_lines = list(set(l_lines))
-                l_lines.sort(key=lambda x:x[0])
-
-                min_x,_count = getMaxPoints([l[0] for l in l_lines],reverse=False)
-                if _count<=2:
-                    min_x = None
-
-
-                min_y,_count = getMaxPoints([l[1] for l in l_lines],reverse=False)
-                if _count<2:
-                    min_y = None
-
-
-                max_x,_count = getMaxPoints([l[2] for l in l_lines],reverse=True)
-                if _count<=2:
-                    max_x = None
-
-
-                max_y,_count = getMaxPoints([l[3] for l in l_lines],reverse=True)
-                if _count<=2:
-                    max_y = None
-                if min_x and min_y and max_x and max_y:
-
-                    points.sort(key=lambda x:x["point"][0])
-                    if abs(min_x-points[0]["point"][0])>30:
-                        _line = LTLine(1,(min_x,min_y),(min_x,max_y))
-                        list_line.append(_line)
-                        l_lines.append(_line.bbox)
-                        print("add=====",_line.bbox)
-
-
-                    if abs(max_x-points[-1]["point"][0])>30:
-                        _line = LTLine(1,(max_x,min_y),(max_x,max_y))
-                        list_line.append()
-                        l_lines.append(_line.bbox)
-                        print("add=====1",_line.bbox)
-
-                    points.sort(key=lambda x:x["point"][1])
-                    if abs(min_y-points[0]["point"][1])>30:
-                        _line = LTLine(1,(min_x,min_y),(max_x,min_y))
-                        list_line.append(_line)
-                        l_lines.append(_line.bbox)
-                        print("add=====2",_line.bbox)
-
-                    if abs(max_y-points[-1]["point"][1])>30:
-                        _line = LTLine(1,(min_x,max_y),(max_x,max_y))
-                        list_line.append(_line)
-                        l_lines.append(_line.bbox)
-                        print("add=====2",_line.bbox)
-
-
-
-                for _i in range(len(l_lines)):
-                    for _j in range(len(l_lines)):
-                        line1 = l_lines[_i]
-                        line2 = l_lines[_j]
-                        exists,point = self.cross_point(line1,line2)
-                        if exists:
-                            list_crosspoints.append(point)
-                from matplotlib import pyplot as plt
-                plt.figure()
-                for _line in l_lines:
-                    x0,y0,x1,y1 = _line
-                    plt.plot([x0,x1],[y0,y1])
-                for point in list_crosspoints:
-                    plt.scatter(point.get("point")[0],point.get("point")[1])
-                plt.show()
-
-        # from matplotlib import pyplot as plt
         # plt.figure()
         # for _line in list_line:
         #     x0,y0,x1,y1 = _line.__dict__.get("bbox")
@@ -820,10 +703,9 @@ class LineTable:
                     _line = lines[1]
                 next_point = None
                 for p1 in  dict_line_points[_line]["points"]:
-                    if p1["point"][0]>_point["point"][0]:
-                        if p1["buttom"]>=margin:
-                            next_point = p1
-                            break
+                    if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
+                        next_point = p1
+                        break
                 if not next_point:
                     continue
                 lines = list(next_point.get("lines"))
@@ -832,26 +714,14 @@ class LineTable:
                     _line = lines[1]
                 final_point = None
                 for p1 in dict_line_points[_line]["points"]:
-                    if p1["point"][1]>next_point["point"][1]:
-                        if p1["left"]>=margin:
-                            final_point = p1
-                            break
+                    if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
+                        final_point = p1
+                        break
                 if not final_point:
-                    next_point["buttom"] = 0
                     continue
                 _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
                 list_rect.append(_r)
 
-        # dump
-        tmp_rect = []
-        set_bbox = set()
-        for _r in list_rect:
-            _bbox = "%.2f-%.2f-%.2f-%.2f"%_r.bbox
-            if not _bbox in set_bbox:
-                tmp_rect.append(_r)
-                set_bbox.add(_bbox)
-        list_rect = tmp_rect
-
         return list_rect
 
     def cross_point(self, line1, line2, segment=True, margin=2):
@@ -907,7 +777,7 @@ class LineTable:
         line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1, y1, x2, y2)
         line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3, y3, x4, y4)
         return point_is_exist, {"point": [x, y], "left": left, "right": right,
-                                "top": top, "buttom": buttom, "lines": set([line1_key,line2_key]),"p_lines":[line1,line2]}
+                                "top": top, "buttom": buttom, "lines": set([line1_key,line2_key])}
 
     def unionTable(self, list_table, fixspan=True, margin=2):
         set_x = set()
@@ -1142,14 +1012,12 @@ class LineTable:
                                 print(len(_table),l_i+i)
                                 _table[l_i+i].insert(c_i,_cell)
 
-        print("table>=======>")
-        print(list_x)
-        print(list_y)
-        for _line in _table:
-            for _cell in _line:
-                print("[%s]"%_cell.get("text")[:10].replace("\n",''),end="\t\t")
-            print("\n")
-        print("===========")
+        # print("=======")
+        # for _line in _table:
+        #     for _cell in _line:
+        #         print("[%s]"%_cell.get("text")[:10].replace("\n",''),end="\t\t")
+        #     print("\n")
+        # print("===========")
 
         table_bbox = (_table[0][0].get("bbox")[0],
                       _table[0][0].get("bbox")[1],
@@ -1180,9 +1048,9 @@ class LineTable:
         _count = 0
         (x0,x1) = (min(x0,x1),max(x0,x1))
         for _x in _list:
-            if _x >=(x0 - margin) and _x<=(x1 + margin):
+            if _x>=(x0-margin) and _x<=(x1+margin):
                 _count += 1
-            return _count-1
+        return _count-1
 
     def _plot(self, list_line, list_textbox):
         from matplotlib import pyplot as plt