Przeglądaj źródła

新增docx提取编号规则

fangjiasheng 1 rok temu
rodzic
commit
9b8377d055
3 zmienionych plików z 192 dodań i 18 usunięć
  1. 39 0
      format_convert/convert_docx.py
  2. 50 18
      format_convert/convert_pdf.py
  3. 103 0
      otr/table_line_new.py

+ 39 - 0
format_convert/convert_docx.py

@@ -138,9 +138,46 @@ def read_xml_order(path, save_path):
         body = collection.getElementsByTagName("w:body")[0]
         order_list = []
         text_list = []
+        # 编号组记录
+        num_pr_dict = {}
+        last_node_level = 0
         for line in body.childNodes:
             # print(str(line))
             if "w:p" in str(line):
+                # 文本的编号(如果有编号的话)
+                text_no = ''
+                # 提取编号 组-层级-序号
+                num_pr = line.getElementsByTagName("w:numPr")
+                if num_pr:
+                    num_pr = num_pr[0]
+                    group_id = int(num_pr.getElementsByTagName("w:numId")[0].getAttribute("w:val"))
+                    if group_id >= 1:
+                        node_level = num_pr.getElementsByTagName("w:ilvl")
+                        if node_level:
+                            node_level = int(node_level[0].getAttribute("w:val"))
+                            # print('node_level', node_level, 'last_node_level', last_node_level)
+                            if group_id in num_pr_dict.keys():
+                                if last_node_level != 0 and node_level < last_node_level:
+                                    # print('重置', 'group_id', group_id, 'last_node_level', last_node_level)
+                                    # 需循环重置node_level到last_node_level之间的level
+                                    for l in range(node_level+1, last_node_level+1):
+                                        num_pr_dict[group_id][l] = 0
+                                    num_pr_dict[group_id][node_level] += 1
+                                elif node_level in num_pr_dict[group_id].keys():
+                                    num_pr_dict[group_id][node_level] += 1
+                                else:
+                                    num_pr_dict[group_id][node_level] = 1
+                            else:
+                                num_pr_dict[group_id] = {node_level: 1}
+                            # print(num_pr_dict[group_id])
+                            for level in range(node_level+1):
+                                # 当前level下有多少个node
+                                level_node_cnt = num_pr_dict[group_id][level]
+                                # print('level_node_cnt', level_node_cnt)
+                                text_no += str(level_node_cnt) + '.'
+                            last_node_level = node_level
+                            # print('read_xml_order text_no', text_no)
+
                 text = line.getElementsByTagName("w:t")
                 picture = line.getElementsByTagName("wp:docPr")
                 if text:
@@ -151,6 +188,8 @@ def read_xml_order(path, save_path):
                             temp_text += t.childNodes[0].nodeValue
                         else:
                             continue
+                    if text_no:
+                        temp_text = text_no + ' ' + temp_text
                     text_list.append(temp_text)
                 if picture:
                     order_list.append("wp:docPr")

+ 50 - 18
format_convert/convert_pdf.py

@@ -1181,6 +1181,10 @@ class PDFConvert:
         # 删除最外层嵌套边框
         cross_line_list = remove_outline_no_cross(cross_line_list)
 
+        # 复用otr的部分后处理,补线
+        from otr.table_line_new import table_line_pdf
+        cross_line_list = table_line_pdf(cross_line_list, page_w, page_h)
+
         # show
         if show:
             print('len(cross_line_list)', len(cross_line_list))
@@ -1214,6 +1218,8 @@ class PDFConvert:
         # pdf对象需反向排序
         self._page.is_reverse = True
 
+        return list_tables
+
     def is_text_legal(self, lt_text_list, page_no):
         # 无法识别pdf字符编码,整页用ocr
         text_temp = ""
@@ -1244,7 +1250,11 @@ class PDFConvert:
 
         return True
 
-    def judge_b_table(self, lt_text_list):
+    def judge_b_table(self, lt_text_list, table_list):
+        table_h_list = []
+        for table in table_list:
+            table_h_list.append([table.get('bbox')[1], table.get('bbox')[3]])
+
         # 先分行
         lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
         lt_text_row_list = []
@@ -1272,6 +1282,7 @@ class PDFConvert:
         tolerate_cnt = 2
         t_cnt = 0
         row_cnt = 0
+        b_table_row_list = []
         for row in lt_text_row_list:
             # 水印行跳过
             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
@@ -1284,19 +1295,40 @@ class PDFConvert:
                         and re.search('[\u4e00-\u9fff]{2,}', text[match.span()[1]:]):
                     row_cnt += 1
                     t_cnt = 0
+                    b_table_row_list += row
                 else:
                     # 容忍
                     if t_cnt < tolerate_cnt:
                         t_cnt += 1
                         continue
                     row_cnt = 0
+                    b_table_row_list = []
             else:
                 row_cnt += 1
                 t_cnt = 0
+                b_table_row_list += row
 
             if row_cnt >= is_b_table_cnt:
-                is_b_table_flag = True
-                break
+                # 判断在不在有边框表格的范围
+                in_flag = False
+                for table_h in table_h_list:
+                    for b in b_table_row_list:
+                        # print('b.bbox', b.bbox)
+                        # print(table_h)
+                        if table_h[1] <= b.bbox[1] <= table_h[0] or table_h[1] <= b.bbox[3] <= table_h[0]:
+                            in_flag = True
+                            break
+                    if in_flag:
+                        break
+                if in_flag:
+                    is_b_table_flag = False
+                    t_cnt = 0
+                    row_cnt = 0
+                else:
+                    print('True b_table_row_list', b_table_row_list)
+                    print('table_h_list', table_h_list)
+                    is_b_table_flag = True
+                    break
         log('pdf is_b_table_flag ' + str(is_b_table_flag))
         return is_b_table_flag
 
@@ -1430,8 +1462,16 @@ class PDFConvert:
             if not self.is_text_legal(lt_text_list, page_no):
                 return
 
+            try:
+                lt_line_list = self.get_page_lines(layout, page_no)
+            except:
+                traceback.print_exc()
+                lt_line_list = []
+                self._page.error_code = [-13]
+            table_list = self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
+
             # 根据text规律,判断该页是否可能有无边框表格
-            if self.judge_b_table(lt_text_list):
+            if self.judge_b_table(lt_text_list, table_list):
                 page_image = self.get_page_image(page_no)
                 if judge_error_code(page_image):
                     self._page.error_code = page_image
@@ -1443,14 +1483,6 @@ class PDFConvert:
                     _image.b_table_layout_size = (layout.width, layout.height)
                     self._page.add_child(_image)
 
-            try:
-                lt_line_list = self.get_page_lines(layout, page_no)
-            except:
-                traceback.print_exc()
-                lt_line_list = []
-                self._page.error_code = [-13]
-            self.recognize_text(layout, page_no, lt_text_list, lt_line_list)
-
     def get_layout(self, page, page_no):
         log("get_layout")
         if self.has_init_pdf[0] == 0:
@@ -1566,7 +1598,7 @@ class PDFConvert:
             # 最后一个表格后有无除了页码外的内容
             connect_flag1 = False
             if last_table_end is not None:
-                match = re.search('[^-/第页0-9]*', re.sub('<div>|</div>', '', h[last_table_end:]))
+                match = re.search('[^-/第页0-9,,]*', re.sub('<div>|</div>', '', h[last_table_end:]))
                 # print('match1', match.group())
                 if not match or match.group() == '':
                     connect_flag1 = True
@@ -1595,7 +1627,7 @@ class PDFConvert:
 
             connect_flag_list.append([i, connect_flag2, connect_flag1])
 
-        print('connect_flag_list', connect_flag_list)
+        # print('connect_flag_list', connect_flag_list)
 
         # 根据条件1合并需连接页码,形成组
         connect_pages_list = []
@@ -1613,7 +1645,7 @@ class PDFConvert:
         if temp_list:
             connect_pages_list.append(temp_list)
 
-        print('connect_pages_list', connect_pages_list)
+        # print('connect_pages_list', connect_pages_list)
 
         # 判断后续条件:判断组内列数是否相同
         connect_pages_list2 = []
@@ -1642,7 +1674,7 @@ class PDFConvert:
                 if new_c_list:
                     connect_pages_list2.append(new_c_list)
 
-        print('connect_pages_list2', connect_pages_list2)
+        # print('connect_pages_list2', connect_pages_list2)
 
         # 符合连接条件的拼接表格
         new_html_list = []
@@ -1656,10 +1688,10 @@ class PDFConvert:
                 new_html += html_list[c[0]] + '#@#@#'
             new_html = new_html[:-5]
             # ([-/第页0-9]|<div>|</div>)*
-            new_html = re.sub('</table>((<div>[-/第页0-9]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
+            new_html = re.sub('</table>((<div>[-/第页0-9,,]*</div>#@#@#)|(#@#@#<div>[^<]*</div>)|#@#@#)<table border="1">',
                               '<tr><td>#@#@#</td></tr>',
                               new_html)
-            print('new_html', new_html)
+            # print('new_html', new_html)
 
             soup = BeautifulSoup(new_html, 'lxml')
             trs = soup.findAll('tr')

+ 103 - 0
otr/table_line_new.py

@@ -166,6 +166,109 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
     return line_list
 
 
+def table_line_pdf(line_list, page_w, page_h, is_test=0):  # Post-process PDF table lines (outline + inner-line repair); returns the resulting line list, or [] when rows, columns or intersections are missing
+    for i, line in enumerate(line_list):  # coordinates are cast to int in place, so the caller's list is modified
+        line_list[i] = [int(x) for x in line]
+
+    img_new = np.full([int(page_h+1), int(page_w+1), 3], 255, dtype=np.uint8)
+    img_show = copy.deepcopy(img_new)
+
+    # Split into row and column lines: equal elements 0 and 2 -> column, equal elements 1 and 3 -> row; any other line is dropped
+    start_time = time.time()
+    row_line_list = []
+    col_line_list = []
+    for line in line_list:
+        if line[0] == line[2]:
+            col_line_list.append(line)
+        elif line[1] == line[3]:
+            row_line_list.append(line)
+    log("pdf divide rows and cols " + str(time.time() - start_time))
+    show(row_line_list + col_line_list, title="divide", mode=2, is_test=is_test)
+
+    # Both kinds of line must be present; otherwise give up and return an empty list
+    if not row_line_list or not col_line_list:
+        return []
+
+    # Compute the intersection points
+    cross_points = get_points(row_line_list, col_line_list, (img_new.shape[0], img_new.shape[1]))
+    if not cross_points:
+        return []
+    show(cross_points, title="get_points", img=img_show, mode=4, is_test=is_test)
+
+    # Find the split lines between multiple tables and get the per-table areas
+    start_time = time.time()
+    split_lines, split_y = get_split_line(cross_points, col_line_list, img_new)
+    area_row_line_list, area_col_line_list, area_point_list = get_split_area(split_y, row_line_list, col_line_list, cross_points)
+    log("pdf get_split_area " + str(time.time() - start_time))
+    show(split_lines, title="split_lines", img=img_show, mode=3, is_test=is_test)
+
+    # Loop over the areas
+    need_split_flag = False
+    for i in range(len(area_point_list)):
+        sub_row_line_list = area_row_line_list[i]
+        sub_col_line_list = area_col_line_list[i]
+        sub_point_list = area_point_list[i]
+
+        # Repair the outer border
+        start_time = time.time()
+        new_rows, new_cols, long_rows, long_cols = fix_outline(img_new,
+                                                               sub_row_line_list,
+                                                               sub_col_line_list,
+                                                               sub_point_list)
+
+        # If any lines were added
+        if new_rows or new_cols:
+            # Use the lines extended to connect to the added lines
+            if long_rows:
+                sub_row_line_list = long_rows
+            if long_cols:
+                sub_col_line_list = long_cols
+            # Append the newly added lines
+            if new_rows:
+                sub_row_line_list += new_rows
+            if new_cols:
+                sub_col_line_list += new_cols
+            need_split_flag = True
+            area_row_line_list[i] = sub_row_line_list
+            area_col_line_list[i] = sub_col_line_list
+
+    row_line_list = [y for x in area_row_line_list for y in x]
+    col_line_list = [y for x in area_col_line_list for y in x]
+
+    if need_split_flag:
+        # Recompute the intersections and areas after the border repair
+        cross_points = get_points(row_line_list, col_line_list, (img_new.shape[0], img_new.shape[1]))
+        split_lines, split_y = get_split_line(cross_points, col_line_list, img_new)
+        area_row_line_list, area_col_line_list, area_point_list = get_split_area(split_y, row_line_list, col_line_list, cross_points)
+    log("pdf fix_outline " + str(time.time() - start_time))  # start_time here is the one set by the last loop iteration (or by the split step if there were no areas)
+
+    # Loop over the areas
+    for i in range(len(area_point_list)):
+        sub_row_line_list = area_row_line_list[i]
+        sub_col_line_list = area_col_line_list[i]
+        sub_point_list = area_point_list[i]
+
+        # Repair missing inner lines. NOTE(review): the lists returned by fix_inner are only bound to loop-local names and never stored back into area_row_line_list[i] / area_col_line_list[i]; unless fix_inner mutates its arguments in place, the flattening below ignores its result - confirm
+        start_time = time.time()
+        sub_row_line_list, sub_col_line_list = fix_inner(sub_row_line_list, sub_col_line_list, sub_point_list)
+        log("pdf fix_inner " + str(time.time() - start_time))
+        show(sub_row_line_list + sub_col_line_list, title="fix_inner1", mode=2, is_test=is_test)
+
+        # Recompute the intersections after the inner-line repair (the result is only passed to show below)
+        start_time = time.time()
+        cross_points = get_points(sub_row_line_list, sub_col_line_list, (img_new.shape[0], img_new.shape[1]))
+        show(cross_points, title="get_points3", img=img_show, mode=4, is_test=is_test)
+
+    row_line_list = [y for x in area_row_line_list for y in x]
+    col_line_list = [y for x in area_col_line_list for y in x]
+
+    line_list = row_line_list + col_line_list
+    # Show the processed lines
+    show(line_list, title="all", img=img_show, mode=5, is_test=is_test)
+    log("pdf otr postprocess table_line " + str(time.time() - start_time))
+    return line_list
+
+
 def show(pred_or_lines, title='', prob=0.2, img=None, mode=1, is_test=0):
     if not is_test:
         return