소스 검색

pdf表格加入处理步骤

fangjiasheng 3 년 전
부모
커밋
5caaa75d49
3개의 변경된 파일307개의 추가작업 그리고 34개의 파일을 삭제
  1. 1 1
      format_convert/convert.py
  2. 161 3
      otr/table_line.py
  3. 145 30
      result.html

+ 1 - 1
format_convert/convert.py

@@ -2605,7 +2605,7 @@ else:
         _path = os.path.dirname(os.path.abspath(__file__))
 if __name__ == '__main__':
     if get_platform() == "Windows":
-        file_path = "C:/Users/Administrator/Desktop/error14-1.pdf"
+        file_path = "C:/Users/Administrator/Desktop/error7.jpg"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/1622529434414.rar"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624934320560.pdf"
     else:

+ 161 - 3
otr/table_line.py

@@ -197,16 +197,16 @@ def table_net(input_shape=(1152, 896, 3), num_classes=1):
 model = table_net((None, None, 3), 2)
 
 
-def draw_pixel(pred):
+def draw_pixel(pred, prob=0.2):
     import matplotlib.pyplot as plt
     _array = []
     for _h in range(len(pred)):
         _line = []
         for _w in range(len(pred[_h])):
             _prob = pred[_h][_w]
-            if _prob[0]>0.5:
+            if _prob[0]>prob:
                 _line.append((0,255,255))
-            elif _prob[1]>0.5:
+            elif _prob[1]>prob:
                 _line.append((255,255,0))
             else:
                 _line.append((255,255,255))
@@ -367,6 +367,77 @@ def points2lines(pred, sourceP_LB=True, prob=0.2, line_width=7, padding=3, min_l
     return list_line
 
 
def get_line_from_binary_image(image_np, point_value=1, axis=0):
    """
    Convert the marked pixels of a binary image into axis-aligned line segments.

    Pixels whose value is greater than or equal to ``point_value`` are
    collected, grouped by row (``axis=0``) or by column (``axis=1``), and
    every run of consecutive pixels inside a group becomes one segment.
    Only horizontal and vertical lines are supported, and the image is
    expected to be a 2-D (binarized) array.

    :param image_np: 2-D numpy image
    :param point_value: threshold; pixels with value >= point_value are used
    :param axis: 0 to extract horizontal lines (runs along x within a row),
                 1 to extract vertical lines (runs along y within a column)
    :return: list of segments ``[x1, y1, x2, y2]`` (end points inclusive);
             an isolated pixel yields a zero-length segment; an image with
             no matching pixel yields ``[]``
    """
    def get_axis_points(_list, axis=0):
        # Sort by the fixed coordinate first, then by the running coordinate,
        # so points of one row/column are adjacent and ordered along the line.
        _list.sort(key=lambda x: (x[1-axis], x[axis]))

        axis_points = []
        for p in _list:
            # Same row/column as the current group -> extend it; otherwise
            # open a new group that starts with this very point.
            if axis_points and axis_points[-1][0][1-axis] == p[1-axis]:
                axis_points[-1].append(p)
            else:
                axis_points.append([p])
        return axis_points

    def get_axis_lines(_list, axis=0):
        # One row/column may hold several separate lines: split each group
        # wherever the running coordinate is not contiguous.
        lines = []
        for axis_list in _list:
            start = prev = axis_list[0]
            for p in axis_list[1:]:
                if p[axis] - prev[axis] != 1:
                    # Gap found: close the current run and start a new one.
                    lines.append([start[0], start[1], prev[0], prev[1]])
                    start = p
                prev = p
            # Close the last run of this row/column.
            lines.append([start[0], start[1], prev[0], prev[1]])
        return lines

    # Coordinates of the pixels whose value is >= point_value.
    # np.where yields (row indices, column indices), i.e. (ys, xs).
    ys, xs = np.where(image_np >= point_value)
    points = [[int(x), int(y)] for x, y in zip(xs, ys)]

    # Group the points by row/column.
    axis_points = get_axis_points(points, axis)

    # Turn every contiguous run inside a row/column into a two-point line.
    axis_lines = get_axis_lines(axis_points, axis)

    return axis_lines
+
+
 def table_line(img, model, size=(512, 1024), hprob=0.5, vprob=0.5, row=50, col=30, alph=15):
     sizew, sizeh = size
     img_new = cv2.resize(img, (sizew, sizeh), interpolation=cv2.INTER_AREA)
@@ -374,8 +445,95 @@ def table_line(img, model, size=(512, 1024), hprob=0.5, vprob=0.5, row=50, col=3
     pred = model.predict(np.array([img_new]))
     pred = pred[0]
 
+    # 横线预测结果
+    # row_pred = pred[..., 0] > hprob
+    # row_pred = row_pred.astype(np.uint8)
+    # # 竖线预测结果
+    # col_pred = pred[..., 1] > vprob
+    # col_pred = col_pred.astype(np.uint8)
+    # # 打印模型输出
+    # cv2.imshow("predict", (col_pred+row_pred)*255)
+    # cv2.waitKey(0)
+
     _time = time.time()
     list_line = points2lines(pred, False)
+
+    # 分成横竖线
+    list_rows = []
+    list_cols = []
+    for line in list_line:
+        if line[0] == line[2]:
+            list_cols.append(line)
+        elif line[1] == line[3]:
+            list_rows.append(line)
+
+    # 删掉贴着边框的line
+    temp_list = []
+    threshold = 5
+    for line in list_rows:
+        if line[1]-0 <= threshold or size[1]-line[1] <= threshold:
+            continue
+        # 内部排序
+        if line[0] > line[2]:
+            line = [line[2], line[3], line[0], line[1]]
+        temp_list.append(line)
+    list_rows = temp_list
+    temp_list = []
+    for line in list_cols:
+        if line[0]-0 <= threshold or size[0]-line[0] <= threshold:
+            continue
+        # 内部排序
+        if line[1] > line[3]:
+            line = [line[2], line[3], line[0], line[1]]
+        temp_list.append(line)
+    list_cols = temp_list
+    if not list_rows or not list_cols:
+        return []
+
+    # 合并错开线
+    list_rows = merge_line(list_rows, axis=0)
+    list_cols = merge_line(list_cols, axis=1)
+
+    # 计算交点、分割线
+    cross_points = get_points(list_rows, list_cols, (img_new.shape[0], img_new.shape[1]))
+    if not cross_points:
+        return []
+
+    # 清掉外围的没用的线
+    list_rows, list_cols = delete_outline(list_rows, list_cols, cross_points)
+
+    # 多个表格分割线
+    split_lines, split_y = get_split_line(cross_points, list_cols, img_new)
+
+    # 修复边框
+    new_rows, new_cols, long_rows, long_cols = fix_outline(img_new, list_rows, list_cols, cross_points,
+                                                           split_y)
+    # 如有补线
+    if new_rows or new_cols:
+        # 连接至补线的延长线
+        if long_rows:
+            list_rows = long_rows
+        if long_cols:
+            list_cols = long_cols
+        # 新的补线
+        if new_rows:
+            list_rows += new_rows
+        if new_cols:
+            list_cols += new_cols
+
+        # 修复边框后重新计算交点、分割线
+        cross_points = get_points(list_rows, list_cols, (img_new.shape[0], img_new.shape[1]))
+        split_lines, split_y = get_split_line(cross_points, list_cols, img_new)
+
+        # 修复内部缺线
+        cross_points = fix_inner(list_rows, list_cols, cross_points, split_y)
+        if not cross_points:
+            return []
+
+    # 修复表格4个角
+    list_rows, list_cols = fix_corner(list_rows, list_cols, split_y)
+
+    list_line = list_rows + list_cols
     return list_line
 
 

+ 145 - 30
result.html

@@ -1,36 +1,151 @@
 <!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><table border="1">
 <tr>
-<td colspan=1 rowspan=1>新港园区实验小学、实验幼儿园空调采购工程竞争性碳商文件管盛人项目编号:TMSD-HS-2021035采购人:黄石新港(物流)工业园区社会发展局招标代理:天马盛鼎项目管理有限公司二0一年六月</td>
+<td colspan=1 rowspan=1>9</td>
+<td colspan=1 rowspan=1>PPR90°弯头</td>
+<td colspan=1 rowspan=1>20</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>38893</td>
+<td colspan=1 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
 </tr>
-</table>
-<div>理有</div>
-<div>E</div>
-<div>绿鼎项目</div>
-<div>碳商文件备案表</div>
-<div>工程名称:新港园区实验小学、实验幼儿园空调采购工程</div>
-<div>采购人:黄石新港(物流)工业园区社会发展局</div>
-<div>招标代理机构:天马盛鼎项目管理有限公司</div>
-<div>代理机构意见:</div>
-<div>盖章</div>
-<div>6月29日</div>
-<div>201年</div>
-<table border="1">
-<tr>
-<td colspan=1 rowspan=1>同意采购单位意见:盖章月29日202年平6月</td>
+<tr>
+<td colspan=1 rowspan=1>10</td>
+<td colspan=1 rowspan=1>PPR90°弯头</td>
+<td colspan=1 rowspan=1>25</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>54352</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>11</td>
+<td colspan=1 rowspan=1>PPR90°弯头</td>
+<td colspan=1 rowspan=1>32</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>5850</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>12</td>
+<td colspan=1 rowspan=1>PPR三通</td>
+<td colspan=1 rowspan=1>25*25</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>9460</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>13</td>
+<td colspan=1 rowspan=1>PPR中小三通</td>
+<td colspan=1 rowspan=1>25*20</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>8751</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>14</td>
+<td colspan=1 rowspan=1>PPR大小头</td>
+<td colspan=1 rowspan=1>25*20</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>10169</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>15</td>
+<td colspan=1 rowspan=1>PPR直接</td>
+<td colspan=1 rowspan=1>25</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>5500</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>16</td>
+<td colspan=1 rowspan=1>PPR直接</td>
+<td colspan=1 rowspan=1>20</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>1460</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>17</td>
+<td colspan=1 rowspan=1>PPR直接</td>
+<td colspan=1 rowspan=1>32</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>1020</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>18</td>
+<td colspan=1 rowspan=1>PPR过桥弯</td>
+<td colspan=1 rowspan=1>25</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>10877</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>19</td>
+<td colspan=1 rowspan=1>PPR过桥弯</td>
+<td colspan=1 rowspan=1>20</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>7350</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>20</td>
+<td colspan=1 rowspan=1>PPR45°弯头</td>
+<td colspan=1 rowspan=1>25</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>11206</td>
+<td colspan=2 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>21</td>
+<td colspan=1 rowspan=1>PPR截止阀</td>
+<td colspan=1 rowspan=1>25</td>
+<td colspan=1 rowspan=1>个</td>
+<td colspan=1 rowspan=1>1360</td>
+<td colspan=1 rowspan=1>2020年5月至2022年3月</td>
+<td colspan=1 rowspan=1>西安洛悦府项目施工现场</td>
+<td colspan=1 rowspan=1></td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>22</td>
+<td colspan=1 rowspan=1>合计</td>
+<td colspan=1 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+<td colspan=1 rowspan=1>314402</td>
+<td colspan=1 rowspan=1>2020年5月至2022年3月</td>
+<td colspan=1 rowspan=1>西安洛悦府项目施工现场</td>
+<td colspan=2 rowspan=12></td>
+<td colspan=1 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
+<td colspan=1 rowspan=1></td>
 </tr>
 </table>
-<div> </div>
-<div></div>
-<div> </div>
-<div></div>
-<div> </div>
-<div></div>
-<div> </div>
-<div></div>
-<div> </div>
-<div></div>
-<div> </div>
-<div></div>
-<div>1 </div>
-<div></div>
+<div>注:本次招标数量仅为估算量,以实际工程需求供应,招标文件及需求一览</div>
+<div>表中的数量仅作为投标报价时的计价依据,不作为最终结算量。中标人签订供货</div>
+<div>合同时将进一步明确供应品种、规格、数量。本次招标数量与合同中数量可能存</div>
+<div>在差异,均以现场实际交货验收合格数量为准,该数量与合同数量也可能将存在</div>
+<div>差异。中标人不得因此向招标人及合同签订买方提出任何补偿要求。</div>
+<div>3.2投标人中标后,不允许对中标物资进行生产拆包、转包、违法分包。</div>
+<div>3.3若因投资计划、征地拆迁、设计变更等原因,致使投资规模、工程量或</div>
+<div>供货品种、供货时间发生较大规模改变,采购量及采购品种相应调整,投标人应</div>
+<div>予接受,并不得以此作为调价和索赔依据。</div>
+<div>4、交货时间:工地开工至合同工期完工(计划工期48个月、详细交货规</div>
+<div>格、数量和具体供货时间由买方签约时提供,依据工地需求分批组织供应)。</div>
+<div>5、交货地点:中国水利水电第三工程局有限公司基础建筑分局西安铭悦府</div>
+<div>项目。</div>
+<div>6、质量要求:必须符合满足招标文件技术要求及国家或行业规范标准要求。</div>
+<div>,品牌要求:</div>
+<div>品牌</div>
 </body>