Переглянути джерело

otr图片过大内存溢出解决,otr阈值调整

fangjiasheng 3 роки тому
батько
коміт
70c18e953a

+ 7 - 6
format_convert/convert.py

@@ -16,6 +16,7 @@ from format_convert.convert_zip import zip2text, ZipConvert
 
 import hashlib
 from format_convert import get_memory_info
+from format_convert.judge_platform import get_platform
 from ocr import ocr_interface
 from otr import otr_interface
 import re
@@ -2563,11 +2564,11 @@ def convert(data, ocr_model, otr_model):
                         "is_success": 0}
 
         # 结果保存result.html
-        # if get_platform() == "Windows":
-        text_str = ""
-        for t in text:
-            text_str += t
-        to_html("../result.html", text_str)
+        if get_platform() == "Windows":
+            text_str = ""
+            for t in text:
+                text_str += t
+            to_html("../result.html", text_str)
 
         # 取纯文本
         only_text = []
@@ -2605,7 +2606,7 @@ else:
         _path = os.path.dirname(os.path.abspath(__file__))
 if __name__ == '__main__':
     if get_platform() == "Windows":
-        file_path = "C:/Users/Administrator/Desktop/error16.pdf"
+        file_path = "C:/Users/Administrator/Desktop/error3.pdf"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/1622529434414.rar"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
     else:

+ 4 - 3
format_convert/convert_image.py

@@ -11,7 +11,7 @@ from format_convert.table_correct import get_rotated_image
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
 
 
-def image_process(image_np, image_path, use_ocr=True):
+def image_process(image_np, image_path, is_from_pdf, use_ocr=True):
     from format_convert.convert_tree import _Table, _Sentence
 
     def get_cluster(t_list, b_list, axis):
@@ -92,7 +92,7 @@ def image_process(image_np, image_path, use_ocr=True):
         # 调用otr模型接口
         with open(image_resize_path, "rb") as f:
             image_bytes = f.read()
-        list_line = from_otr_interface(image_bytes)
+        list_line = from_otr_interface(image_bytes, is_from_pdf)
         if judge_error_code(list_line):
             return list_line
 
@@ -107,6 +107,7 @@ def image_process(image_np, image_path, use_ocr=True):
         with open(image_path, "rb") as f:
             image_bytes = f.read()
         text_list, bbox_list = from_ocr_interface(image_bytes, True)
+        # print("convert_image", text_list)
         if judge_error_code(text_list):
             return text_list
 
@@ -178,7 +179,7 @@ def picture2text(path, html=False):
 def get_best_predict_size(image_np, times=64):
     sizes = []
     for i in range(1, 100):
-        if i*times <= 3000:
+        if i*times <= 1300:
             sizes.append(i*times)
     sizes.sort(key=lambda x: x, reverse=True)
 

+ 2 - 2
format_convert/convert_need_interface.py

@@ -134,7 +134,7 @@ def from_otr_interface2(image_stream):
         return [-1], [-1], [-1], [-1], [-1]
 
 
-def from_otr_interface(image_stream):
+def from_otr_interface(image_stream, is_from_pdf=False):
     logging.info("into from_otr_interface")
     try:
         base64_stream = base64.b64encode(image_stream)
@@ -144,7 +144,7 @@ def from_otr_interface(image_stream):
             if globals().get("global_otr_model") is None:
                 globals().update({"global_otr_model": OtrModels().get_model()})
                 print("=========== init otr model ===========")
-            r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
+            r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"), is_from_pdf=is_from_pdf)
         except TimeoutError:
             return [-5]
         except requests.exceptions.ConnectionError as e:

+ 3 - 1
format_convert/convert_pdf.py

@@ -26,7 +26,7 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
     LTTextBoxVertical, LTLine
 from format_convert import get_memory_info
-from utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable
+from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable
 import fitz
 
 
@@ -737,6 +737,7 @@ class PDFConvert:
                 self._page.error_code = page_image
             else:
                 _image = _Image(page_image[1], page_image[0])
+                _image.is_from_pdf = True
                 self._page.add_child(_image)
 
         # 正常读取该页对象
@@ -778,6 +779,7 @@ class PDFConvert:
                             self._page.error_code = page_image
                         else:
                             _image = _Image(page_image[1], page_image[0])
+                            _image.is_from_pdf = True
                             self._page.add_child(_image)
                         return
                     # 比较小的图则直接保存用ocr识别

+ 1 - 1
format_convert/convert_tree.py

@@ -120,7 +120,7 @@ class _Image:
         # 二进制转numpy
         image_np = Image.open(io.BytesIO(self.content))
         image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
-        obj_list = image_process(image_np, self.path, use_ocr=True)
+        obj_list = image_process(image_np, self.path, self.is_from_pdf, use_ocr=True)
         if judge_error_code(obj_list):
             self.error_code = obj_list
             return

+ 9 - 5
otr/otr_interface.py

@@ -27,11 +27,14 @@ def log(msg):
     logger.info(msg)
 
 
-def otr(data, otr_model):
+def otr(data, otr_model, is_from_pdf):
     try:
         img_data = base64.b64decode(data)
         # points_and_lines = pool.apply(table_detect, (img_data,))
-        list_lines = line_detect(img_data, otr_model)
+        if is_from_pdf:
+            list_lines = line_detect(img_data, otr_model, prob=0.2)
+        else:
+            list_lines = line_detect(img_data, otr_model, prob=0.5)
         return list_lines
     except TimeoutError:
         raise TimeoutError
@@ -263,7 +266,7 @@ def table_detect2(img_data, otr_model):
                 "outline_points": str([]), "lines": str([])}
 
 
-def line_detect(img_data, otr_model):
+def line_detect(img_data, otr_model, prob=0.2):
     logging.info("into otr_interface table_detect")
     start_time = time.time()
     try:
@@ -279,14 +282,15 @@ def line_detect(img_data, otr_model):
 
         # 选择与图片最接近分辨率,以防失真
         # best_h, best_w = get_best_predict_size(img)
-        print("image_np.shape", image_np.shape)
+        logging.info("image_np.shape" + str(image_np.shape))
         best_h, best_w, _ = image_np.shape
         logging.info("otr preprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
 
         # 调用模型
         # rows, cols = table_line(image_np, otr_model)
         start_time1 = time.time()
-        list_line = table_line(image_np, otr_model, size=(best_w, best_h))
+        list_line = table_line(image_np, otr_model, size=(best_w, best_h), prob=prob)
+        logging.info("otr finish " + str(round(float(time.time()-start_time1), 4)) + "s")
         return {"list_line": str(list_line)}
     except TimeoutError:
         raise TimeoutError

+ 4 - 3
otr/table_line.py

@@ -441,14 +441,15 @@ def get_line_from_binary_image(image_np, point_value=1, axis=0):
     return axis_lines
 
 
-def table_line(img, model, size=(512, 1024), is_test=0):
+def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
+    logging.info("into table_line, prob is " + str(prob))
     sizew, sizeh = size
     img_new = cv2.resize(img, (sizew, sizeh), interpolation=cv2.INTER_AREA)
 
     pred = model.predict(np.array([img_new]))
     pred = pred[0]
 
-    # draw_pixel(pred, is_test)
+    # draw_pixel(pred)
 
     # 横线预测结果
     # row_pred = pred[..., 0] > hprob
@@ -461,7 +462,7 @@ def table_line(img, model, size=(512, 1024), is_test=0):
     # cv2.waitKey(0)
 
     _time = time.time()
-    list_line = points2lines(pred, False)
+    list_line = points2lines(pred, False, prob=prob)
     mat_plot(list_line, "points2lines", is_test)
 
     # 清除短线

BIN
package_2022_03_22/convert_otr.zip


+ 60 - 101
result.html

@@ -1,110 +1,69 @@
-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>询价公告</div>
-<div>一、采购单编号:P-XJ22010799</div>
-<div>二、采购单名称:弯头自采</div>
-<div>三、报价截止时间:2022-03-2517:30</div>
-<div>四、报价有效期:2022-06-1900:00</div>
-<div>五、组织形式:自行采购</div>
-<div>六、采购单位:山西漳山发电有限责任公司</div>
-<div>七、联系人:刘军</div>
-<div>八、联系方式:13333556610</div>
-<div>九、询价类型:公开</div>
-<div>具体规格、技术指标及售后服务要求等详见下表。</div>
+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>华池县柔远镇李庄肉牛养殖场建设项目配</div>
+<div>套设备购置政府采购公开招标中标公告</div>
+<div>、项目编号</div>
+<div>HCZC2021-0001</div>
+<div>二、项目名称</div>
+<div>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</div>
+<div>三、中标(成交)信息</div>
 <table border="1">
 <tr>
-<td colspan=1 rowspan=1>序
-号
-</td>
-<td colspan=1 rowspan=1>产品描述
-</td>
-<td colspan=1 rowspan=1>采购数量
-</td>
-<td colspan=1 rowspan=1>计量单位
-</td>
-<td colspan=1 rowspan=1>税率
-</td>
-<td colspan=1 rowspan=1>交付时间
-</td>
-<td colspan=1 rowspan=1>交货地点
-</td>
-<td colspan=1 rowspan=1>采购需求
-单位
-</td>
-<td colspan=1 rowspan=1>行项目备
-注
-</td>
+<td colspan=1 rowspan=1>供应商名称</td>
+<td colspan=1 rowspan=1>供应商联系地址</td>
+<td colspan=1 rowspan=1>中标金额(万元)</td>
 </tr>
 <tr>
-<td colspan=1 rowspan=1>1
-</td>
-<td colspan=1 rowspan=1>弯头|弯头\\DN20
-PN10 内丝\\|无|无
-</td>
-<td colspan=1 rowspan=1>20
-</td>
-<td colspan=1 rowspan=1>个
-</td>
-<td colspan=1 rowspan=1>13%
-进项
-税
-,中
-国(新
-)
-</td>
-<td colspan=1 rowspan=1>2022-04-
-05
-</td>
-<td colspan=1 rowspan=1>漳山发电工厂
-</td>
-<td colspan=1 rowspan=1>山西漳山
-发电有限
-责任公司
-</td>
-<td colspan=1 rowspan=1>弯头
-\\DN20
-PN10 内丝
-\\
-</td>
+<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
+<td colspan=1 rowspan=1>甘肃省庆阳市华池县柔远镇张川村</td>
+<td colspan=1 rowspan=1>72.3500</td>
+</tr>
+</table>
+<div>四、主要标的信息</div>
+<table border="1">
+<tr>
+<td colspan=1 rowspan=1>货物类</td>
+<td colspan=1 rowspan=1>货物类</td>
+<td colspan=1 rowspan=1>货物类</td>
+<td colspan=1 rowspan=1>货物类</td>
+<td colspan=1 rowspan=1>货物类</td>
+<td colspan=1 rowspan=1>货物类</td>
+</tr>
+<tr>
+<td colspan=1 rowspan=1>供应商名称</td>
+<td colspan=1 rowspan=1>名称</td>
+<td colspan=1 rowspan=1>品牌</td>
+<td colspan=1 rowspan=1>数量</td>
+<td colspan=1 rowspan=1>单价</td>
+<td colspan=1 rowspan=1>规格型号</td>
 </tr>
 <tr>
-<td colspan=1 rowspan=1>2
-</td>
-<td colspan=1 rowspan=1>弯头|弯头\\DN25
-PN10 内丝\\|无|无
-</td>
-<td colspan=1 rowspan=1>10
-</td>
-<td colspan=1 rowspan=1>个
-</td>
-<td colspan=1 rowspan=1>13%
-进项
-税
-,中
-国(新
-)
-</td>
-<td colspan=1 rowspan=1>2022-04-
-05
-</td>
-<td colspan=1 rowspan=1>漳山发电工厂
-</td>
-<td colspan=1 rowspan=1>山西漳山
-发电有限
-责任公司
-</td>
-<td colspan=1 rowspan=1>弯头
-\\DN25
-PN10 内丝
-\\
-</td>
+<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
+<td colspan=1 rowspan=1>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</td>
+<td colspan=1 rowspan=1>详见附件</td>
+<td colspan=1 rowspan=1>详见附件</td>
+<td colspan=1 rowspan=1>详见附件</td>
+<td colspan=1 rowspan=1>详见附件</td>
 </tr>
 </table>
-<div>十、报价要求:请根据明细清单填报不含税单价</div>
-<div>十一、付款方式:货到90%付款:乙方在合同规定的交货期内将货物运达指定交货地点并验收合格后</div>
-<div>,于15个工作日内向甲方提供金额为合同总价格的增值税发票和90%收据,经甲方审核无误后,六十</div>
-<div>(60)个工作日内向乙方支付合同总价格90%的货款,剩余10%为质保金,质保金1年后支付;具体条款</div>
-<div>以成交供应商与采购需求单位合同签订为准,请供应商充分考虑付款方式可能对报价产生的影响。</div>
-<div>十二、是否需要强制上传应答文件:否</div>
-<div>十三、其他内容:无</div>
-<div>山西漳山发电有限责任公司</div>
-<div>2022-03-22</div>
+<div>五、评审专家(单一来源采购人员)名单:</div>
+<div>王正刚、段海龙、李鑫、刘翠平、张武峰</div>
+<div>六、代理服务收费标准及金额:</div>
+<div>收费标准:无</div>
+<div>收费金额:0万元</div>
+<div>七、公告期限</div>
+<div>自本公告发布之日起1个工作日。</div>
+<div>八、其他补充事宜</div>
+<div>无</div>
+<div>九、凡对本次公告内容提出询问,请按以下方式联系。</div>
+<div>1.采购人信息</div>
+<div>名称:华池县柔远镇人民政府</div>
+<div>地址:华池县东关街70号</div>
+<div>联系方式:0934-5952951</div>
+<div>2.采购代理机构信息</div>
+<div>名称:华池县公共资源交易中心</div>
+<div>地址:华池县东关街22号</div>
+<div>联系方式:0934-5953080</div>
+<div>3.项目联系方式</div>
+<div>项目联系人:孙治江</div>
+<div>电话:18793418165</div>
+<div>2</div>
 </body>