4 роки тому · 70c18e953a
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -16,6 +16,7 @@ from format_convert.convert_zip import zip2text, ZipConvert
 
				 
			
 
				 import hashlib
			
 
				 from format_convert import get_memory_info
			
 
				+from format_convert.judge_platform import get_platform
			
 
				 from ocr import ocr_interface
			
 
				 from otr import otr_interface
			
 
				 import re
			
@@ -2563,11 +2564,11 @@ def convert(data, ocr_model, otr_model):
 
				                         "is_success": 0}
			
 
				 
			
 
				         # 结果保存result.html
			
 
				-        # if get_platform() == "Windows":
			
 
				-        text_str = ""
			
 
				-        for t in text:
			
 
				-            text_str += t
			
 
				-        to_html("../result.html", text_str)
			
 
				+        if get_platform() == "Windows":
			
 
				+            text_str = ""
			
 
				+            for t in text:
			
 
				+                text_str += t
			
 
				+            to_html("../result.html", text_str)
			
 
				 
			
 
				         # 取纯文本
			
 
				         only_text = []
			
@@ -2605,7 +2606,7 @@ else:
 
				         _path = os.path.dirname(os.path.abspath(__file__))
			
 
				 if __name__ == '__main__':
			
 
				     if get_platform() == "Windows":
			
 
				-        file_path = "C:/Users/Administrator/Desktop/error16.pdf"
			
 
				+        file_path = "C:/Users/Administrator/Desktop/error3.pdf"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/1622529434414.rar"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
			
 
				     else:
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -11,7 +11,7 @@ from format_convert.table_correct import get_rotated_image
 
				 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
			
 
				 
			
 
				 
			
 
				-def image_process(image_np, image_path, use_ocr=True):
			
 
				+def image_process(image_np, image_path, is_from_pdf, use_ocr=True):
			
 
				     from format_convert.convert_tree import _Table, _Sentence
			
 
				 
			
 
				     def get_cluster(t_list, b_list, axis):
			
@@ -92,7 +92,7 @@ def image_process(image_np, image_path, use_ocr=True):
 
				         # 调用otr模型接口
			
 
				         with open(image_resize_path, "rb") as f:
			
 
				             image_bytes = f.read()
			
 
				-        list_line = from_otr_interface(image_bytes)
			
 
				+        list_line = from_otr_interface(image_bytes, is_from_pdf)
			
 
				         if judge_error_code(list_line):
			
 
				             return list_line
			
 
				 
			
@@ -107,6 +107,7 @@ def image_process(image_np, image_path, use_ocr=True):
 
				         with open(image_path, "rb") as f:
			
 
				             image_bytes = f.read()
			
 
				         text_list, bbox_list = from_ocr_interface(image_bytes, True)
			
 
				+        # print("convert_image", text_list)
			
 
				         if judge_error_code(text_list):
			
 
				             return text_list
			
 
				 
			
@@ -178,7 +179,7 @@ def picture2text(path, html=False):
 
				 def get_best_predict_size(image_np, times=64):
			
 
				     sizes = []
			
 
				     for i in range(1, 100):
			
 
				-        if i*times <= 3000:
			
 
				+        if i*times <= 1300:
			
 
				             sizes.append(i*times)
			
 
				     sizes.sort(key=lambda x: x, reverse=True)
			
 
				 
			
--- a/format_convert/convert_need_interface.py
+++ b/format_convert/convert_need_interface.py
@@ -134,7 +134,7 @@ def from_otr_interface2(image_stream):
 
				         return [-1], [-1], [-1], [-1], [-1]
			
 
				 
			
 
				 
			
 
				-def from_otr_interface(image_stream):
			
 
				+def from_otr_interface(image_stream, is_from_pdf=False):
			
 
				     logging.info("into from_otr_interface")
			
 
				     try:
			
 
				         base64_stream = base64.b64encode(image_stream)
			
@@ -144,7 +144,7 @@ def from_otr_interface(image_stream):
 
				             if globals().get("global_otr_model") is None:
			
 
				                 globals().update({"global_otr_model": OtrModels().get_model()})
			
 
				                 print("=========== init otr model ===========")
			
 
				-            r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
			
 
				+            r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"), is_from_pdf=is_from_pdf)
			
 
				         except TimeoutError:
			
 
				             return [-5]
			
 
				         except requests.exceptions.ConnectionError as e:
			
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -26,7 +26,7 @@ from pdfminer.converter import PDFPageAggregator
 
				 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
			
 
				     LTTextBoxVertical, LTLine
			
 
				 from format_convert import get_memory_info
			
 
				-from utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable
			
 
				+from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable
			
 
				 import fitz
			
 
				 
			
 
				 
			
@@ -737,6 +737,7 @@ class PDFConvert:
 
				                 self._page.error_code = page_image
			
 
				             else:
			
 
				                 _image = _Image(page_image[1], page_image[0])
			
 
				+                _image.is_from_pdf = True
			
 
				                 self._page.add_child(_image)
			
 
				 
			
 
				         # 正常读取该页对象
			
@@ -778,6 +779,7 @@ class PDFConvert:
 
				                             self._page.error_code = page_image
			
 
				                         else:
			
 
				                             _image = _Image(page_image[1], page_image[0])
			
 
				+                            _image.is_from_pdf = True
			
 
				                             self._page.add_child(_image)
			
 
				                         return
			
 
				                     # 比较小的图则直接保存用ocr识别
			
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -120,7 +120,7 @@ class _Image:
 
				         # 二进制转numpy
			
 
				         image_np = Image.open(io.BytesIO(self.content))
			
 
				         image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
			
 
				-        obj_list = image_process(image_np, self.path, use_ocr=True)
			
 
				+        obj_list = image_process(image_np, self.path, self.is_from_pdf, use_ocr=True)
			
 
				         if judge_error_code(obj_list):
			
 
				             self.error_code = obj_list
			
 
				             return
			
--- a/otr/otr_interface.py
+++ b/otr/otr_interface.py
@@ -27,11 +27,14 @@ def log(msg):
 
				     logger.info(msg)
			
 
				 
			
 
				 
			
 
				-def otr(data, otr_model):
			
 
				+def otr(data, otr_model, is_from_pdf):
			
 
				     try:
			
 
				         img_data = base64.b64decode(data)
			
 
				         # points_and_lines = pool.apply(table_detect, (img_data,))
			
 
				-        list_lines = line_detect(img_data, otr_model)
			
 
				+        if is_from_pdf:
			
 
				+            list_lines = line_detect(img_data, otr_model, prob=0.2)
			
 
				+        else:
			
 
				+            list_lines = line_detect(img_data, otr_model, prob=0.5)
			
 
				         return list_lines
			
 
				     except TimeoutError:
			
 
				         raise TimeoutError
			
@@ -263,7 +266,7 @@ def table_detect2(img_data, otr_model):
 
				                 "outline_points": str([]), "lines": str([])}
			
 
				 
			
 
				 
			
 
				-def line_detect(img_data, otr_model):
			
 
				+def line_detect(img_data, otr_model, prob=0.2):
			
 
				     logging.info("into otr_interface table_detect")
			
 
				     start_time = time.time()
			
 
				     try:
			
@@ -279,14 +282,15 @@ def line_detect(img_data, otr_model):
 
				 
			
 
				         # 选择与图片最接近分辨率，以防失真
			
 
				         # best_h, best_w = get_best_predict_size(img)
			
 
				-        print("image_np.shape", image_np.shape)
			
 
				+        logging.info("image_np.shape" + str(image_np.shape))
			
 
				         best_h, best_w, _ = image_np.shape
			
 
				         logging.info("otr preprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
			
 
				 
			
 
				         # 调用模型
			
 
				         # rows, cols = table_line(image_np, otr_model)
			
 
				         start_time1 = time.time()
			
 
				-        list_line = table_line(image_np, otr_model, size=(best_w, best_h))
			
 
				+        list_line = table_line(image_np, otr_model, size=(best_w, best_h), prob=prob)
			
 
				+        logging.info("otr finish " + str(round(float(time.time()-start_time1), 4)) + "s")
			
 
				         return {"list_line": str(list_line)}
			
 
				     except TimeoutError:
			
 
				         raise TimeoutError
			
--- a/otr/table_line.py
+++ b/otr/table_line.py
@@ -441,14 +441,15 @@ def get_line_from_binary_image(image_np, point_value=1, axis=0):
 
				     return axis_lines
			
 
				 
			
 
				 
			
 
				-def table_line(img, model, size=(512, 1024), is_test=0):
			
 
				+def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
			
 
				+    logging.info("into table_line, prob is " + str(prob))
			
 
				     sizew, sizeh = size
			
 
				     img_new = cv2.resize(img, (sizew, sizeh), interpolation=cv2.INTER_AREA)
			
 
				 
			
 
				     pred = model.predict(np.array([img_new]))
			
 
				     pred = pred[0]
			
 
				 
			
 
				-    # draw_pixel(pred, is_test)
			
 
				+    # draw_pixel(pred)
			
 
				 
			
 
				     # 横线预测结果
			
 
				     # row_pred = pred[..., 0] > hprob
			
@@ -461,7 +462,7 @@ def table_line(img, model, size=(512, 1024), is_test=0):
 
				     # cv2.waitKey(0)
			
 
				 
			
 
				     _time = time.time()
			
 
				-    list_line = points2lines(pred, False)
			
 
				+    list_line = points2lines(pred, False, prob=prob)
			
 
				     mat_plot(list_line, "points2lines", is_test)
			
 
				 
			
 
				     # 清除短线
			
--- a/package_2022_03_22/convert_otr.zip
+++ b/package_2022_03_22/convert_otr.zip
--- a/result.html
+++ b/result.html
@@ -1,110 +1,69 @@
 
				-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>询价公告</div>
			
 
				-<div>一、采购单编号：P-XJ22010799</div>
			
 
				-<div>二、采购单名称：弯头自采</div>
			
 
				-<div>三、报价截止时间：2022-03-2517:30</div>
			
 
				-<div>四、报价有效期：2022-06-1900:00</div>
			
 
				-<div>五、组织形式：自行采购</div>
			
 
				-<div>六、采购单位：山西漳山发电有限责任公司</div>
			
 
				-<div>七、联系人：刘军</div>
			
 
				-<div>八、联系方式：13333556610</div>
			
 
				-<div>九、询价类型：公开</div>
			
 
				-<div>具体规格、技术指标及售后服务要求等详见下表。</div>
			
 
				+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>华池县柔远镇李庄肉牛养殖场建设项目配</div>
			
 
				+<div>套设备购置政府采购公开招标中标公告</div>
			
 
				+<div>、项目编号</div>
			
 
				+<div>HCZC2021-0001</div>
			
 
				+<div>二、项目名称</div>
			
 
				+<div>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</div>
			
 
				+<div>三、中标（成交）信息</div>
			
 
				 <table border="1">
			
 
				 <tr>
			
 
				-<td colspan=1 rowspan=1>序
			
 
				-号
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>产品描述
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>采购数量
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>计量单位
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>税率
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>交付时间
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>交货地点
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>采购需求
			
 
				-单位
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>行项目备
			
 
				-注
			
 
				-</td>
			
 
				+<td colspan=1 rowspan=1>供应商名称</td>
			
 
				+<td colspan=1 rowspan=1>供应商联系地址</td>
			
 
				+<td colspan=1 rowspan=1>中标金额（万元）</td>
			
 
				 </tr>
			
 
				 <tr>
			
 
				-<td colspan=1 rowspan=1>1
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>弯头|弯头\\DN20
			
 
				-PN10 内丝\\|无|无
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>20
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>个
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>13%
			
 
				-进项
			
 
				-税
			
 
				-，中
			
 
				-国(新
			
 
				-）
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>2022-04-
			
 
				-05
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>漳山发电工厂
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>山西漳山
			
 
				-发电有限
			
 
				-责任公司
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>弯头
			
 
				-\\DN20
			
 
				-PN10 内丝
			
 
				-\\
			
 
				-</td>
			
 
				+<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
			
 
				+<td colspan=1 rowspan=1>甘肃省庆阳市华池县柔远镇张川村</td>
			
 
				+<td colspan=1 rowspan=1>72.3500</td>
			
 
				+</tr>
			
 
				+</table>
			
 
				+<div>四、主要标的信息</div>
			
 
				+<table border="1">
			
 
				+<tr>
			
 
				+<td colspan=1 rowspan=1>货物类</td>
			
 
				+<td colspan=1 rowspan=1>货物类</td>
			
 
				+<td colspan=1 rowspan=1>货物类</td>
			
 
				+<td colspan=1 rowspan=1>货物类</td>
			
 
				+<td colspan=1 rowspan=1>货物类</td>
			
 
				+<td colspan=1 rowspan=1>货物类</td>
			
 
				+</tr>
			
 
				+<tr>
			
 
				+<td colspan=1 rowspan=1>供应商名称</td>
			
 
				+<td colspan=1 rowspan=1>名称</td>
			
 
				+<td colspan=1 rowspan=1>品牌</td>
			
 
				+<td colspan=1 rowspan=1>数量</td>
			
 
				+<td colspan=1 rowspan=1>单价</td>
			
 
				+<td colspan=1 rowspan=1>规格型号</td>
			
 
				 </tr>
			
 
				 <tr>
			
 
				-<td colspan=1 rowspan=1>2
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>弯头|弯头\\DN25
			
 
				-PN10 内丝\\|无|无
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>10
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>个
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>13%
			
 
				-进项
			
 
				-税
			
 
				-，中
			
 
				-国(新
			
 
				-）
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>2022-04-
			
 
				-05
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>漳山发电工厂
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>山西漳山
			
 
				-发电有限
			
 
				-责任公司
			
 
				-</td>
			
 
				-<td colspan=1 rowspan=1>弯头
			
 
				-\\DN25
			
 
				-PN10 内丝
			
 
				-\\
			
 
				-</td>
			
 
				+<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
			
 
				+<td colspan=1 rowspan=1>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</td>
			
 
				+<td colspan=1 rowspan=1>详见附件</td>
			
 
				+<td colspan=1 rowspan=1>详见附件</td>
			
 
				+<td colspan=1 rowspan=1>详见附件</td>
			
 
				+<td colspan=1 rowspan=1>详见附件</td>
			
 
				 </tr>
			
 
				 </table>
			
 
				-<div>十、报价要求：请根据明细清单填报不含税单价</div>
			
 
				-<div>十一、付款方式：货到90%付款：乙方在合同规定的交货期内将货物运达指定交货地点并验收合格后</div>
			
 
				-<div>，于15个工作日内向甲方提供金额为合同总价格的增值税发票和90%收据，经甲方审核无误后，六十</div>
			
 
				-<div>（60）个工作日内向乙方支付合同总价格90%的货款,剩余10%为质保金，质保金1年后支付；具体条款</div>
			
 
				-<div>以成交供应商与采购需求单位合同签订为准，请供应商充分考虑付款方式可能对报价产生的影响。</div>
			
 
				-<div>十二、是否需要强制上传应答文件：否</div>
			
 
				-<div>十三、其他内容：无</div>
			
 
				-<div>山西漳山发电有限责任公司</div>
			
 
				-<div>2022-03-22</div>
			
 
				+<div>五、评审专家（单一来源采购人员）名单：</div>
			
 
				+<div>王正刚、段海龙、李鑫、刘翠平、张武峰</div>
			
 
				+<div>六、代理服务收费标准及金额：</div>
			
 
				+<div>收费标准：无</div>
			
 
				+<div>收费金额：0万元</div>
			
 
				+<div>七、公告期限</div>
			
 
				+<div>自本公告发布之日起1个工作日。</div>
			
 
				+<div>八、其他补充事宜</div>
			
 
				+<div>无</div>
			
 
				+<div>九、凡对本次公告内容提出询问，请按以下方式联系。</div>
			
 
				+<div>1.采购人信息</div>
			
 
				+<div>名称：华池县柔远镇人民政府</div>
			
 
				+<div>地址：华池县东关街70号</div>
			
 
				+<div>联系方式：0934-5952951</div>
			
 
				+<div>2.采购代理机构信息</div>
			
 
				+<div>名称：华池县公共资源交易中心</div>
			
 
				+<div>地址：华池县东关街22号</div>
			
 
				+<div>联系方式：0934-5953080</div>
			
 
				+<div>3.项目联系方式</div>
			
 
				+<div>项目联系人：孙治江</div>
			
 
				+<div>电话：18793418165</div>
			
 
				+<div>2</div>
			
 
				 </body>