4 年之前 · ef5a6583e2
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -5,8 +5,8 @@ import sys
 
				 import os
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				 
			
 
				-from format_convert.convert_doc import doc2text
			
 
				-from format_convert.convert_docx import docx2text
			
 
				+from format_convert.convert_doc import doc2text, DocConvert
			
 
				+from format_convert.convert_docx import docx2text, DocxConvert
			
 
				 from format_convert.convert_image import picture2text
			
 
				 from format_convert.convert_pdf import pdf2text, PDFConvert
			
 
				 from format_convert.convert_rar import rar2text
			
@@ -2251,9 +2251,10 @@ def getText(_type, path_or_stream):
 
				 
			
 
				     if _type == "pdf":
			
 
				         # return pdf2text(path_or_stream, unique_type_dir)
			
 
				-        return PDFConvert(path_or_stream).get_html()
			
 
				+        return PDFConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "docx":
			
 
				-        return docx2text(path_or_stream, unique_type_dir)
			
 
				+        # return docx2text(path_or_stream, unique_type_dir)
			
 
				+        return DocxConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "zip":
			
 
				         return zip2text(path_or_stream, unique_type_dir)
			
 
				     if _type == "rar":
			
@@ -2263,7 +2264,8 @@ def getText(_type, path_or_stream):
 
				     if _type == "xls":
			
 
				         return xls2text(path_or_stream, unique_type_dir)
			
 
				     if _type == "doc":
			
 
				-        return doc2text(path_or_stream, unique_type_dir)
			
 
				+        # return doc2text(path_or_stream, unique_type_dir)
			
 
				+        return DocConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "jpg" or _type == "png" or _type == "jpeg":
			
 
				         return picture2text(path_or_stream)
			
 
				     if _type == "swf":
			
@@ -2641,35 +2643,9 @@ else:
 
				     if not os.path.exists(_path):
			
 
				         _path = os.path.dirname(os.path.abspath(__file__))
			
 
				 if __name__ == '__main__':
			
 
				-
			
 
				-    print(os.path.abspath(__file__) + "/../../")
			
 
				-    # if len(sys.argv) == 2:
			
 
				-    #     port = int(sys.argv[1])
			
 
				-    # else:
			
 
				-    #     port = 15015
			
 
				-    # app.run(host='0.0.0.0', port=port, threaded=True, debug=False)
			
 
				-    # log("format_conversion running")
			
 
				-
			
 
				-    # convert("", "ocr_model", "otr_model")
			
 
				-    # _str = "啊"
			
 
				-    # str1 = ""
			
 
				-    # str2 = ""
			
 
				-    # for i in range(900000):
			
 
				-    #     str1 += _str
			
 
				-    # list1 = [str1]
			
 
				-    # for i in range(700000):
			
 
				-    #     str2 += _str
			
 
				-    # list2 = [str2]
			
 
				-    # cut_str(list1, list2)
			
 
				-
			
 
				-    # file_path = "C:/Users/Administrator/Desktop/error1.png"
			
 
				-    # file_path = "D:/Project/table-detect-master/train_data/label_1.jpg"
			
 
				-    # file_path = "D:/Project/table-detect-master/test_files/1.png"
			
 
				-    # file_path = "D:/Project/table-detect-master/test_files/table2.jpg"
			
 
				-
			
 
				     if get_platform() == "Windows":
			
 
				-        file_path = "C:/Users/Administrator/Desktop/error3.pdf"
			
 
				-        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/error3.pdf"
			
 
				+        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/招标公告--汾口镇汪家桥村村道硬化工程 - .doc"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
			
 
				     else:
			
 
				         file_path = "1.doc"
			
--- a/format_convert/convert_doc.py
+++ b/format_convert/convert_doc.py
@@ -1,10 +1,13 @@
 
				 import os
			
 
				 import sys
			
 
				+
			
 
				+from format_convert.convert_tree import _Document
			
 
				+
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				 import logging
			
 
				 import traceback
			
 
				 from format_convert import get_memory_info
			
 
				-from format_convert.convert_docx import docx2text
			
 
				+from format_convert.convert_docx import docx2text, DocxConvert
			
 
				 from format_convert.convert_need_interface import from_office_interface
			
 
				 from format_convert.utils import judge_error_code
			
 
				 
			
@@ -23,4 +26,27 @@ def doc2text(path, unique_type_dir):
 
				     except Exception as e:
			
 
				         logging.info("doc2text error!")
			
 
				         print("doc2text", traceback.print_exc())
			
 
				-        return [-1]
			
 
				+        return [-1]
			
 
				+
			
 
				+
			
 
				+class DocConvert:
			
 
				+    def __init__(self, path, unique_type_dir):
			
 
				+        self._doc = _Document(path)
			
 
				+        self.path = path
			
 
				+        self.unique_type_dir = unique_type_dir
			
 
				+
			
 
				+    def convert(self):
			
 
				+        # 调用office格式转换
			
 
				+        file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
			
 
				+        if judge_error_code(file_path):
			
 
				+            self._doc = file_path
			
 
				+            return
			
 
				+        print("file_path", file_path)
			
 
				+        self._doc = DocxConvert(file_path, self.unique_type_dir)._doc
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        self.convert()
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return self._doc.error_code
			
 
				+        print()
			
 
				+        return self._doc.get_html()
			
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -1,6 +1,7 @@
 
				 import os
			
 
				 import sys
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
			
 
				 import logging
			
 
				 import re
			
 
				 import traceback
			
@@ -285,4 +286,112 @@ def read_docx_table(document):
 
				         table_text += "</table>\n"
			
 
				         # print(table_text)
			
 
				         table_text_list.append(table_text)
			
 
				-    return table_text_list
			
 
				+    return table_text_list
			
 
				+
			
 
				+
			
 
				+class DocxConvert:
			
 
				+    def __init__(self, path, unique_type_dir):
			
 
				+        self._doc = _Document(path)
			
 
				+        self.path = path
			
 
				+        self.unique_type_dir = unique_type_dir
			
 
				+
			
 
				+    def init_package(self, package_name):
			
 
				+        # 各个包初始化
			
 
				+        try:
			
 
				+            self.docx = docx.Document(self.path)
			
 
				+            self.zip = zipfile.ZipFile(self.path)
			
 
				+        except:
			
 
				+            logging.info(package_name + " cannot open docx!")
			
 
				+            traceback.print_exc()
			
 
				+            self._doc.error_code = [-3]
			
 
				+
			
 
				+    def convert(self):
			
 
				+        self.init_package("docx")
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return
			
 
				+
			
 
				+        order_list = self.get_orders()
			
 
				+        if judge_error_code(order_list):
			
 
				+            self._doc.error_code = order_list
			
 
				+            return
			
 
				+
			
 
				+        table_list = self.get_tables()
			
 
				+        if judge_error_code(table_list):
			
 
				+            self._doc.error_code = table_list
			
 
				+            return
			
 
				+
			
 
				+        paragraph_list = self.get_paragraphs()
			
 
				+
			
 
				+        image_list = self.get_images()
			
 
				+
			
 
				+        self._page = _Page(None, 0)
			
 
				+        order_y = 0
			
 
				+        for tag in order_list:
			
 
				+            bbox = (0, order_y, 0, 0)
			
 
				+            if tag == "w:t":
			
 
				+                if len(paragraph_list) > 0:
			
 
				+                    _para = paragraph_list.pop(0)
			
 
				+                    self._page.add_child(_Sentence(_para, bbox))
			
 
				+            if tag == "wp:docPr":
			
 
				+                if len(image_list) > 0:
			
 
				+                    _image = image_list.pop(0)
			
 
				+                    self._page.add_child(_Image(_image, bbox))
			
 
				+            if tag == "w:tbl":
			
 
				+                if len(table_list) > 0:
			
 
				+                    _table = table_list.pop(0)
			
 
				+                    self._page.add_child(_Table(_table, bbox))
			
 
				+            order_y += 1
			
 
				+
			
 
				+        if self._doc.error_code is None and self._page.error_code is not None:
			
 
				+            self._doc.error_code = self._page.error_code
			
 
				+        self._doc.add_child(self._page)
			
 
				+
			
 
				+    def get_paragraphs(self):
			
 
				+        # 遍历段落
			
 
				+        paragraph_list = []
			
 
				+        for paragraph in self.docx.paragraphs:
			
 
				+            if paragraph.text != "":
			
 
				+                paragraph_list.append(paragraph.text)
			
 
				+        return paragraph_list
			
 
				+
			
 
				+    def get_tables(self):
			
 
				+        # 遍历表
			
 
				+        table_list = read_xml_table(self.path, self.unique_type_dir)
			
 
				+        return table_list
			
 
				+
			
 
				+    def get_images(self):
			
 
				+        # 顺序遍历图片
			
 
				+        image_list = []
			
 
				+        pattern = re.compile('rId\d+')
			
 
				+        for graph in self.docx.paragraphs:
			
 
				+            for run in graph.runs:
			
 
				+                if run.text == '':
			
 
				+                    try:
			
 
				+                        if not pattern.search(run.element.xml):
			
 
				+                            continue
			
 
				+                        content_id = pattern.search(run.element.xml).group(0)
			
 
				+                        content_type = self.docx.part.related_parts[content_id].content_type
			
 
				+                    except Exception as e:
			
 
				+                        print("docx no image!", e)
			
 
				+                        continue
			
 
				+                    if not content_type.startswith('image'):
			
 
				+                        continue
			
 
				+
			
 
				+                    img_data = self.docx.part.related_parts[content_id].blob
			
 
				+                    if img_data is not None:
			
 
				+                        image_list.append(img_data)
			
 
				+        return image_list
			
 
				+
			
 
				+    def get_orders(self):
			
 
				+        # 解析document.xml，获取文字顺序
			
 
				+        order_list = read_xml_order(self.path, self.unique_type_dir)
			
 
				+        return order_list
			
 
				+
			
 
				+    def get_doc_object(self):
			
 
				+        return self._doc
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        self.convert()
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return self._doc.error_code
			
 
				+        return self._doc.get_html()
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -35,7 +35,7 @@ def image_preprocess(image_np, image_path, use_ocr=True):
 
				             image_bytes = f.read()
			
 
				         list_line = from_otr_interface(image_bytes)
			
 
				         if judge_error_code(list_line):
			
 
				-            return list_line, [], [], 0
			
 
				+            return list_line
			
 
				 
			
 
				         # 将resize后得到的bbox根据比例还原
			
 
				         ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
			
@@ -49,7 +49,7 @@ def image_preprocess(image_np, image_path, use_ocr=True):
 
				             image_bytes = f.read()
			
 
				         text_list, bbox_list = from_ocr_interface(image_bytes, True)
			
 
				         if judge_error_code(text_list):
			
 
				-            return text_list, [], [], 0
			
 
				+            return text_list
			
 
				 
			
 
				         # 调用现成方法形成表格
			
 
				         try:
			
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -583,14 +583,15 @@ def page_table_connect(has_table_dict):
 
				 
			
 
				 
			
 
				 class PDFConvert:
			
 
				-    def __init__(self, path):
			
 
				+    def __init__(self, path, unique_type_dir):
			
 
				         self._doc = _Document(path)
			
 
				         self.path = path
			
 
				+        self.unique_type_dir = unique_type_dir
			
 
				 
			
 
				         self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
			
 
				         self.has_init_pdf = [0] * len(self.packages)
			
 
				 
			
 
				-    def init_pdf(self, package_name):
			
 
				+    def init_package(self, package_name):
			
 
				         # 各个包初始化
			
 
				         try:
			
 
				             if package_name == self.packages[0]:
			
@@ -623,15 +624,15 @@ class PDFConvert:
 
				                 self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
			
 
				 
			
 
				             else:
			
 
				-                print("Only Suppport Packages", str(self.packages))
			
 
				+                print("Only Support Packages", str(self.packages))
			
 
				                 raise Exception
			
 
				         except:
			
 
				             logging.info(package_name + " cannot open pdf!")
			
 
				             self._doc.error_code = [-3]
			
 
				 
			
 
				-    def convert_pdf(self):
			
 
				+    def convert(self):
			
 
				         if self.has_init_pdf[0] == 0:
			
 
				-            self.init_pdf("pdfminer")
			
 
				+            self.init_package("pdfminer")
			
 
				         if self._doc.error_code is not None:
			
 
				             return
			
 
				 
			
@@ -690,7 +691,7 @@ class PDFConvert:
 
				         if only_image == 0 and image_count == 0:
			
 
				             # PDFPlumber
			
 
				             if self.has_init_pdf[3] == 0:
			
 
				-                self.init_pdf("pdfplumber")
			
 
				+                self.init_package("pdfplumber")
			
 
				             if self._doc.error_code is not None:
			
 
				                 return
			
 
				 
			
@@ -789,7 +790,7 @@ class PDFConvert:
 
				 
			
 
				     def get_layout(self, page):
			
 
				         if self.has_init_pdf[0] == 0:
			
 
				-            self.init_pdf("pdfminer")
			
 
				+            self.init_package("pdfminer")
			
 
				         if self._doc.error_code is not None:
			
 
				             return
			
 
				 
			
@@ -818,7 +819,7 @@ class PDFConvert:
 
				     def get_page_image(self, page_no):
			
 
				         try:
			
 
				             if self.has_init_pdf[1] == 0:
			
 
				-                self.init_pdf("PyMuPDF")
			
 
				+                self.init_package("PyMuPDF")
			
 
				             if self._doc.error_code is not None:
			
 
				                 return
			
 
				 
			
@@ -852,7 +853,7 @@ class PDFConvert:
 
				                 return [-3]
			
 
				 
			
 
				     def get_html(self):
			
 
				-        self.convert_pdf()
			
 
				+        self.convert()
			
 
				         if self._doc.error_code is not None:
			
 
				             return self._doc.error_code
			
 
				         return self._doc.get_html()
			
--- a/result.html
+++ b/result.html
@@ -1,62 +1 @@
 
				-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>华池县柔远镇李庄肉牛养殖场建设项目配</div>
			
 
				-<div>套设备购置政府采购公开招标中标公告</div>
			
 
				-<div>、项目编号</div>
			
 
				-<div>HCZC2021-0001</div>
			
 
				-<div>二、项目名称</div>
			
 
				-<div>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</div>
			
 
				-<div>三、中标（成交）信息</div>
			
 
				-<table border="1">
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>供应商名称</td>
			
 
				-<td colspan=1 rowspan=1>供应商联系地址</td>
			
 
				-<td colspan=1 rowspan=1>中标金额（万元）</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
			
 
				-<td colspan=1 rowspan=1>甘肃省庆阳市华池县柔远镇张川村</td>
			
 
				-<td colspan=1 rowspan=1>72.3500</td>
			
 
				-</tr>
			
 
				-</table>
			
 
				-<div>四、主要标的信息</div>
			
 
				-<div>货物类</div>
			
 
				-<table border="1">
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>供应商名称</td>
			
 
				-<td colspan=1 rowspan=1>名称</td>
			
 
				-<td colspan=1 rowspan=1>品牌</td>
			
 
				-<td colspan=1 rowspan=1>数量</td>
			
 
				-<td colspan=1 rowspan=1>单价</td>
			
 
				-<td colspan=1 rowspan=1>规格型号</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
			
 
				-<td colspan=1 rowspan=1>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</td>
			
 
				-<td colspan=1 rowspan=1>详见附件</td>
			
 
				-<td colspan=1 rowspan=1>详见附件</td>
			
 
				-<td colspan=1 rowspan=1>详见附件</td>
			
 
				-<td colspan=1 rowspan=1>详见附件</td>
			
 
				-</tr>
			
 
				-</table>
			
 
				-<div>五、评审专家（单一来源采购人员）名单：</div>
			
 
				-<div>王正刚、段海龙、李鑫、刘翠平、张武峰</div>
			
 
				-<div>六、代理服务收费标准及金额：</div>
			
 
				-<div>收费标准：无</div>
			
 
				-<div>收费金额：0万元</div>
			
 
				-<div>七、公告期限</div>
			
 
				-<div>自本公告发布之日起1个工作日。</div>
			
 
				-<div>八、其他补充事宜</div>
			
 
				-<div>无</div>
			
 
				-<div>九、凡对本次公告内容提出询问，请按以下方式联系。</div>
			
 
				-<div>1.采购人信息</div>
			
 
				-<div>名称：华池县柔远镇人民政府</div>
			
 
				-<div>地址：华池县东关街70号</div>
			
 
				-<div>联系方式：0934-5952951</div>
			
 
				-<div>2.采购代理机构信息</div>
			
 
				-<div>名称：华池县公共资源交易中心</div>
			
 
				-<div>地址：华池县东关街22号</div>
			
 
				-<div>联系方式：0934-5953080</div>
			
 
				-<div>3.项目联系方式</div>
			
 
				-<div>项目联系人：孙治江</div>
			
 
				-<div>电话：18793418165</div>
			
 
				-<div>2</div>
			
 
				-</body>
			
 
				+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body></body>