fangjiasheng 3 年之前
父節點
當前提交
ef5a6583e2
共有 6 個文件被更改,包括 160 次插入109 次删除
  1. 9 33
      format_convert/convert.py
  2. 28 2
      format_convert/convert_doc.py
  3. 110 1
      format_convert/convert_docx.py
  4. 2 2
      format_convert/convert_image.py
  5. 10 9
      format_convert/convert_pdf.py
  6. 1 62
      result.html

+ 9 - 33
format_convert/convert.py

@@ -5,8 +5,8 @@ import sys
 import os
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 
-from format_convert.convert_doc import doc2text
-from format_convert.convert_docx import docx2text
+from format_convert.convert_doc import doc2text, DocConvert
+from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_image import picture2text
 from format_convert.convert_pdf import pdf2text, PDFConvert
 from format_convert.convert_rar import rar2text
@@ -2251,9 +2251,10 @@ def getText(_type, path_or_stream):
 
     if _type == "pdf":
         # return pdf2text(path_or_stream, unique_type_dir)
-        return PDFConvert(path_or_stream).get_html()
+        return PDFConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "docx":
-        return docx2text(path_or_stream, unique_type_dir)
+        # return docx2text(path_or_stream, unique_type_dir)
+        return DocxConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "zip":
         return zip2text(path_or_stream, unique_type_dir)
     if _type == "rar":
@@ -2263,7 +2264,8 @@ def getText(_type, path_or_stream):
     if _type == "xls":
         return xls2text(path_or_stream, unique_type_dir)
     if _type == "doc":
-        return doc2text(path_or_stream, unique_type_dir)
+        # return doc2text(path_or_stream, unique_type_dir)
+        return DocConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "jpg" or _type == "png" or _type == "jpeg":
         return picture2text(path_or_stream)
     if _type == "swf":
@@ -2641,35 +2643,9 @@ else:
     if not os.path.exists(_path):
         _path = os.path.dirname(os.path.abspath(__file__))
 if __name__ == '__main__':
-
-    print(os.path.abspath(__file__) + "/../../")
-    # if len(sys.argv) == 2:
-    #     port = int(sys.argv[1])
-    # else:
-    #     port = 15015
-    # app.run(host='0.0.0.0', port=port, threaded=True, debug=False)
-    # log("format_conversion running")
-
-    # convert("", "ocr_model", "otr_model")
-    # _str = "啊"
-    # str1 = ""
-    # str2 = ""
-    # for i in range(900000):
-    #     str1 += _str
-    # list1 = [str1]
-    # for i in range(700000):
-    #     str2 += _str
-    # list2 = [str2]
-    # cut_str(list1, list2)
-
-    # file_path = "C:/Users/Administrator/Desktop/error1.png"
-    # file_path = "D:/Project/table-detect-master/train_data/label_1.jpg"
-    # file_path = "D:/Project/table-detect-master/test_files/1.png"
-    # file_path = "D:/Project/table-detect-master/test_files/table2.jpg"
-
     if get_platform() == "Windows":
-        file_path = "C:/Users/Administrator/Desktop/error3.pdf"
-        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"
+        # file_path = "C:/Users/Administrator/Desktop/error3.pdf"
+        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/招标公告--汾口镇汪家桥村村道硬化工程 - .doc"
         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
     else:
         file_path = "1.doc"

+ 28 - 2
format_convert/convert_doc.py

@@ -1,10 +1,13 @@
 import os
 import sys
+
+from format_convert.convert_tree import _Document
+
 sys.path.append(os.path.dirname(__file__) + "/../")
 import logging
 import traceback
 from format_convert import get_memory_info
-from format_convert.convert_docx import docx2text
+from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_need_interface import from_office_interface
 from format_convert.utils import judge_error_code
 
@@ -23,4 +26,27 @@ def doc2text(path, unique_type_dir):
     except Exception as e:
         logging.info("doc2text error!")
         print("doc2text", traceback.print_exc())
-        return [-1]
+        return [-1]
+
+
+class DocConvert:
+    """Render a legacy .doc file as HTML by converting it to .docx first.
+
+    The file is handed to ``from_office_interface`` to obtain a .docx, which
+    is then parsed by ``DocxConvert``.
+    """
+
+    def __init__(self, path, unique_type_dir):
+        """Store the inputs and create the document that will hold the result.
+
+        :param path: path of the .doc file to convert.
+        :param unique_type_dir: directory value passed through to the office
+            interface and to ``DocxConvert``.
+        """
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+
+    def convert(self):
+        """Build ``self._doc`` from the .doc file.
+
+        On failure the error code is stored in ``self._doc.error_code`` and
+        the method returns early.
+        """
+        # Ask the office interface to convert the .doc into a .docx file.
+        file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
+        if judge_error_code(file_path):
+            # Keep self._doc a _Document: get_html() reads
+            # self._doc.error_code, which a bare error-code value does not have.
+            self._doc.error_code = file_path
+            return
+        logging.info("doc converted to docx: " + str(file_path))
+
+        # DocxConvert only fills its document inside convert(); taking _doc
+        # straight after construction would yield an empty document.
+        docx_convert = DocxConvert(file_path, self.unique_type_dir)
+        docx_convert.convert()
+        self._doc = docx_convert.get_doc_object()
+
+    def get_html(self):
+        """Convert the file and return its HTML, or the error code on failure."""
+        self.convert()
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        return self._doc.get_html()

+ 110 - 1
format_convert/convert_docx.py

@@ -1,6 +1,7 @@
 import os
 import sys
 sys.path.append(os.path.dirname(__file__) + "/../")
+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
 import logging
 import re
 import traceback
@@ -285,4 +286,112 @@ def read_docx_table(document):
         table_text += "</table>\n"
         # print(table_text)
         table_text_list.append(table_text)
-    return table_text_list
+    return table_text_list
+
+
+class DocxConvert:
+    """Parse a .docx file into a ``_Document`` tree and render it as HTML.
+
+    Paragraph texts, images and tables are collected separately and then
+    interleaved in the order given by ``read_xml_order``.
+    """
+
+    def __init__(self, path, unique_type_dir):
+        """Store the inputs and create the document that will hold the result.
+
+        :param path: path of the .docx file to convert.
+        :param unique_type_dir: directory value passed through to the XML
+            readers (``read_xml_order`` / ``read_xml_table``).
+        """
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+
+    def init_package(self, package_name):
+        """Open the file with python-docx and zipfile.
+
+        :param package_name: label used only in the failure log message.
+
+        On failure ``self._doc.error_code`` is set to ``[-3]``.
+        """
+        try:
+            self.docx = docx.Document(self.path)
+            # NOTE(review): this handle is never closed in the visible code;
+            # confirm whether anything outside this class still needs it.
+            self.zip = zipfile.ZipFile(self.path)
+        except Exception:
+            # Exception rather than a bare except, so KeyboardInterrupt and
+            # SystemExit are not reported as an unreadable file.
+            logging.info(package_name + " cannot open docx!")
+            traceback.print_exc()
+            self._doc.error_code = [-3]
+
+    def convert(self):
+        """Build the page tree and attach it to ``self._doc``.
+
+        Returns early, leaving ``self._doc.error_code`` set, when the file
+        cannot be opened or the order/table readers report an error code.
+        """
+        self.init_package("docx")
+        if self._doc.error_code is not None:
+            return
+
+        order_list = self.get_orders()
+        if judge_error_code(order_list):
+            self._doc.error_code = order_list
+            return
+
+        table_list = self.get_tables()
+        if judge_error_code(table_list):
+            self._doc.error_code = table_list
+            return
+
+        # Iterators consume each list front-to-back without the O(n) cost
+        # of list.pop(0) on every tag.
+        paragraphs = iter(self.get_paragraphs())
+        images = iter(self.get_images())
+        tables = iter(table_list)
+        exhausted = object()
+
+        self._page = _Page(None, 0)
+        # The position of the tag in the order list is used as the y value
+        # of the bbox, so children keep the order found in the XML.
+        for order_y, tag in enumerate(order_list):
+            if tag == "w:t":
+                items, node_type = paragraphs, _Sentence
+            elif tag == "wp:docPr":
+                items, node_type = images, _Image
+            elif tag == "w:tbl":
+                items, node_type = tables, _Table
+            else:
+                continue
+            item = next(items, exhausted)
+            # A tag with no remaining content of its kind is skipped.
+            if item is not exhausted:
+                self._page.add_child(node_type(item, (0, order_y, 0, 0)))
+
+        if self._doc.error_code is None and self._page.error_code is not None:
+            self._doc.error_code = self._page.error_code
+        self._doc.add_child(self._page)
+
+    def get_paragraphs(self):
+        """Return the text of every non-empty paragraph, in document order."""
+        return [
+            paragraph.text
+            for paragraph in self.docx.paragraphs
+            if paragraph.text != ""
+        ]
+
+    def get_tables(self):
+        """Return the tables read by ``read_xml_table`` (or its error code)."""
+        return read_xml_table(self.path, self.unique_type_dir)
+
+    def get_images(self):
+        """Return the raw bytes of the images referenced by text-less runs."""
+        image_list = []
+        # Raw string: "\d" in a plain literal is an invalid escape sequence.
+        pattern = re.compile(r'rId\d+')
+        for paragraph in self.docx.paragraphs:
+            for run in paragraph.runs:
+                if run.text != '':
+                    continue
+                try:
+                    match = pattern.search(run.element.xml)
+                    if match is None:
+                        continue
+                    part = self.docx.part.related_parts[match.group(0)]
+                    content_type = part.content_type
+                except Exception as e:
+                    # Best effort: a run whose relationship cannot be
+                    # resolved is skipped.
+                    print("docx no image!", e)
+                    continue
+                if not content_type.startswith('image'):
+                    continue
+
+                img_data = part.blob
+                if img_data is not None:
+                    image_list.append(img_data)
+        return image_list
+
+    def get_orders(self):
+        """Return the tag order read by ``read_xml_order`` (or its error code)."""
+        return read_xml_order(self.path, self.unique_type_dir)
+
+    def get_doc_object(self):
+        """Return the underlying ``_Document``."""
+        return self._doc
+
+    def get_html(self):
+        """Convert the file and return its HTML, or the error code on failure."""
+        self.convert()
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        return self._doc.get_html()

+ 2 - 2
format_convert/convert_image.py

@@ -35,7 +35,7 @@ def image_preprocess(image_np, image_path, use_ocr=True):
             image_bytes = f.read()
         list_line = from_otr_interface(image_bytes)
         if judge_error_code(list_line):
-            return list_line, [], [], 0
+            return list_line
 
         # 将resize后得到的bbox根据比例还原
         ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
@@ -49,7 +49,7 @@ def image_preprocess(image_np, image_path, use_ocr=True):
             image_bytes = f.read()
         text_list, bbox_list = from_ocr_interface(image_bytes, True)
         if judge_error_code(text_list):
-            return text_list, [], [], 0
+            return text_list
 
         # 调用现成方法形成表格
         try:

+ 10 - 9
format_convert/convert_pdf.py

@@ -583,14 +583,15 @@ def page_table_connect(has_table_dict):
 
 
 class PDFConvert:
-    def __init__(self, path):
+    def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
         self.path = path
+        self.unique_type_dir = unique_type_dir
 
         self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
         self.has_init_pdf = [0] * len(self.packages)
 
-    def init_pdf(self, package_name):
+    def init_package(self, package_name):
         # 各个包初始化
         try:
             if package_name == self.packages[0]:
@@ -623,15 +624,15 @@ class PDFConvert:
                 self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
 
             else:
-                print("Only Suppport Packages", str(self.packages))
+                print("Only Support Packages", str(self.packages))
                 raise Exception
         except:
             logging.info(package_name + " cannot open pdf!")
             self._doc.error_code = [-3]
 
-    def convert_pdf(self):
+    def convert(self):
         if self.has_init_pdf[0] == 0:
-            self.init_pdf("pdfminer")
+            self.init_package("pdfminer")
         if self._doc.error_code is not None:
             return
 
@@ -690,7 +691,7 @@ class PDFConvert:
         if only_image == 0 and image_count == 0:
             # PDFPlumber
             if self.has_init_pdf[3] == 0:
-                self.init_pdf("pdfplumber")
+                self.init_package("pdfplumber")
             if self._doc.error_code is not None:
                 return
 
@@ -789,7 +790,7 @@ class PDFConvert:
 
     def get_layout(self, page):
         if self.has_init_pdf[0] == 0:
-            self.init_pdf("pdfminer")
+            self.init_package("pdfminer")
         if self._doc.error_code is not None:
             return
 
@@ -818,7 +819,7 @@ class PDFConvert:
     def get_page_image(self, page_no):
         try:
             if self.has_init_pdf[1] == 0:
-                self.init_pdf("PyMuPDF")
+                self.init_package("PyMuPDF")
             if self._doc.error_code is not None:
                 return
 
@@ -852,7 +853,7 @@ class PDFConvert:
                 return [-3]
 
     def get_html(self):
-        self.convert_pdf()
+        self.convert()
         if self._doc.error_code is not None:
             return self._doc.error_code
         return self._doc.get_html()

+ 1 - 62
result.html

@@ -1,62 +1 @@
-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>华池县柔远镇李庄肉牛养殖场建设项目配</div>
-<div>套设备购置政府采购公开招标中标公告</div>
-<div>、项目编号</div>
-<div>HCZC2021-0001</div>
-<div>二、项目名称</div>
-<div>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</div>
-<div>三、中标(成交)信息</div>
-<table border="1">
-<tr>
-<td colspan=1 rowspan=1>供应商名称</td>
-<td colspan=1 rowspan=1>供应商联系地址</td>
-<td colspan=1 rowspan=1>中标金额(万元)</td>
-</tr>
-<tr>
-<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
-<td colspan=1 rowspan=1>甘肃省庆阳市华池县柔远镇张川村</td>
-<td colspan=1 rowspan=1>72.3500</td>
-</tr>
-</table>
-<div>四、主要标的信息</div>
-<div>货物类</div>
-<table border="1">
-<tr>
-<td colspan=1 rowspan=1>供应商名称</td>
-<td colspan=1 rowspan=1>名称</td>
-<td colspan=1 rowspan=1>品牌</td>
-<td colspan=1 rowspan=1>数量</td>
-<td colspan=1 rowspan=1>单价</td>
-<td colspan=1 rowspan=1>规格型号</td>
-</tr>
-<tr>
-<td colspan=1 rowspan=1>华池县卓泰机械设备租赁有限公司</td>
-<td colspan=1 rowspan=1>华池县柔远镇李庄肉牛养殖场建设项目配套设备购置</td>
-<td colspan=1 rowspan=1>详见附件</td>
-<td colspan=1 rowspan=1>详见附件</td>
-<td colspan=1 rowspan=1>详见附件</td>
-<td colspan=1 rowspan=1>详见附件</td>
-</tr>
-</table>
-<div>五、评审专家(单一来源采购人员)名单:</div>
-<div>王正刚、段海龙、李鑫、刘翠平、张武峰</div>
-<div>六、代理服务收费标准及金额:</div>
-<div>收费标准:无</div>
-<div>收费金额:0万元</div>
-<div>七、公告期限</div>
-<div>自本公告发布之日起1个工作日。</div>
-<div>八、其他补充事宜</div>
-<div>无</div>
-<div>九、凡对本次公告内容提出询问,请按以下方式联系。</div>
-<div>1.采购人信息</div>
-<div>名称:华池县柔远镇人民政府</div>
-<div>地址:华池县东关街70号</div>
-<div>联系方式:0934-5952951</div>
-<div>2.采购代理机构信息</div>
-<div>名称:华池县公共资源交易中心</div>
-<div>地址:华池县东关街22号</div>
-<div>联系方式:0934-5953080</div>
-<div>3.项目联系方式</div>
-<div>项目联系人:孙治江</div>
-<div>电话:18793418165</div>
-<div>2</div>
-</body>
+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body></body>