瀏覽代碼

解决 docx 转换结果中出现多余换行符(\n)的问题

luojiehua 2 年之前
父節點
當前提交
61dfbf1cbe
共有 1 個文件被更改,包括 14 次插入和 13 次删除
  1. 14 13
      format_convert/convert_docx.py

+ 14 - 13
format_convert/convert_docx.py

@@ -31,7 +31,7 @@ def docx2text(path, unique_type_dir):
         paragraph_text_list = []
         for paragraph in doc.paragraphs:
             if paragraph.text != "":
-                paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
+                paragraph_text_list.append("<div>" + paragraph.text + "</div>" )
                 # print("paragraph_text", paragraph.text)
 
         # 遍历表
@@ -188,26 +188,28 @@ def read_xml_table(path, save_path):
             log("docx format error!")
             return [-3]
 
+        log("xml_analyze%s"%(save_path))
         try:
             collection = xml_analyze(save_path + "word/document.xml")
         except TimeoutError:
             log("xml_analyze timeout")
             return [-4]
 
+        log("xml_analyze done")
         body = collection.getElementsByTagName("w:body")[0]
         table_text_list = []
         # print("body.childNodes", body.childNodes)
         for line in body.childNodes:
             if "w:tbl" in str(line):
                 # print("str(line)", str(line))
-                table_text = '<table border="1">' + "\n"
+                table_text = '<table border="1">'
                 tr_list = line.getElementsByTagName("w:tr")
                 # print("line.childNodes", line.childNodes)
                 tr_index = 0
                 tr_text_list = []
                 tr_text_list_colspan = []
                 for tr in tr_list:
-                    table_text = table_text + "<tr rowspan=1>" + "\n"
+                    table_text = table_text + "<tr>"
                     tc_list = tr.getElementsByTagName("w:tc")
                     tc_index = 0
                     tc_text_list = []
@@ -244,7 +246,7 @@ def read_xml_table(path, save_path):
                                     if real_tc_index < len(tr_text_list[tr_index - 1]):
                                         tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
 
-                        table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
+                        table_text = table_text + "<td colspan=" + str(col_span) + ">"
                         p_list = tc.getElementsByTagName("w:p")
 
                         for p in p_list:
@@ -254,15 +256,14 @@ def read_xml_table(path, save_path):
                                     # print("tt", tt.childNodes)
                                     if len(tt.childNodes) > 0:
                                         tc_text += tt.childNodes[0].nodeValue
-                                tc_text += "\n"
 
-                        table_text = table_text + tc_text + "</td>" + "\n"
+                        table_text = table_text + tc_text + "</td>"
                         tc_index += 1
                         tc_text_list.append([tc_text, col_span])
-                    table_text += "</tr>" + "\n"
+                    table_text += "</tr>"
                     tr_index += 1
                     tr_text_list.append(tc_text_list)
-                table_text += "</table>" + "\n"
+                table_text += "</table>"
                 table_text_list.append(table_text)
         return table_text_list
 
@@ -283,14 +284,14 @@ def xml_analyze(path):
 def read_docx_table(document):
     table_text_list = []
     for table in document.tables:
-        table_text = "<table>\n"
+        table_text = "<table>"
         # print("==================")
         for row in table.rows:
-            table_text += "<tr>\n"
+            table_text += "<tr>"
             for cell in row.cells:
-                table_text += "<td>" + cell.text + "</td>\n"
-            table_text += "</tr>\n"
-        table_text += "</table>\n"
+                table_text += "<td>" + re.sub("\s","",str(cell.text)) + "</td>"
+            table_text += "</tr>"
+        table_text += "</table>"
         # print(table_text)
         table_text_list.append(table_text)
     return table_text_list