4 года назад · 61dfbf1cbe
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -31,7 +31,7 @@ def docx2text(path, unique_type_dir):
 
															         paragraph_text_list = []
														
 
															         for paragraph in doc.paragraphs:
														
 
															             if paragraph.text != "":
														
 
															-                paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
														
 
															+                paragraph_text_list.append("<div>" + paragraph.text + "</div>" )
														
 
															                 # print("paragraph_text", paragraph.text)
														
 
															         # 遍历表
														
@@ -188,26 +188,28 @@ def read_xml_table(path, save_path):
 
															             log("docx format error!")
														
 
															             return [-3]
														
 
															+        log("xml_analyze%s"%(save_path))
														
 
															         try:
														
 
															             collection = xml_analyze(save_path + "word/document.xml")
														
 
															         except TimeoutError:
														
 
															             log("xml_analyze timeout")
														
 
															             return [-4]
														
 
															+        log("xml_analyze done")
														
 
															         body = collection.getElementsByTagName("w:body")[0]
														
 
															         table_text_list = []
														
 
															         # print("body.childNodes", body.childNodes)
														
 
															         for line in body.childNodes:
														
 
															             if "w:tbl" in str(line):
														
 
															                 # print("str(line)", str(line))
														
 
															-                table_text = '<table border="1">' + "\n"
														
 
															+                table_text = '<table border="1">'
														
 
															                 tr_list = line.getElementsByTagName("w:tr")
														
 
															                 # print("line.childNodes", line.childNodes)
														
 
															                 tr_index = 0
														
 
															                 tr_text_list = []
														
 
															                 tr_text_list_colspan = []
														
 
															                 for tr in tr_list:
														
 
															-                    table_text = table_text + "<tr rowspan=1>" + "\n"
														
 
															+                    table_text = table_text + "<tr>"
														
 
															                     tc_list = tr.getElementsByTagName("w:tc")
														
 
															                     tc_index = 0
														
 
															                     tc_text_list = []
														
@@ -244,7 +246,7 @@ def read_xml_table(path, save_path):
 
															                                     if real_tc_index < len(tr_text_list[tr_index - 1]):
														
 
															                                         tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
														
 
															-                        table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
														
 
															+                        table_text = table_text + "<td colspan=" + str(col_span) + ">"
														
 
															                         p_list = tc.getElementsByTagName("w:p")
														
 
															                         for p in p_list:
														
@@ -254,15 +256,14 @@ def read_xml_table(path, save_path):
 
															                                     # print("tt", tt.childNodes)
														
 
															                                     if len(tt.childNodes) > 0:
														
 
															                                         tc_text += tt.childNodes[0].nodeValue
														
 
															-                                tc_text += "\n"
														
 
															-                        table_text = table_text + tc_text + "</td>" + "\n"
														
 
															+                        table_text = table_text + tc_text + "</td>"
														
 
															                         tc_index += 1
														
 
															                         tc_text_list.append([tc_text, col_span])
														
 
															-                    table_text += "</tr>" + "\n"
														
 
															+                    table_text += "</tr>"
														
 
															                     tr_index += 1
														
 
															                     tr_text_list.append(tc_text_list)
														
 
															-                table_text += "</table>" + "\n"
														
 
															+                table_text += "</table>"
														
 
															                 table_text_list.append(table_text)
														
 
															         return table_text_list
														
@@ -283,14 +284,14 @@ def xml_analyze(path):
 
															 def read_docx_table(document):
														
 
															     table_text_list = []
														
 
															     for table in document.tables:
														
 
															-        table_text = "<table>\n"
														
 
															+        table_text = "<table>"
														
 
															         # print("==================")
														
 
															         for row in table.rows:
														
 
															-            table_text += "<tr>\n"
														
 
															+            table_text += "<tr>"
														
 
															             for cell in row.cells:
														
 
															-                table_text += "<td>" + cell.text + "</td>\n"
														
 
															-            table_text += "</tr>\n"
														
 
															-        table_text += "</table>\n"
														
 
															+                table_text += "<td>" + re.sub("\s","",str(cell.text)) + "</td>"
														
 
															+            table_text += "</tr>"
														
 
															+        table_text += "</table>"
														
 
															         # print(table_text)
														
 
															         table_text_list.append(table_text)
														
 
															     return table_text_list