4 年之前 · 61dfbf1cbe
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -31,7 +31,7 @@ def docx2text(path, unique_type_dir):
 
				         paragraph_text_list = []
			
 
				         for paragraph in doc.paragraphs:
			
 
				             if paragraph.text != "":
			
 
				-                paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
			
 
				+                paragraph_text_list.append("<div>" + paragraph.text + "</div>" )
			
 
				                 # print("paragraph_text", paragraph.text)
			
 
				 
			
 
				         # 遍历表
			
@@ -188,26 +188,28 @@ def read_xml_table(path, save_path):
 
				             log("docx format error!")
			
 
				             return [-3]
			
 
				 
			
 
				+        log("xml_analyze%s"%(save_path))
			
 
				         try:
			
 
				             collection = xml_analyze(save_path + "word/document.xml")
			
 
				         except TimeoutError:
			
 
				             log("xml_analyze timeout")
			
 
				             return [-4]
			
 
				 
			
 
				+        log("xml_analyze done")
			
 
				         body = collection.getElementsByTagName("w:body")[0]
			
 
				         table_text_list = []
			
 
				         # print("body.childNodes", body.childNodes)
			
 
				         for line in body.childNodes:
			
 
				             if "w:tbl" in str(line):
			
 
				                 # print("str(line)", str(line))
			
 
				-                table_text = '<table border="1">' + "\n"
			
 
				+                table_text = '<table border="1">'
			
 
				                 tr_list = line.getElementsByTagName("w:tr")
			
 
				                 # print("line.childNodes", line.childNodes)
			
 
				                 tr_index = 0
			
 
				                 tr_text_list = []
			
 
				                 tr_text_list_colspan = []
			
 
				                 for tr in tr_list:
			
 
				-                    table_text = table_text + "<tr rowspan=1>" + "\n"
			
 
				+                    table_text = table_text + "<tr>"
			
 
				                     tc_list = tr.getElementsByTagName("w:tc")
			
 
				                     tc_index = 0
			
 
				                     tc_text_list = []
			
@@ -244,7 +246,7 @@ def read_xml_table(path, save_path):
 
				                                     if real_tc_index < len(tr_text_list[tr_index - 1]):
			
 
				                                         tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
			
 
				 
			
 
				-                        table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
			
 
				+                        table_text = table_text + "<td colspan=" + str(col_span) + ">"
			
 
				                         p_list = tc.getElementsByTagName("w:p")
			
 
				 
			
 
				                         for p in p_list:
			
@@ -254,15 +256,14 @@ def read_xml_table(path, save_path):
 
				                                     # print("tt", tt.childNodes)
			
 
				                                     if len(tt.childNodes) > 0:
			
 
				                                         tc_text += tt.childNodes[0].nodeValue
			
 
				-                                tc_text += "\n"
			
 
				 
			
 
				-                        table_text = table_text + tc_text + "</td>" + "\n"
			
 
				+                        table_text = table_text + tc_text + "</td>"
			
 
				                         tc_index += 1
			
 
				                         tc_text_list.append([tc_text, col_span])
			
 
				-                    table_text += "</tr>" + "\n"
			
 
				+                    table_text += "</tr>"
			
 
				                     tr_index += 1
			
 
				                     tr_text_list.append(tc_text_list)
			
 
				-                table_text += "</table>" + "\n"
			
 
				+                table_text += "</table>"
			
 
				                 table_text_list.append(table_text)
			
 
				         return table_text_list
			
 
				 
			
@@ -283,14 +284,14 @@ def xml_analyze(path):
 
				 def read_docx_table(document):
				     """Render every table of ``document`` as a compact HTML string.
				 
				     Args:
				         document: Object exposing ``tables``; each table exposes ``rows``,
				             each row ``cells`` and each cell ``text`` (presumably a
				             python-docx ``Document`` -- confirm against the caller).
				 
				     Returns:
				         list: One ``<table>...</table>`` string per table, in document
				         order. No newlines are emitted between tags and every whitespace
				         character is removed from the cell text.
				     """
				     # Raw string on purpose: "\s" in a plain literal is an invalid escape
				     # sequence and triggers a warning on modern Python versions.
				     strip_whitespace = re.compile(r"\s").sub
				 
				     table_text_list = []
				     for table in document.tables:
				         row_html_list = []
				         for row in table.rows:
				             # NOTE(review): cell text is inserted without HTML escaping, as
				             # before; confirm downstream consumers tolerate "<" and "&".
				             cells_html = "".join(
				                 "<td>" + strip_whitespace("", str(cell.text)) + "</td>"
				                 for cell in row.cells
				             )
				             row_html_list.append("<tr>" + cells_html + "</tr>")
				         table_text_list.append("<table>" + "".join(row_html_list) + "</table>")
				     return table_text_list