|
@@ -31,7 +31,7 @@ def docx2text(path, unique_type_dir):
|
|
paragraph_text_list = []
|
|
paragraph_text_list = []
|
|
for paragraph in doc.paragraphs:
|
|
for paragraph in doc.paragraphs:
|
|
if paragraph.text != "":
|
|
if paragraph.text != "":
|
|
- paragraph_text_list.append("<div>" + paragraph.text + "</div>" + "\n")
|
|
|
|
|
|
+ paragraph_text_list.append("<div>" + paragraph.text + "</div>" )
|
|
# print("paragraph_text", paragraph.text)
|
|
# print("paragraph_text", paragraph.text)
|
|
|
|
|
|
# 遍历表
|
|
# 遍历表
|
|
@@ -188,26 +188,28 @@ def read_xml_table(path, save_path):
|
|
log("docx format error!")
|
|
log("docx format error!")
|
|
return [-3]
|
|
return [-3]
|
|
|
|
|
|
|
|
+ log("xml_analyze%s"%(save_path))
|
|
try:
|
|
try:
|
|
collection = xml_analyze(save_path + "word/document.xml")
|
|
collection = xml_analyze(save_path + "word/document.xml")
|
|
except TimeoutError:
|
|
except TimeoutError:
|
|
log("xml_analyze timeout")
|
|
log("xml_analyze timeout")
|
|
return [-4]
|
|
return [-4]
|
|
|
|
|
|
|
|
+ log("xml_analyze done")
|
|
body = collection.getElementsByTagName("w:body")[0]
|
|
body = collection.getElementsByTagName("w:body")[0]
|
|
table_text_list = []
|
|
table_text_list = []
|
|
# print("body.childNodes", body.childNodes)
|
|
# print("body.childNodes", body.childNodes)
|
|
for line in body.childNodes:
|
|
for line in body.childNodes:
|
|
if "w:tbl" in str(line):
|
|
if "w:tbl" in str(line):
|
|
# print("str(line)", str(line))
|
|
# print("str(line)", str(line))
|
|
- table_text = '<table border="1">' + "\n"
|
|
|
|
|
|
+ table_text = '<table border="1">'
|
|
tr_list = line.getElementsByTagName("w:tr")
|
|
tr_list = line.getElementsByTagName("w:tr")
|
|
# print("line.childNodes", line.childNodes)
|
|
# print("line.childNodes", line.childNodes)
|
|
tr_index = 0
|
|
tr_index = 0
|
|
tr_text_list = []
|
|
tr_text_list = []
|
|
tr_text_list_colspan = []
|
|
tr_text_list_colspan = []
|
|
for tr in tr_list:
|
|
for tr in tr_list:
|
|
- table_text = table_text + "<tr rowspan=1>" + "\n"
|
|
|
|
|
|
+ table_text = table_text + "<tr>"
|
|
tc_list = tr.getElementsByTagName("w:tc")
|
|
tc_list = tr.getElementsByTagName("w:tc")
|
|
tc_index = 0
|
|
tc_index = 0
|
|
tc_text_list = []
|
|
tc_text_list = []
|
|
@@ -244,7 +246,7 @@ def read_xml_table(path, save_path):
|
|
if real_tc_index < len(tr_text_list[tr_index - 1]):
|
|
if real_tc_index < len(tr_text_list[tr_index - 1]):
|
|
tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
|
|
tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
|
|
|
|
|
|
- table_text = table_text + "<td colspan=" + str(col_span) + ">" + "\n"
|
|
|
|
|
|
+ table_text = table_text + "<td colspan=" + str(col_span) + ">"
|
|
p_list = tc.getElementsByTagName("w:p")
|
|
p_list = tc.getElementsByTagName("w:p")
|
|
|
|
|
|
for p in p_list:
|
|
for p in p_list:
|
|
@@ -254,15 +256,14 @@ def read_xml_table(path, save_path):
|
|
# print("tt", tt.childNodes)
|
|
# print("tt", tt.childNodes)
|
|
if len(tt.childNodes) > 0:
|
|
if len(tt.childNodes) > 0:
|
|
tc_text += tt.childNodes[0].nodeValue
|
|
tc_text += tt.childNodes[0].nodeValue
|
|
- tc_text += "\n"
|
|
|
|
|
|
|
|
- table_text = table_text + tc_text + "</td>" + "\n"
|
|
|
|
|
|
+ table_text = table_text + tc_text + "</td>"
|
|
tc_index += 1
|
|
tc_index += 1
|
|
tc_text_list.append([tc_text, col_span])
|
|
tc_text_list.append([tc_text, col_span])
|
|
- table_text += "</tr>" + "\n"
|
|
|
|
|
|
+ table_text += "</tr>"
|
|
tr_index += 1
|
|
tr_index += 1
|
|
tr_text_list.append(tc_text_list)
|
|
tr_text_list.append(tc_text_list)
|
|
- table_text += "</table>" + "\n"
|
|
|
|
|
|
+ table_text += "</table>"
|
|
table_text_list.append(table_text)
|
|
table_text_list.append(table_text)
|
|
return table_text_list
|
|
return table_text_list
|
|
|
|
|
|
@@ -283,14 +284,14 @@ def xml_analyze(path):
|
|
def read_docx_table(document):
    """Render every table of ``document`` as a compact HTML string.

    For each table in ``document.tables`` the rows (``table.rows``) and
    their cells (``row.cells``) are walked in order and emitted as
    ``<table><tr><td>...</td></tr></table>`` with no newlines between
    tags.  All whitespace characters are removed from each cell's text.

    Args:
        document: an object exposing ``tables``; each table exposes
            ``rows``, each row ``cells``, and each cell ``text``.

    Returns:
        A list with one HTML string per table, in document order.  The
        list is empty when the document has no tables.
    """
    table_text_list = []
    for table in document.tables:
        # Collect fragments and join once instead of repeated string "+=".
        parts = ["<table>"]
        for row in table.rows:
            parts.append("<tr>")
            for cell in row.cells:
                # Raw string: "\s" in a normal literal is an invalid escape.
                # NOTE(review): cell text is inserted without HTML escaping;
                # confirm downstream consumers expect the raw text.
                cell_text = re.sub(r"\s", "", str(cell.text))
                parts.append("<td>" + cell_text + "</td>")
            parts.append("</tr>")
        parts.append("</table>")
        table_text_list.append("".join(parts))
    return table_text_list
|