4 years ago · 0f03059e79
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -669,7 +669,7 @@ def tableToText(soup):
 
				 
			
 
				 
			
 
				                 occu_height = len(table_occurence)
			
 
				-                occu_width = len(table_occurence[0])
			
 
				+                occu_width = len(table_occurence[0]) if len(table_occurence)>0 else 0
			
 
				                 #为每个属性值寻找表头
			
 
				                 for i in range(occu_height):
			
 
				                     for j in range(occu_width):
			
@@ -688,7 +688,7 @@ def tableToText(soup):
 
				                                     key_values = [1]
			
 
				                                 if table_occurence[i-loop_i][j]["type"] in key_values:
			
 
				                                     if find_flag:
			
 
				-                                        if table_occurence[i-loop_i][j][0]!=temp_head:
			
 
				+                                        if table_occurence[i-loop_i]["text"]!=temp_head:
			
 
				                                             top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
			
 
				                                     else:
			
 
				                                         top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
			
@@ -760,6 +760,7 @@ def tableToText(soup):
 
				 
			
 
				                 else:
			
 
				                     for j in range(occu_width):
			
 
				+                        pack_text = ""
			
 
				                         rank_text = ""
			
 
				                         entity_text = ""
			
 
				                         text_line = ""
			
@@ -1074,12 +1075,12 @@ def segment(soup):
 
				     #替换标点
			
 
				 
			
 
				     #替换连续的标点
			
 
				-    punc_pattern = "(?P<del>[。，；：:，]+)\s*"
			
 
				-    for punc_del in re.findall(punc_pattern,text):
			
 
				-        if len(punc_del)>1:
			
 
				-            text = re.sub(punc_del+"\s*",punc_del[-1],text)
			
 
				 
			
 
				-    for punc_del in re.findall(punc_pattern,text):
			
 
				+    punc_pattern = "(?P<del>[。，；：:,\s]+)"
			
 
				+
			
 
				+    list_punc = re.findall(punc_pattern,text)
			
 
				+    list_punc.sort(key=lambda x:len(x),reverse=True)
			
 
				+    for punc_del in list_punc:
			
 
				         if len(punc_del)>1:
			
 
				             text = re.sub(punc_del+"\s*",punc_del[-1],text)