Jelajahi Sumber

解决预处理中表格数据可能漏掉的问题以及标点符号没去干净的问题

rogel 4 tahun lalu
induk
komit
0f03059e79
1 file diubah dengan 8 penambahan dan 7 penghapusan
  1. 8 7
      BiddingKG/dl/interface/Preprocessing.py

+ 8 - 7
BiddingKG/dl/interface/Preprocessing.py

@@ -669,7 +669,7 @@ def tableToText(soup):
 
 
                 occu_height = len(table_occurence)
-                occu_width = len(table_occurence[0])
+                occu_width = len(table_occurence[0]) if len(table_occurence)>0 else 0
                 #为每个属性值寻找表头
                 for i in range(occu_height):
                     for j in range(occu_width):
@@ -688,7 +688,7 @@ def tableToText(soup):
                                     key_values = [1]
                                 if table_occurence[i-loop_i][j]["type"] in key_values:
                                     if find_flag:
-                                        if table_occurence[i-loop_i][j][0]!=temp_head:
+                                        if table_occurence[i-loop_i]["text"]!=temp_head:
                                             top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
                                     else:
                                         top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
@@ -760,6 +760,7 @@ def tableToText(soup):
 
                 else:
                     for j in range(occu_width):
+                        pack_text = ""
                         rank_text = ""
                         entity_text = ""
                         text_line = ""
@@ -1074,12 +1075,12 @@ def segment(soup):
     #替换标点
 
     #替换连续的标点
-    punc_pattern = "(?P<del>[。,;::,]+)\s*"
-    for punc_del in re.findall(punc_pattern,text):
-        if len(punc_del)>1:
-            text = re.sub(punc_del+"\s*",punc_del[-1],text)
 
-    for punc_del in re.findall(punc_pattern,text):
+    punc_pattern = "(?P<del>[。,;::,\s]+)"
+
+    list_punc = re.findall(punc_pattern,text)
+    list_punc.sort(key=lambda x:len(x),reverse=True)
+    for punc_del in list_punc:
         if len(punc_del)>1:
             text = re.sub(punc_del+"\s*",punc_del[-1],text)