|
@@ -669,7 +669,7 @@ def tableToText(soup):
|
|
|
|
|
|
|
|
|
occu_height = len(table_occurence)
|
|
|
- occu_width = len(table_occurence[0])
|
|
|
+ occu_width = len(table_occurence[0]) if len(table_occurence)>0 else 0
|
|
|
#为每个属性值寻找表头
|
|
|
for i in range(occu_height):
|
|
|
for j in range(occu_width):
|
|
@@ -688,7 +688,7 @@ def tableToText(soup):
|
|
|
key_values = [1]
|
|
|
if table_occurence[i-loop_i][j]["type"] in key_values:
|
|
|
if find_flag:
|
|
|
- if table_occurence[i-loop_i][j][0]!=temp_head:
|
|
|
+ if table_occurence[i-loop_i]["text"]!=temp_head:
|
|
|
top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
|
|
|
else:
|
|
|
top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
|
|
@@ -760,6 +760,7 @@ def tableToText(soup):
|
|
|
|
|
|
else:
|
|
|
for j in range(occu_width):
|
|
|
+ pack_text = ""
|
|
|
rank_text = ""
|
|
|
entity_text = ""
|
|
|
text_line = ""
|
|
@@ -1074,12 +1075,12 @@ def segment(soup):
|
|
|
#替换标点
|
|
|
|
|
|
#替换连续的标点
|
|
|
- punc_pattern = "(?P<del>[。,;::,]+)\s*"
|
|
|
- for punc_del in re.findall(punc_pattern,text):
|
|
|
- if len(punc_del)>1:
|
|
|
- text = re.sub(punc_del+"\s*",punc_del[-1],text)
|
|
|
|
|
|
- for punc_del in re.findall(punc_pattern,text):
|
|
|
+ punc_pattern = "(?P<del>[。,;::,\s]+)"
|
|
|
+
|
|
|
+ list_punc = re.findall(punc_pattern,text)
|
|
|
+ list_punc.sort(key=lambda x:len(x),reverse=True)
|
|
|
+ for punc_del in list_punc:
|
|
|
if len(punc_del)>1:
|
|
|
text = re.sub(punc_del+"\s*",punc_del[-1],text)
|
|
|
|