|
@@ -419,10 +419,10 @@ def tableToText(soup):
|
|
inner_table[_h][_w][1] = 0
|
|
inner_table[_h][_w][1] = 0
|
|
|
|
|
|
|
|
|
|
- print("=====")
|
|
|
|
- for item in inner_table:
|
|
|
|
- print(item)
|
|
|
|
- print("======")
|
|
|
|
|
|
+ # print("=====")
|
|
|
|
+ # for item in inner_table:
|
|
|
|
+ # print(item)
|
|
|
|
+ # print("======")
|
|
|
|
|
|
repairTable(inner_table)
|
|
repairTable(inner_table)
|
|
head_list = sliceTable(inner_table)
|
|
head_list = sliceTable(inner_table)
|
|
@@ -640,11 +640,7 @@ def tableToText(soup):
|
|
|
|
|
|
direct = getDirect(inner_table, head_begin, head_end)
|
|
direct = getDirect(inner_table, head_begin, head_end)
|
|
|
|
|
|
- print("----")
|
|
|
|
- print(inner_table[head_begin:head_end])
|
|
|
|
- print("head_end-head_begin",head_end-head_begin)
|
|
|
|
- print(direct)
|
|
|
|
-
|
|
|
|
|
|
+
|
|
#若只有一行,则直接按行读取
|
|
#若只有一行,则直接按行读取
|
|
if head_end-head_begin==1:
|
|
if head_end-head_begin==1:
|
|
text_line = ""
|
|
text_line = ""
|
|
@@ -668,12 +664,69 @@ def tableToText(soup):
|
|
line_oc = []
|
|
line_oc = []
|
|
for j in range(width):
|
|
for j in range(width):
|
|
cell = inner_table[i][j]
|
|
cell = inner_table[i][j]
|
|
- line_oc.append({"text":cell[0],"type":cell[1]})
|
|
|
|
|
|
+ line_oc.append({"text":cell[0],"type":cell[1],"occu_count":0,"left_head":"","top_head":""})
|
|
table_occurence.append(line_oc)
|
|
table_occurence.append(line_oc)
|
|
|
|
|
|
|
|
|
|
|
|
+ occu_height = len(table_occurence)
|
|
|
|
+ occu_width = len(table_occurence[0])
|
|
|
|
+ #为每个属性值寻找表头
|
|
|
|
+ for i in range(occu_height):
|
|
|
|
+ for j in range(occu_width):
|
|
|
|
+ cell = table_occurence[i][j]
|
|
|
|
+ #是属性值
|
|
|
|
+ if cell["type"]==0 and cell["text"]!="":
|
|
|
|
+ left_head = ""
|
|
|
|
+ top_head = ""
|
|
|
|
+
|
|
|
|
+ find_flag = False
|
|
|
|
+ temp_head = ""
|
|
|
|
+ for loop_i in range(1,i+1):
|
|
|
|
+ if not key_direct:
|
|
|
|
+ key_values = [1,2]
|
|
|
|
+ else:
|
|
|
|
+ key_values = [1]
|
|
|
|
+ if table_occurence[i-loop_i][j]["type"] in key_values:
|
|
|
|
+ if find_flag:
|
|
|
|
+ if table_occurence[i-loop_i][j][0]!=temp_head:
|
|
|
|
+ top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
|
|
|
|
+ else:
|
|
|
|
+ top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
|
|
|
|
+ find_flag = True
|
|
|
|
+ temp_head = table_occurence[i-loop_i][j]["text"]
|
|
|
|
+ table_occurence[i-loop_i][j]["occu_count"] += 1
|
|
|
|
+ else:
|
|
|
|
+ #找到表头后遇到属性值就返回
|
|
|
|
+ if find_flag:
|
|
|
|
+ break
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ cell["top_head"] += top_head
|
|
|
|
+ find_flag = False
|
|
|
|
+ temp_head = ""
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ for loop_j in range(1,j+1):
|
|
|
|
+ if not key_direct:
|
|
|
|
+ key_values = [1,2]
|
|
|
|
+ else:
|
|
|
|
+ key_values = [2]
|
|
|
|
+ if table_occurence[i][j-loop_j]["type"] in key_values:
|
|
|
|
+ if find_flag:
|
|
|
|
+ if table_occurence[i][j-loop_j]["text"]!=temp_head:
|
|
|
|
+ left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
|
|
|
|
+ else:
|
|
|
|
+ left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
|
|
|
|
+ find_flag = True
|
|
|
|
+ temp_head = table_occurence[i][j-loop_j]["text"]
|
|
|
|
+ table_occurence[i][j-loop_j]["occu_count"] += 1
|
|
|
|
+ else:
|
|
|
|
+ if find_flag:
|
|
|
|
+ break
|
|
|
|
+ cell["left_head"] += left_head
|
|
if direct=="row":
|
|
if direct=="row":
|
|
- for i in range(head_begin,head_end):
|
|
|
|
|
|
+ for i in range(occu_height):
|
|
pack_text = ""
|
|
pack_text = ""
|
|
rank_text = ""
|
|
rank_text = ""
|
|
entity_text = ""
|
|
entity_text = ""
|
|
@@ -681,131 +734,195 @@ def tableToText(soup):
|
|
#在同一句话中重复的可以去掉
|
|
#在同一句话中重复的可以去掉
|
|
text_set = set()
|
|
text_set = set()
|
|
for j in range(width):
|
|
for j in range(width):
|
|
- cell = inner_table[i][j]
|
|
|
|
- #是属性值
|
|
|
|
- if cell[1]==0 and cell[0]!="":
|
|
|
|
- head = ""
|
|
|
|
-
|
|
|
|
- find_flag = False
|
|
|
|
- temp_head = ""
|
|
|
|
- for loop_i in range(0,i+1-head_begin):
|
|
|
|
- if not key_direct:
|
|
|
|
- key_values = [1,2]
|
|
|
|
- else:
|
|
|
|
- key_values = [1]
|
|
|
|
- if inner_table[i-loop_i][j][1] in key_values:
|
|
|
|
- if find_flag:
|
|
|
|
- if inner_table[i-loop_i][j][0]!=temp_head:
|
|
|
|
- head = inner_table[i-loop_i][j][0]+":"+head
|
|
|
|
- else:
|
|
|
|
- head = inner_table[i-loop_i][j][0]+":"+head
|
|
|
|
- find_flag = True
|
|
|
|
- temp_head = inner_table[i-loop_i][j][0]
|
|
|
|
- else:
|
|
|
|
- #找到表头后遇到属性值就返回
|
|
|
|
- if find_flag:
|
|
|
|
- break
|
|
|
|
-
|
|
|
|
- find_flag = False
|
|
|
|
- temp_head = ""
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- for loop_j in range(1,j+1):
|
|
|
|
- if not key_direct:
|
|
|
|
- key_values = [1,2]
|
|
|
|
- else:
|
|
|
|
- key_values = [2]
|
|
|
|
- if inner_table[i][j-loop_j][1] in key_values:
|
|
|
|
- if find_flag:
|
|
|
|
- if inner_table[i][j-loop_j][0]!=temp_head:
|
|
|
|
- head = inner_table[i][j-loop_j][0]+":"+head
|
|
|
|
- else:
|
|
|
|
- head = inner_table[i][j-loop_j][0]+":"+head
|
|
|
|
- find_flag = True
|
|
|
|
- temp_head = inner_table[i][j-loop_j][0]
|
|
|
|
- else:
|
|
|
|
- if find_flag:
|
|
|
|
- break
|
|
|
|
-
|
|
|
|
- if str(head+inner_table[i][j][0]) in text_set:
|
|
|
|
|
|
+ cell = table_occurence[i][j]
|
|
|
|
+ if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
|
|
|
|
+
|
|
|
|
+ cell = table_occurence[i][j]
|
|
|
|
+ head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
|
|
|
|
+ head += cell["left_head"]
|
|
|
|
+ if str(head+cell["text"]) in text_set:
|
|
continue
|
|
continue
|
|
if re.search(packPattern,head) is not None:
|
|
if re.search(packPattern,head) is not None:
|
|
- pack_text += head+inner_table[i][j][0]+","
|
|
|
|
|
|
+ pack_text += head+cell["text"]+","
|
|
elif re.search(rankPattern,head) is not None: # 2020/11/23 大网站规则发现问题,if 改elif
|
|
elif re.search(rankPattern,head) is not None: # 2020/11/23 大网站规则发现问题,if 改elif
|
|
#排名替换为同一种表达
|
|
#排名替换为同一种表达
|
|
- rank_text += head+inner_table[i][j][0]+","
|
|
|
|
|
|
+ rank_text += head+cell["text"]+","
|
|
#print(rank_text)
|
|
#print(rank_text)
|
|
elif re.search(entityPattern,head) is not None:
|
|
elif re.search(entityPattern,head) is not None:
|
|
- entity_text += head+inner_table[i][j][0]+","
|
|
|
|
|
|
+ entity_text += head+cell["text"]+","
|
|
#print(entity_text)
|
|
#print(entity_text)
|
|
else:
|
|
else:
|
|
- text_line += head+inner_table[i][j][0]+","
|
|
|
|
- text_set.add(str(head+inner_table[i][j][0]))
|
|
|
|
|
|
+ text_line += head+cell["text"]+","
|
|
|
|
+ text_set.add(str(head+cell["text"]))
|
|
|
|
+
|
|
text += pack_text+rank_text+entity_text+text_line
|
|
text += pack_text+rank_text+entity_text+text_line
|
|
text = text[:-1]+"。" if len(text)>0 else text
|
|
text = text[:-1]+"。" if len(text)>0 else text
|
|
|
|
+
|
|
else:
|
|
else:
|
|
- for j in range(width):
|
|
|
|
-
|
|
|
|
|
|
+ for j in range(occu_width):
|
|
rank_text = ""
|
|
rank_text = ""
|
|
entity_text = ""
|
|
entity_text = ""
|
|
text_line = ""
|
|
text_line = ""
|
|
text_set = set()
|
|
text_set = set()
|
|
- for i in range(head_begin,head_end):
|
|
|
|
- cell = inner_table[i][j]
|
|
|
|
- #是属性值
|
|
|
|
- if cell[1]==0 and cell[0]!="":
|
|
|
|
- find_flag = False
|
|
|
|
- head = ""
|
|
|
|
- temp_head = ""
|
|
|
|
-
|
|
|
|
- for loop_j in range(1,j+1):
|
|
|
|
- if not key_direct:
|
|
|
|
- key_values = [1,2]
|
|
|
|
- else:
|
|
|
|
- key_values = [2]
|
|
|
|
- if inner_table[i][j-loop_j][1] in key_values:
|
|
|
|
- if find_flag:
|
|
|
|
- if inner_table[i][j-loop_j][0]!=temp_head:
|
|
|
|
- head = inner_table[i][j-loop_j][0]+":"+head
|
|
|
|
- else:
|
|
|
|
- head = inner_table[i][j-loop_j][0]+":"+head
|
|
|
|
- find_flag = True
|
|
|
|
- temp_head = inner_table[i][j-loop_j][0]
|
|
|
|
- else:
|
|
|
|
- if find_flag:
|
|
|
|
- break
|
|
|
|
- find_flag = False
|
|
|
|
- temp_head = ""
|
|
|
|
- for loop_i in range(0,i+1-head_begin):
|
|
|
|
- if not key_direct:
|
|
|
|
- key_values = [1,2]
|
|
|
|
- else:
|
|
|
|
- key_values = [1]
|
|
|
|
- if inner_table[i-loop_i][j][1] in key_values:
|
|
|
|
- if find_flag:
|
|
|
|
- if inner_table[i-loop_i][j][0]!=temp_head:
|
|
|
|
- head = inner_table[i-loop_i][j][0]+":"+head
|
|
|
|
- else:
|
|
|
|
- head = inner_table[i-loop_i][j][0]+":"+head
|
|
|
|
- find_flag = True
|
|
|
|
- temp_head = inner_table[i-loop_i][j][0]
|
|
|
|
- else:
|
|
|
|
- if find_flag:
|
|
|
|
- break
|
|
|
|
- if str(head+inner_table[i][j][0]) in text_set:
|
|
|
|
|
|
+ for i in range(occu_height):
|
|
|
|
+ cell = table_occurence[i][j]
|
|
|
|
+ if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
|
|
|
|
+
|
|
|
|
+ cell = table_occurence[i][j]
|
|
|
|
+ head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
|
|
|
|
+ head += cell["top_head"]
|
|
|
|
+ if str(head+cell["text"]) in text_set:
|
|
continue
|
|
continue
|
|
- if re.search(rankPattern,head) is not None:
|
|
|
|
- rank_text += head+inner_table[i][j][0]+","
|
|
|
|
|
|
+ if re.search(packPattern,head) is not None:
|
|
|
|
+ pack_text += head+cell["text"]+","
|
|
|
|
+ elif re.search(rankPattern,head) is not None: # 2020/11/23 大网站规则发现问题,if 改elif
|
|
|
|
+ #排名替换为同一种表达
|
|
|
|
+ rank_text += head+cell["text"]+","
|
|
#print(rank_text)
|
|
#print(rank_text)
|
|
elif re.search(entityPattern,head) is not None:
|
|
elif re.search(entityPattern,head) is not None:
|
|
- entity_text += head+inner_table[i][j][0]+","
|
|
|
|
|
|
+ entity_text += head+cell["text"]+","
|
|
#print(entity_text)
|
|
#print(entity_text)
|
|
else:
|
|
else:
|
|
- text_line += head+inner_table[i][j][0]+","
|
|
|
|
- text_set.add(str(head+inner_table[i][j][0]))
|
|
|
|
- text += rank_text+entity_text+text_line
|
|
|
|
|
|
+ text_line += head+cell["text"]+","
|
|
|
|
+ text_set.add(str(head+cell["text"]))
|
|
|
|
+ text += pack_text+rank_text+entity_text+text_line
|
|
text = text[:-1]+"。" if len(text)>0 else text
|
|
text = text[:-1]+"。" if len(text)>0 else text
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ # if direct=="row":
|
|
|
|
+ # for i in range(head_begin,head_end):
|
|
|
|
+ # pack_text = ""
|
|
|
|
+ # rank_text = ""
|
|
|
|
+ # entity_text = ""
|
|
|
|
+ # text_line = ""
|
|
|
|
+ # #在同一句话中重复的可以去掉
|
|
|
|
+ # text_set = set()
|
|
|
|
+ # for j in range(width):
|
|
|
|
+ # cell = inner_table[i][j]
|
|
|
|
+ # #是属性值
|
|
|
|
+ # if cell[1]==0 and cell[0]!="":
|
|
|
|
+ # head = ""
|
|
|
|
+ #
|
|
|
|
+ # find_flag = False
|
|
|
|
+ # temp_head = ""
|
|
|
|
+ # for loop_i in range(0,i+1-head_begin):
|
|
|
|
+ # if not key_direct:
|
|
|
|
+ # key_values = [1,2]
|
|
|
|
+ # else:
|
|
|
|
+ # key_values = [1]
|
|
|
|
+ # if inner_table[i-loop_i][j][1] in key_values:
|
|
|
|
+ # if find_flag:
|
|
|
|
+ # if inner_table[i-loop_i][j][0]!=temp_head:
|
|
|
|
+ # head = inner_table[i-loop_i][j][0]+":"+head
|
|
|
|
+ # else:
|
|
|
|
+ # head = inner_table[i-loop_i][j][0]+":"+head
|
|
|
|
+ # find_flag = True
|
|
|
|
+ # temp_head = inner_table[i-loop_i][j][0]
|
|
|
|
+ # else:
|
|
|
|
+ # #找到表头后遇到属性值就返回
|
|
|
|
+ # if find_flag:
|
|
|
|
+ # break
|
|
|
|
+ #
|
|
|
|
+ # find_flag = False
|
|
|
|
+ # temp_head = ""
|
|
|
|
+ #
|
|
|
|
+ #
|
|
|
|
+ #
|
|
|
|
+ # for loop_j in range(1,j+1):
|
|
|
|
+ # if not key_direct:
|
|
|
|
+ # key_values = [1,2]
|
|
|
|
+ # else:
|
|
|
|
+ # key_values = [2]
|
|
|
|
+ # if inner_table[i][j-loop_j][1] in key_values:
|
|
|
|
+ # if find_flag:
|
|
|
|
+ # if inner_table[i][j-loop_j][0]!=temp_head:
|
|
|
|
+ # head = inner_table[i][j-loop_j][0]+":"+head
|
|
|
|
+ # else:
|
|
|
|
+ # head = inner_table[i][j-loop_j][0]+":"+head
|
|
|
|
+ # find_flag = True
|
|
|
|
+ # temp_head = inner_table[i][j-loop_j][0]
|
|
|
|
+ # else:
|
|
|
|
+ # if find_flag:
|
|
|
|
+ # break
|
|
|
|
+ #
|
|
|
|
+ # if str(head+inner_table[i][j][0]) in text_set:
|
|
|
|
+ # continue
|
|
|
|
+ # if re.search(packPattern,head) is not None:
|
|
|
|
+ # pack_text += head+inner_table[i][j][0]+","
|
|
|
|
+ # elif re.search(rankPattern,head) is not None: # 2020/11/23 大网站规则发现问题,if 改elif
|
|
|
|
+ # #排名替换为同一种表达
|
|
|
|
+ # rank_text += head+inner_table[i][j][0]+","
|
|
|
|
+ # #print(rank_text)
|
|
|
|
+ # elif re.search(entityPattern,head) is not None:
|
|
|
|
+ # entity_text += head+inner_table[i][j][0]+","
|
|
|
|
+ # #print(entity_text)
|
|
|
|
+ # else:
|
|
|
|
+ # text_line += head+inner_table[i][j][0]+","
|
|
|
|
+ # text_set.add(str(head+inner_table[i][j][0]))
|
|
|
|
+ # text += pack_text+rank_text+entity_text+text_line
|
|
|
|
+ # text = text[:-1]+"。" if len(text)>0 else text
|
|
|
|
+ # else:
|
|
|
|
+ # for j in range(width):
|
|
|
|
+ #
|
|
|
|
+ # rank_text = ""
|
|
|
|
+ # entity_text = ""
|
|
|
|
+ # text_line = ""
|
|
|
|
+ # text_set = set()
|
|
|
|
+ # for i in range(head_begin,head_end):
|
|
|
|
+ # cell = inner_table[i][j]
|
|
|
|
+ # #是属性值
|
|
|
|
+ # if cell[1]==0 and cell[0]!="":
|
|
|
|
+ # find_flag = False
|
|
|
|
+ # head = ""
|
|
|
|
+ # temp_head = ""
|
|
|
|
+ #
|
|
|
|
+ # for loop_j in range(1,j+1):
|
|
|
|
+ # if not key_direct:
|
|
|
|
+ # key_values = [1,2]
|
|
|
|
+ # else:
|
|
|
|
+ # key_values = [2]
|
|
|
|
+ # if inner_table[i][j-loop_j][1] in key_values:
|
|
|
|
+ # if find_flag:
|
|
|
|
+ # if inner_table[i][j-loop_j][0]!=temp_head:
|
|
|
|
+ # head = inner_table[i][j-loop_j][0]+":"+head
|
|
|
|
+ # else:
|
|
|
|
+ # head = inner_table[i][j-loop_j][0]+":"+head
|
|
|
|
+ # find_flag = True
|
|
|
|
+ # temp_head = inner_table[i][j-loop_j][0]
|
|
|
|
+ # else:
|
|
|
|
+ # if find_flag:
|
|
|
|
+ # break
|
|
|
|
+ # find_flag = False
|
|
|
|
+ # temp_head = ""
|
|
|
|
+ # for loop_i in range(0,i+1-head_begin):
|
|
|
|
+ # if not key_direct:
|
|
|
|
+ # key_values = [1,2]
|
|
|
|
+ # else:
|
|
|
|
+ # key_values = [1]
|
|
|
|
+ # if inner_table[i-loop_i][j][1] in key_values:
|
|
|
|
+ # if find_flag:
|
|
|
|
+ # if inner_table[i-loop_i][j][0]!=temp_head:
|
|
|
|
+ # head = inner_table[i-loop_i][j][0]+":"+head
|
|
|
|
+ # else:
|
|
|
|
+ # head = inner_table[i-loop_i][j][0]+":"+head
|
|
|
|
+ # find_flag = True
|
|
|
|
+ # temp_head = inner_table[i-loop_i][j][0]
|
|
|
|
+ # else:
|
|
|
|
+ # if find_flag:
|
|
|
|
+ # break
|
|
|
|
+ # if str(head+inner_table[i][j][0]) in text_set:
|
|
|
|
+ # continue
|
|
|
|
+ # if re.search(rankPattern,head) is not None:
|
|
|
|
+ # rank_text += head+inner_table[i][j][0]+","
|
|
|
|
+ # #print(rank_text)
|
|
|
|
+ # elif re.search(entityPattern,head) is not None:
|
|
|
|
+ # entity_text += head+inner_table[i][j][0]+","
|
|
|
|
+ # #print(entity_text)
|
|
|
|
+ # else:
|
|
|
|
+ # text_line += head+inner_table[i][j][0]+","
|
|
|
|
+ # text_set.add(str(head+inner_table[i][j][0]))
|
|
|
|
+ # text += rank_text+entity_text+text_line
|
|
|
|
+ # text = text[:-1]+"。" if len(text)>0 else text
|
|
return text
|
|
return text
|
|
|
|
|
|
def removeFix(inner_table,fix_value="~~"):
|
|
def removeFix(inner_table,fix_value="~~"):
|
|
@@ -955,31 +1072,22 @@ def segment(soup):
|
|
text = text.replace('"',"“").replace("\r","").replace("\n",",")
|
|
text = text.replace('"',"“").replace("\r","").replace("\n",",")
|
|
text = re.sub("\s{4,}",",",text)
|
|
text = re.sub("\s{4,}",",",text)
|
|
#替换标点
|
|
#替换标点
|
|
- while(True):
|
|
|
|
- #替换连续的标点
|
|
|
|
- punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
|
|
|
|
- if punc is not None:
|
|
|
|
- text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
|
|
|
|
|
|
+
|
|
|
|
+ #替换连续的标点
|
|
|
|
+ punc_pattern = "(?P<del>[。,;::,]+)\s*"
|
|
|
|
+ for punc_del in re.findall(punc_pattern,text):
|
|
|
|
+ if len(punc_del)>1:
|
|
|
|
+ text = re.sub(punc_del+"\s*",punc_del[-1],text)
|
|
|
|
+
|
|
|
|
+ for punc_del in re.findall(punc_pattern,text):
|
|
|
|
+ if len(punc_del)>1:
|
|
|
|
+ text = re.sub(punc_del+"\s*",punc_del[-1],text)
|
|
|
|
|
|
- punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
|
|
|
|
- if punc is not None:
|
|
|
|
- text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
|
|
|
|
- else:
|
|
|
|
- #替换标点之后的空格
|
|
|
|
- punc = re.search("(?P<punc>:|。|,|;)\s+",text)
|
|
|
|
- if punc is not None:
|
|
|
|
- text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
|
|
|
|
- else:
|
|
|
|
- break
|
|
|
|
|
|
+
|
|
#将连续的中文句号替换为一个
|
|
#将连续的中文句号替换为一个
|
|
text_split = text.split("。")
|
|
text_split = text.split("。")
|
|
text_split = [x for x in text_split if len(x)>0]
|
|
text_split = [x for x in text_split if len(x)>0]
|
|
- list_text = []
|
|
|
|
- # for _t in text_split:
|
|
|
|
- # list_text.append(re.sub(")",")",re.sub("(","(",re.sub("\s*","",_t))))
|
|
|
|
text = "。".join(text_split)
|
|
text = "。".join(text_split)
|
|
- # text = text.replace(')',")").replace("(","(").replace("\s","")
|
|
|
|
- #删除所有空格
|
|
|
|
# text过大报错
|
|
# text过大报错
|
|
LOOP_LEN = 10000
|
|
LOOP_LEN = 10000
|
|
LOOP_BEGIN = 0
|
|
LOOP_BEGIN = 0
|
|
@@ -990,10 +1098,6 @@ def segment(soup):
|
|
LOOP_BEGIN += LOOP_LEN
|
|
LOOP_BEGIN += LOOP_LEN
|
|
else:
|
|
else:
|
|
return text
|
|
return text
|
|
- # text = re.sub("\s*","",text)
|
|
|
|
- # #替换中文括号为英文括号
|
|
|
|
- # text = re.sub("(","(",text)
|
|
|
|
- # text = re.sub(")",")",text)
|
|
|
|
return _text
|
|
return _text
|
|
|
|
|
|
'''
|
|
'''
|