|
@@ -107,7 +107,7 @@ def tableToText(soup):
|
|
|
tr_line = []
|
|
|
tds = tr.findChildren(['td','th'], recursive=False)
|
|
|
for td in tds:
|
|
|
- tr_line.append([re.sub('\xa0','',segment(td)),0])
|
|
|
+ tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
|
|
|
#tr_line.append([td.get_text(),0])
|
|
|
inner_table.append(tr_line)
|
|
|
return inner_table
|
|
@@ -988,12 +988,13 @@ def tableToText(soup):
|
|
|
# return list_innerTable
|
|
|
|
|
|
#数据清洗
|
|
|
-def segment(soup):
|
|
|
- print("==")
|
|
|
- print(soup)
|
|
|
- print("====")
|
|
|
+def segment(soup,final=True):
|
|
|
+ # print("==")
|
|
|
+ # print(soup)
|
|
|
+ # print("====")
|
|
|
#segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
|
|
|
- if soup.name=="td":
|
|
|
+ subspaceList = ["td",'a',"span","p"]
|
|
|
+ if soup.name in subspaceList:
|
|
|
#判断有值叶子节点数
|
|
|
_count = 0
|
|
|
for child in soup.find_all(recursive=True):
|
|
@@ -1018,15 +1019,14 @@ def segment(soup):
|
|
|
# _substr = ""
|
|
|
# else:
|
|
|
# _substr = ""
|
|
|
- # text = _substr.join(re.split("(\s+)",text))
|
|
|
text = text.replace("\r\n",",").replace("\n",",")
|
|
|
- # text = re.sub("^[,\s]*|[,\s]*$","",text)
|
|
|
+ text = re.sub("\s+","##space##",text)
|
|
|
return text
|
|
|
segList = ["title"]
|
|
|
commaList = ["div","br","td","p"]
|
|
|
#commaList = []
|
|
|
spaceList = ["span"]
|
|
|
- subspaceList = ["td",'a',"span","p"]
|
|
|
+
|
|
|
tbodies = soup.find_all('tbody')
|
|
|
if len(tbodies) == 0:
|
|
|
tbodies = soup.find_all('table')
|
|
@@ -1040,8 +1040,8 @@ def segment(soup):
|
|
|
# if child.name in subspaceList:
|
|
|
# child.insert_before("#subs"+str(child.name)+"#")
|
|
|
# child.insert_after("#sube"+str(child.name)+"#")
|
|
|
- if child.name in spaceList:
|
|
|
- child.insert_after(" ")
|
|
|
+ # if child.name in spaceList:
|
|
|
+ # child.insert_after(" ")
|
|
|
text = str(soup.get_text())
|
|
|
|
|
|
#替换英文冒号为中文冒号
|
|
@@ -1060,7 +1060,7 @@ def segment(soup):
|
|
|
|
|
|
#替换连续的标点
|
|
|
|
|
|
- punc_pattern = "(?P<del>[。,;::,\s]{2,})"
|
|
|
+ punc_pattern = "(?P<del>[。,;::,\s]+)"
|
|
|
|
|
|
list_punc = re.findall(punc_pattern,text)
|
|
|
list_punc.sort(key=lambda x:len(x),reverse=True)
|
|
@@ -1090,13 +1090,18 @@ def segment(soup):
|
|
|
LOOP_BEGIN = 0
|
|
|
_text = ""
|
|
|
|
|
|
+
|
|
|
+
|
|
|
if len(text)<10000000:
|
|
|
while(LOOP_BEGIN<len(text)):
|
|
|
- _text += re.sub(")",")",re.sub("(","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
|
|
|
+ _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
|
|
|
LOOP_BEGIN += LOOP_LEN
|
|
|
- else:
|
|
|
- return text
|
|
|
- return _text
|
|
|
+ text = _text
|
|
|
+
|
|
|
+ if final:
|
|
|
+ text = re.sub("##space##"," ",text)
|
|
|
+
|
|
|
+ return text
|
|
|
|
|
|
'''
|
|
|
#数据清洗
|