4 tahun lalu · b39580350c
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -107,7 +107,7 @@ def tableToText(soup):
 
				             tr_line = []
			
 
				             tds = tr.findChildren(['td','th'], recursive=False)
			
 
				             for td in tds:
			
 
				-                tr_line.append([re.sub('\xa0','',segment(td)),0])
			
 
				+                tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
			
 
				                 #tr_line.append([td.get_text(),0])
			
 
				             inner_table.append(tr_line)
			
 
				         return inner_table                          
			
@@ -988,12 +988,13 @@ def tableToText(soup):
 
				     # return list_innerTable
			
 
				 
			
 
				 #数据清洗
			
 
				-def segment(soup):
			
 
				-    print("==")
			
 
				-    print(soup)
			
 
				-    print("====")
			
 
				+def segment(soup,final=True):
			
 
				+    # print("==")
			
 
				+    # print(soup)
			
 
				+    # print("====")
			
 
				     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
			
 
				-    if soup.name=="td":
			
 
				+    subspaceList = ["td",'a',"span","p"]
			
 
				+    if soup.name in subspaceList:
			
 
				         #判断有值叶子节点数
			
 
				         _count = 0
			
 
				         for child in soup.find_all(recursive=True):
			
@@ -1018,15 +1019,14 @@ def segment(soup):
 
				             #         _substr = ""
			
 
				             # else:
			
 
				             #     _substr = ""
			
 
				-            # text = _substr.join(re.split("(\s+)",text))
			
 
				             text = text.replace("\r\n","，").replace("\n","，")
			
 
				-            # text = re.sub("^[，\s]*|[，\s]*$","",text)
			
 
				+            text = re.sub("\s+","##space##",text)
			
 
				             return text
			
 
				     segList = ["title"]
			
 
				     commaList = ["div","br","td","p"]
			
 
				     #commaList = []
			
 
				     spaceList = ["span"]
			
 
				-    subspaceList = ["td",'a',"span","p"]
			
 
				+
			
 
				     tbodies = soup.find_all('tbody')
			
 
				     if len(tbodies) == 0:
			
 
				         tbodies = soup.find_all('table')
			
@@ -1040,8 +1040,8 @@ def segment(soup):
 
				         # if child.name in subspaceList:
			
 
				         #     child.insert_before("#subs"+str(child.name)+"#")
			
 
				         #     child.insert_after("#sube"+str(child.name)+"#")
			
 
				-        if child.name in spaceList:
			
 
				-            child.insert_after(" ")
			
 
				+        # if child.name in spaceList:
			
 
				+        #     child.insert_after(" ")
			
 
				     text = str(soup.get_text())
			
 
				 
			
 
				     #替换英文冒号为中文冒号
			
@@ -1060,7 +1060,7 @@ def segment(soup):
 
				 
			
 
				     #替换连续的标点
			
 
				 
			
 
				-    punc_pattern = "(?P<del>[。，；：:,\s]{2,})"
			
 
				+    punc_pattern = "(?P<del>[。，；：:,\s]+)"
			
 
				 
			
 
				     list_punc = re.findall(punc_pattern,text)
			
 
				     list_punc.sort(key=lambda x:len(x),reverse=True)
			
@@ -1090,13 +1090,18 @@ def segment(soup):
 
				     LOOP_BEGIN = 0
			
 
				     _text = ""
			
 
				 
			
 
				+
			
 
				+
			
 
				     if len(text)<10000000:
			
 
				         while(LOOP_BEGIN<len(text)):
			
 
				-            _text += re.sub("）",")",re.sub("（","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
			
 
				+            _text += re.sub("）",")",re.sub("（","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
			
 
				             LOOP_BEGIN += LOOP_LEN
			
 
				-    else:
			
 
				-        return text
			
 
				-    return _text
			
 
				+        text = _text
			
 
				+
			
 
				+    if final:
			
 
				+        text = re.sub("##space##"," ",text)
			
 
				+
			
 
				+    return text
			
 
				 
			
 
				 '''
			
 
				 #数据清洗