Эх сурвалжийг харах

保留单个的空格以解决预处理中时间被分割的问题,联系人去重maxcompute代码

rogel 4 жил өмнө
parent
commit
b39580350c

+ 21 - 16
BiddingKG/dl/interface/Preprocessing.py

@@ -107,7 +107,7 @@ def tableToText(soup):
             tr_line = []
             tds = tr.findChildren(['td','th'], recursive=False)
             for td in tds:
-                tr_line.append([re.sub('\xa0','',segment(td)),0])
+                tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
                 #tr_line.append([td.get_text(),0])
             inner_table.append(tr_line)
         return inner_table                          
@@ -988,12 +988,13 @@ def tableToText(soup):
     # return list_innerTable
 
 #数据清洗
-def segment(soup):
-    print("==")
-    print(soup)
-    print("====")
+def segment(soup,final=True):
+    # print("==")
+    # print(soup)
+    # print("====")
     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
-    if soup.name=="td":
+    subspaceList = ["td",'a',"span","p"]
+    if soup.name in subspaceList:
         #判断有值叶子节点数
         _count = 0
         for child in soup.find_all(recursive=True):
@@ -1018,15 +1019,14 @@ def segment(soup):
             #         _substr = ""
             # else:
             #     _substr = ""
-            # text = _substr.join(re.split("(\s+)",text))
             text = text.replace("\r\n",",").replace("\n",",")
-            # text = re.sub("^[,\s]*|[,\s]*$","",text)
+            text = re.sub("\s+","##space##",text)
             return text
     segList = ["title"]
     commaList = ["div","br","td","p"]
     #commaList = []
     spaceList = ["span"]
-    subspaceList = ["td",'a',"span","p"]
+
     tbodies = soup.find_all('tbody')
     if len(tbodies) == 0:
         tbodies = soup.find_all('table')
@@ -1040,8 +1040,8 @@ def segment(soup):
         # if child.name in subspaceList:
         #     child.insert_before("#subs"+str(child.name)+"#")
         #     child.insert_after("#sube"+str(child.name)+"#")
-        if child.name in spaceList:
-            child.insert_after(" ")
+        # if child.name in spaceList:
+        #     child.insert_after(" ")
     text = str(soup.get_text())
 
     #替换英文冒号为中文冒号
@@ -1060,7 +1060,7 @@ def segment(soup):
 
     #替换连续的标点
 
-    punc_pattern = "(?P<del>[。,;::,\s]{2,})"
+    punc_pattern = "(?P<del>[。,;::,\s]+)"
 
     list_punc = re.findall(punc_pattern,text)
     list_punc.sort(key=lambda x:len(x),reverse=True)
@@ -1090,13 +1090,18 @@ def segment(soup):
     LOOP_BEGIN = 0
     _text = ""
 
+
+
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
-    else:
-        return text
-    return _text
+        text = _text
+
+    if final:
+        text = re.sub("##space##"," ",text)
+
+    return text
 
 '''
 #数据清洗