Ver Fonte

解决预处理中表格数据可能漏掉的问题以及标点符号没去干净的问题

rogel há 4 anos atrás
pai
commit
42f79e9b70

+ 243 - 139
BiddingKG/dl/interface/Preprocessing.py

@@ -419,10 +419,10 @@ def tableToText(soup):
                 inner_table[_h][_w][1] = 0
 
 
-        print("=====")
-        for item in inner_table:
-            print(item)
-        print("======")
+        # print("=====")
+        # for item in inner_table:
+        #     print(item)
+        # print("======")
 
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
@@ -640,11 +640,7 @@ def tableToText(soup):
                 
             direct = getDirect(inner_table, head_begin, head_end)
 
-            print("----")
-            print(inner_table[head_begin:head_end])
-            print("head_end-head_begin",head_end-head_begin)
-            print(direct)
-            
+
             #若只有一行,则直接按行读取
             if head_end-head_begin==1:
                 text_line = ""
@@ -668,12 +664,69 @@ def tableToText(soup):
                     line_oc = []
                     for j in range(width):
                         cell = inner_table[i][j]
-                        line_oc.append({"text":cell[0],"type":cell[1]})
+                        line_oc.append({"text":cell[0],"type":cell[1],"occu_count":0,"left_head":"","top_head":""})
                     table_occurence.append(line_oc)
 
 
+                occu_height = len(table_occurence)
+                occu_width = len(table_occurence[0])
+                #为每个属性值寻找表头
+                for i in range(occu_height):
+                    for j in range(occu_width):
+                        cell = table_occurence[i][j]
+                        #是属性值
+                        if cell["type"]==0 and cell["text"]!="":
+                            left_head = ""
+                            top_head = ""
+
+                            find_flag = False
+                            temp_head = ""
+                            for loop_i in range(1,i+1):
+                                if not key_direct:
+                                    key_values = [1,2]
+                                else:
+                                    key_values = [1]
+                                if table_occurence[i-loop_i][j]["type"] in key_values:
+                                    if find_flag:
+                                        if table_occurence[i-loop_i][j][0]!=temp_head:
+                                            top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
+                                    else:
+                                        top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
+                                    find_flag = True
+                                    temp_head = table_occurence[i-loop_i][j]["text"]
+                                    table_occurence[i-loop_i][j]["occu_count"] += 1
+                                else:
+                                    #找到表头后遇到属性值就返回
+                                    if find_flag:
+                                        break
+
+
+                            cell["top_head"] += top_head
+                            find_flag = False
+                            temp_head = ""
+
+
+
+                            for loop_j in range(1,j+1):
+                                if not key_direct:
+                                    key_values = [1,2]
+                                else:
+                                    key_values = [2]
+                                if table_occurence[i][j-loop_j]["type"] in key_values:
+                                    if find_flag:
+                                        if table_occurence[i][j-loop_j]["text"]!=temp_head:
+                                            left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
+                                    else:
+                                        left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
+                                    find_flag = True
+                                    temp_head = table_occurence[i][j-loop_j]["text"]
+                                    table_occurence[i][j-loop_j]["occu_count"] += 1
+                                else:
+                                    if find_flag:
+                                        break
+                            cell["left_head"] += left_head
                 if direct=="row":
-                    for i in range(head_begin,head_end):
+                    for i in range(occu_height):
                         pack_text = ""
                         rank_text = ""
                         entity_text = ""
@@ -681,131 +734,195 @@ def tableToText(soup):
                         #在同一句话中重复的可以去掉
                         text_set = set()
                         for j in range(width):
-                            cell = inner_table[i][j]
-                            #是属性值
-                            if cell[1]==0 and cell[0]!="":
-                                head = ""
-                                
-                                find_flag = False
-                                temp_head = ""
-                                for loop_i in range(0,i+1-head_begin):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [1]
-                                    if inner_table[i-loop_i][j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i-loop_i][j][0]!=temp_head:
-                                                head = inner_table[i-loop_i][j][0]+":"+head
-                                        else:
-                                            head = inner_table[i-loop_i][j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i-loop_i][j][0]
-                                    else:
-                                        #找到表头后遇到属性值就返回
-                                        if find_flag:
-                                            break
-                                
-                                find_flag = False
-                                temp_head = ""
-                                
-                                
-                                
-                                for loop_j in range(1,j+1):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [2]
-                                    if inner_table[i][j-loop_j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i][j-loop_j][0]!=temp_head:
-                                                head = inner_table[i][j-loop_j][0]+":"+head
-                                        else:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i][j-loop_j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                
-                                if str(head+inner_table[i][j][0]) in text_set:
+                            cell = table_occurence[i][j]
+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
+
+                                cell = table_occurence[i][j]
+                                head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
+                                head += cell["left_head"]
+                                if str(head+cell["text"]) in text_set:
                                     continue
                                 if re.search(packPattern,head) is not None:
-                                    pack_text += head+inner_table[i][j][0]+","
+                                    pack_text += head+cell["text"]+","
                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                                     #排名替换为同一种表达
-                                    rank_text += head+inner_table[i][j][0]+","
+                                    rank_text += head+cell["text"]+","
                                     #print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
-                                    entity_text += head+inner_table[i][j][0]+","
+                                    entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
-                                    text_line += head+inner_table[i][j][0]+","
-                                text_set.add(str(head+inner_table[i][j][0]))
+                                    text_line += head+cell["text"]+","
+                                text_set.add(str(head+cell["text"]))
+
                         text += pack_text+rank_text+entity_text+text_line
                         text = text[:-1]+"。" if len(text)>0 else text
+
                 else:
-                    for j in range(width):
-                    
+                    for j in range(occu_width):
                         rank_text = ""
                         entity_text = ""
                         text_line = ""
                         text_set = set()
-                        for i in range(head_begin,head_end):
-                            cell = inner_table[i][j]
-                            #是属性值
-                            if cell[1]==0 and cell[0]!="":
-                                find_flag = False
-                                head = ""
-                                temp_head = ""
-                                
-                                for loop_j in range(1,j+1):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [2]
-                                    if inner_table[i][j-loop_j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i][j-loop_j][0]!=temp_head:
-                                                head = inner_table[i][j-loop_j][0]+":"+head
-                                        else:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i][j-loop_j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                find_flag = False
-                                temp_head = ""
-                                for loop_i in range(0,i+1-head_begin):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [1]
-                                    if inner_table[i-loop_i][j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i-loop_i][j][0]!=temp_head:
-                                                head = inner_table[i-loop_i][j][0]+":"+head
-                                        else:
-                                            head = inner_table[i-loop_i][j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i-loop_i][j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                if str(head+inner_table[i][j][0]) in text_set:
+                        for i in range(occu_height):
+                            cell = table_occurence[i][j]
+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
+
+                                cell = table_occurence[i][j]
+                                head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
+                                head += cell["top_head"]
+                                if str(head+cell["text"]) in text_set:
                                     continue
-                                if re.search(rankPattern,head) is not None:
-                                    rank_text += head+inner_table[i][j][0]+","
+                                if re.search(packPattern,head) is not None:
+                                    pack_text += head+cell["text"]+","
+                                elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
+                                    #排名替换为同一种表达
+                                    rank_text += head+cell["text"]+","
                                     #print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
-                                    entity_text += head+inner_table[i][j][0]+","
+                                    entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
-                                    text_line += head+inner_table[i][j][0]+","
-                                text_set.add(str(head+inner_table[i][j][0]))
-                        text += rank_text+entity_text+text_line
+                                    text_line += head+cell["text"]+","
+                                text_set.add(str(head+cell["text"]))
+                        text += pack_text+rank_text+entity_text+text_line
                         text = text[:-1]+"。" if len(text)>0 else text
+
+
+                # if direct=="row":
+                #     for i in range(head_begin,head_end):
+                #         pack_text = ""
+                #         rank_text = ""
+                #         entity_text = ""
+                #         text_line = ""
+                #         #在同一句话中重复的可以去掉
+                #         text_set = set()
+                #         for j in range(width):
+                #             cell = inner_table[i][j]
+                #             #是属性值
+                #             if cell[1]==0 and cell[0]!="":
+                #                 head = ""
+                #
+                #                 find_flag = False
+                #                 temp_head = ""
+                #                 for loop_i in range(0,i+1-head_begin):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [1]
+                #                     if inner_table[i-loop_i][j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i-loop_i][j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i-loop_i][j][0]
+                #                     else:
+                #                         #找到表头后遇到属性值就返回
+                #                         if find_flag:
+                #                             break
+                #
+                #                 find_flag = False
+                #                 temp_head = ""
+                #
+                #
+                #
+                #                 for loop_j in range(1,j+1):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [2]
+                #                     if inner_table[i][j-loop_j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i][j-loop_j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i][j-loop_j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #
+                #                 if str(head+inner_table[i][j][0]) in text_set:
+                #                     continue
+                #                 if re.search(packPattern,head) is not None:
+                #                     pack_text += head+inner_table[i][j][0]+","
+                #                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
+                #                     #排名替换为同一种表达
+                #                     rank_text += head+inner_table[i][j][0]+","
+                #                     #print(rank_text)
+                #                 elif re.search(entityPattern,head) is not None:
+                #                     entity_text += head+inner_table[i][j][0]+","
+                #                     #print(entity_text)
+                #                 else:
+                #                     text_line += head+inner_table[i][j][0]+","
+                #                 text_set.add(str(head+inner_table[i][j][0]))
+                #         text += pack_text+rank_text+entity_text+text_line
+                #         text = text[:-1]+"。" if len(text)>0 else text
+                # else:
+                #     for j in range(width):
+                #
+                #         rank_text = ""
+                #         entity_text = ""
+                #         text_line = ""
+                #         text_set = set()
+                #         for i in range(head_begin,head_end):
+                #             cell = inner_table[i][j]
+                #             #是属性值
+                #             if cell[1]==0 and cell[0]!="":
+                #                 find_flag = False
+                #                 head = ""
+                #                 temp_head = ""
+                #
+                #                 for loop_j in range(1,j+1):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [2]
+                #                     if inner_table[i][j-loop_j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i][j-loop_j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i][j-loop_j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #                 find_flag = False
+                #                 temp_head = ""
+                #                 for loop_i in range(0,i+1-head_begin):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [1]
+                #                     if inner_table[i-loop_i][j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i-loop_i][j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i-loop_i][j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #                 if str(head+inner_table[i][j][0]) in text_set:
+                #                     continue
+                #                 if re.search(rankPattern,head) is not None:
+                #                     rank_text += head+inner_table[i][j][0]+","
+                #                     #print(rank_text)
+                #                 elif re.search(entityPattern,head) is not None:
+                #                     entity_text += head+inner_table[i][j][0]+","
+                #                     #print(entity_text)
+                #                 else:
+                #                     text_line += head+inner_table[i][j][0]+","
+                #                 text_set.add(str(head+inner_table[i][j][0]))
+                #         text += rank_text+entity_text+text_line
+                #         text = text[:-1]+"。" if len(text)>0 else text
         return text
     
     def removeFix(inner_table,fix_value="~~"):
@@ -955,31 +1072,22 @@ def segment(soup):
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
     text = re.sub("\s{4,}",",",text)   
     #替换标点
-    while(True):
-        #替换连续的标点
-        punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
-        if punc is not None:
-            text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
+
+    #替换连续的标点
+    punc_pattern = "(?P<del>[。,;::,]+)\s*"
+    for punc_del in re.findall(punc_pattern,text):
+        if len(punc_del)>1:
+            text = re.sub(punc_del+"\s*",punc_del[-1],text)
+
+    for punc_del in re.findall(punc_pattern,text):
+        if len(punc_del)>1:
+            text = re.sub(punc_del+"\s*",punc_del[-1],text)
         
-        punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
-        if punc is not None:
-            text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
-        else:
-            #替换标点之后的空格
-            punc = re.search("(?P<punc>:|。|,|;)\s+",text)
-            if punc is not None:
-                text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
-            else:
-                break
+
     #将连续的中文句号替换为一个
     text_split = text.split("。")
     text_split = [x for x in text_split if len(x)>0]
-    list_text = []
-    # for _t in text_split:
-    #     list_text.append(re.sub(")",")",re.sub("(","(",re.sub("\s*","",_t))))
     text = "。".join(text_split)
-    # text = text.replace(')',")").replace("(","(").replace("\s","")
-    #删除所有空格
     # text过大报错
     LOOP_LEN = 10000
     LOOP_BEGIN = 0
@@ -990,10 +1098,6 @@ def segment(soup):
             LOOP_BEGIN += LOOP_LEN
     else:
         return text
-    # text = re.sub("\s*","",text)
-    # #替换中文括号为英文括号
-    # text = re.sub("(","(",text)
-    # text = re.sub(")",")",text)
     return _text
 
 '''

+ 1 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -195,8 +195,8 @@ class Model_person_classify():
     '''
     
     def encode(self,tokens,begin_index,end_index,**kwargs):
+        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=10),shape=(2,10,128))
         return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
-        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
 
     def predict(self,x):
         x = np.transpose(np.array(x),(1,0,2,3))

+ 2 - 1
BiddingKG/maxcompute/documentDumplicate.py

@@ -190,7 +190,8 @@ class f_set_docid(BaseUDAF):
                 _set_column = set()
                 _set_tenderee = set()
                 for j in range(_begin,i+1):
-                    _set_tenderee.add(list_docs[j]["tenderee"])
+                    if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
+                        _set_tenderee.add(list_docs[j]["tenderee"])
                     _set_column.add(list_docs[j]["defind_column"])
                     _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})