4 år sedan · 42f79e9b70
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -419,10 +419,10 @@ def tableToText(soup):
 
															                 inner_table[_h][_w][1] = 0
														
 
															-        print("=====")
														
 
															-        for item in inner_table:
														
 
															-            print(item)
														
 
															-        print("======")
														
 
															+        # print("=====")
														
 
															+        # for item in inner_table:
														
 
															+        #     print(item)
														
 
															+        # print("======")
														
 
															         repairTable(inner_table)
														
 
															         head_list = sliceTable(inner_table)
														
@@ -640,11 +640,7 @@ def tableToText(soup):
 
															             direct = getDirect(inner_table, head_begin, head_end)
														
 
															-            print("----")
														
 
															-            print(inner_table[head_begin:head_end])
														
 
															-            print("head_end-head_begin",head_end-head_begin)
														
 
															-            print(direct)
														
 
															-            
														
 
															+
														
 
															             #若只有一行，则直接按行读取
														
 
															             if head_end-head_begin==1:
														
 
															                 text_line = ""
														
@@ -668,12 +664,69 @@ def tableToText(soup):
 
															                     line_oc = []
														
 
															                     for j in range(width):
														
 
															                         cell = inner_table[i][j]
														
 
															-                        line_oc.append({"text":cell[0],"type":cell[1]})
														
 
															+                        line_oc.append({"text":cell[0],"type":cell[1],"occu_count":0,"left_head":"","top_head":""})
														
 
															                     table_occurence.append(line_oc)
														
 
															+                occu_height = len(table_occurence)
														
 
															+                occu_width = len(table_occurence[0])
														
 
															+                #为每个属性值寻找表头
														
 
															+                for i in range(occu_height):
														
 
															+                    for j in range(occu_width):
														
 
															+                        cell = table_occurence[i][j]
														
 
															+                        #是属性值
														
 
															+                        if cell["type"]==0 and cell["text"]!="":
														
 
															+                            left_head = ""
														
 
															+                            top_head = ""
														
 
															+
														
 
															+                            find_flag = False
														
 
															+                            temp_head = ""
														
 
															+                            for loop_i in range(1,i+1):
														
 
															+                                if not key_direct:
														
 
															+                                    key_values = [1,2]
														
 
															+                                else:
														
 
															+                                    key_values = [1]
														
 
															+                                if table_occurence[i-loop_i][j]["type"] in key_values:
														
 
															+                                    if find_flag:
														
 
															+                                        if table_occurence[i-loop_i][j][0]!=temp_head:
														
 
															+                                            top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
														
 
															+                                    else:
														
 
															+                                        top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
														
 
															+                                    find_flag = True
														
 
															+                                    temp_head = table_occurence[i-loop_i][j]["text"]
														
 
															+                                    table_occurence[i-loop_i][j]["occu_count"] += 1
														
 
															+                                else:
														
 
															+                                    #找到表头后遇到属性值就返回
														
 
															+                                    if find_flag:
														
 
															+                                        break
														
 
															+
														
 
															+
														
 
															+                            cell["top_head"] += top_head
														
 
															+                            find_flag = False
														
 
															+                            temp_head = ""
														
 
															+
														
 
															+
														
 
															+
														
 
															+                            for loop_j in range(1,j+1):
														
 
															+                                if not key_direct:
														
 
															+                                    key_values = [1,2]
														
 
															+                                else:
														
 
															+                                    key_values = [2]
														
 
															+                                if table_occurence[i][j-loop_j]["type"] in key_values:
														
 
															+                                    if find_flag:
														
 
															+                                        if table_occurence[i][j-loop_j]["text"]!=temp_head:
														
 
															+                                            left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
														
 
															+                                    else:
														
 
															+                                        left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
														
 
															+                                    find_flag = True
														
 
															+                                    temp_head = table_occurence[i][j-loop_j]["text"]
														
 
															+                                    table_occurence[i][j-loop_j]["occu_count"] += 1
														
 
															+                                else:
														
 
															+                                    if find_flag:
														
 
															+                                        break
														
 
															+                            cell["left_head"] += left_head
														
 
															                 if direct=="row":
														
 
															-                    for i in range(head_begin,head_end):
														
 
															+                    for i in range(occu_height):
														
 
															                         pack_text = ""
														
 
															                         rank_text = ""
														
 
															                         entity_text = ""
														
@@ -681,131 +734,195 @@ def tableToText(soup):
 
															                         #在同一句话中重复的可以去掉
														
 
															                         text_set = set()
														
 
															                         for j in range(width):
														
 
															-                            cell = inner_table[i][j]
														
 
															-                            #是属性值
														
 
															-                            if cell[1]==0 and cell[0]!="":
														
 
															-                                head = ""
														
 
															-                                
														
 
															-                                find_flag = False
														
 
															-                                temp_head = ""
														
 
															-                                for loop_i in range(0,i+1-head_begin):
														
 
															-                                    if not key_direct:
														
 
															-                                        key_values = [1,2]
														
 
															-                                    else:
														
 
															-                                        key_values = [1]
														
 
															-                                    if inner_table[i-loop_i][j][1] in key_values:
														
 
															-                                        if find_flag:
														
 
															-                                            if inner_table[i-loop_i][j][0]!=temp_head:
														
 
															-                                                head = inner_table[i-loop_i][j][0]+":"+head
														
 
															-                                        else:
														
 
															-                                            head = inner_table[i-loop_i][j][0]+":"+head
														
 
															-                                        find_flag = True
														
 
															-                                        temp_head = inner_table[i-loop_i][j][0]
														
 
															-                                    else:
														
 
															-                                        #找到表头后遇到属性值就返回
														
 
															-                                        if find_flag:
														
 
															-                                            break
														
 
															-                                
														
 
															-                                find_flag = False
														
 
															-                                temp_head = ""
														
 
															-                                
														
 
															-                                
														
 
															-                                
														
 
															-                                for loop_j in range(1,j+1):
														
 
															-                                    if not key_direct:
														
 
															-                                        key_values = [1,2]
														
 
															-                                    else:
														
 
															-                                        key_values = [2]
														
 
															-                                    if inner_table[i][j-loop_j][1] in key_values:
														
 
															-                                        if find_flag:
														
 
															-                                            if inner_table[i][j-loop_j][0]!=temp_head:
														
 
															-                                                head = inner_table[i][j-loop_j][0]+":"+head
														
 
															-                                        else:
														
 
															-                                            head = inner_table[i][j-loop_j][0]+":"+head
														
 
															-                                        find_flag = True
														
 
															-                                        temp_head = inner_table[i][j-loop_j][0]
														
 
															-                                    else:
														
 
															-                                        if find_flag:
														
 
															-                                            break
														
 
															-                                
														
 
															-                                if str(head+inner_table[i][j][0]) in text_set:
														
 
															+                            cell = table_occurence[i][j]
														
 
															+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
														
 
															+
														
 
															+                                cell = table_occurence[i][j]
														
 
															+                                head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
														
 
															+                                head += cell["left_head"]
														
 
															+                                if str(head+cell["text"]) in text_set:
														
 
															                                     continue
														
 
															                                 if re.search(packPattern,head) is not None:
														
 
															-                                    pack_text += head+inner_table[i][j][0]+"，"
														
 
															+                                    pack_text += head+cell["text"]+"，"
														
 
															                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题，if 改elif
														
 
															                                     #排名替换为同一种表达
														
 
															-                                    rank_text += head+inner_table[i][j][0]+"，"
														
 
															+                                    rank_text += head+cell["text"]+"，"
														
 
															                                     #print(rank_text)
														
 
															                                 elif re.search(entityPattern,head) is not None:
														
 
															-                                    entity_text += head+inner_table[i][j][0]+"，"
														
 
															+                                    entity_text += head+cell["text"]+"，"
														
 
															                                     #print(entity_text)
														
 
															                                 else:
														
 
															-                                    text_line += head+inner_table[i][j][0]+"，"
														
 
															-                                text_set.add(str(head+inner_table[i][j][0]))
														
 
															+                                    text_line += head+cell["text"]+"，"
														
 
															+                                text_set.add(str(head+cell["text"]))
														
 
															+
														
 
															                         text += pack_text+rank_text+entity_text+text_line
														
 
															                         text = text[:-1]+"。" if len(text)>0 else text
														
 
															+
														
 
															                 else:
														
 
															-                    for j in range(width):
														
 
															-                    
														
 
															+                    for j in range(occu_width):
														
 
															                         rank_text = ""
														
 
															                         entity_text = ""
														
 
															                         text_line = ""
														
 
															                         text_set = set()
														
 
															-                        for i in range(head_begin,head_end):
														
 
															-                            cell = inner_table[i][j]
														
 
															-                            #是属性值
														
 
															-                            if cell[1]==0 and cell[0]!="":
														
 
															-                                find_flag = False
														
 
															-                                head = ""
														
 
															-                                temp_head = ""
														
 
															-                                
														
 
															-                                for loop_j in range(1,j+1):
														
 
															-                                    if not key_direct:
														
 
															-                                        key_values = [1,2]
														
 
															-                                    else:
														
 
															-                                        key_values = [2]
														
 
															-                                    if inner_table[i][j-loop_j][1] in key_values:
														
 
															-                                        if find_flag:
														
 
															-                                            if inner_table[i][j-loop_j][0]!=temp_head:
														
 
															-                                                head = inner_table[i][j-loop_j][0]+":"+head
														
 
															-                                        else:
														
 
															-                                            head = inner_table[i][j-loop_j][0]+":"+head
														
 
															-                                        find_flag = True
														
 
															-                                        temp_head = inner_table[i][j-loop_j][0]
														
 
															-                                    else:
														
 
															-                                        if find_flag:
														
 
															-                                            break
														
 
															-                                find_flag = False
														
 
															-                                temp_head = ""
														
 
															-                                for loop_i in range(0,i+1-head_begin):
														
 
															-                                    if not key_direct:
														
 
															-                                        key_values = [1,2]
														
 
															-                                    else:
														
 
															-                                        key_values = [1]
														
 
															-                                    if inner_table[i-loop_i][j][1] in key_values:
														
 
															-                                        if find_flag:
														
 
															-                                            if inner_table[i-loop_i][j][0]!=temp_head:
														
 
															-                                                head = inner_table[i-loop_i][j][0]+":"+head
														
 
															-                                        else:
														
 
															-                                            head = inner_table[i-loop_i][j][0]+":"+head
														
 
															-                                        find_flag = True
														
 
															-                                        temp_head = inner_table[i-loop_i][j][0]
														
 
															-                                    else:
														
 
															-                                        if find_flag:
														
 
															-                                            break
														
 
															-                                if str(head+inner_table[i][j][0]) in text_set:
														
 
															+                        for i in range(occu_height):
														
 
															+                            cell = table_occurence[i][j]
														
 
															+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
														
 
															+
														
 
															+                                cell = table_occurence[i][j]
														
 
															+                                head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
														
 
															+                                head += cell["top_head"]
														
 
															+                                if str(head+cell["text"]) in text_set:
														
 
															                                     continue
														
 
															-                                if re.search(rankPattern,head) is not None:
														
 
															-                                    rank_text += head+inner_table[i][j][0]+"，"
														
 
															+                                if re.search(packPattern,head) is not None:
														
 
															+                                    pack_text += head+cell["text"]+"，"
														
 
															+                                elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题，if 改elif
														
 
															+                                    #排名替换为同一种表达
														
 
															+                                    rank_text += head+cell["text"]+"，"
														
 
															                                     #print(rank_text)
														
 
															                                 elif re.search(entityPattern,head) is not None:
														
 
															-                                    entity_text += head+inner_table[i][j][0]+"，"
														
 
															+                                    entity_text += head+cell["text"]+"，"
														
 
															                                     #print(entity_text)
														
 
															                                 else:
														
 
															-                                    text_line += head+inner_table[i][j][0]+"，"
														
 
															-                                text_set.add(str(head+inner_table[i][j][0]))
														
 
															-                        text += rank_text+entity_text+text_line
														
 
															+                                    text_line += head+cell["text"]+"，"
														
 
															+                                text_set.add(str(head+cell["text"]))
														
 
															+                        text += pack_text+rank_text+entity_text+text_line
														
 
															                         text = text[:-1]+"。" if len(text)>0 else text
														
 
															+
														
 
															+
														
 
															+                # if direct=="row":
														
 
															+                #     for i in range(head_begin,head_end):
														
 
															+                #         pack_text = ""
														
 
															+                #         rank_text = ""
														
 
															+                #         entity_text = ""
														
 
															+                #         text_line = ""
														
 
															+                #         #在同一句话中重复的可以去掉
														
 
															+                #         text_set = set()
														
 
															+                #         for j in range(width):
														
 
															+                #             cell = inner_table[i][j]
														
 
															+                #             #是属性值
														
 
															+                #             if cell[1]==0 and cell[0]!="":
														
 
															+                #                 head = ""
														
 
															+                #
														
 
															+                #                 find_flag = False
														
 
															+                #                 temp_head = ""
														
 
															+                #                 for loop_i in range(0,i+1-head_begin):
														
 
															+                #                     if not key_direct:
														
 
															+                #                         key_values = [1,2]
														
 
															+                #                     else:
														
 
															+                #                         key_values = [1]
														
 
															+                #                     if inner_table[i-loop_i][j][1] in key_values:
														
 
															+                #                         if find_flag:
														
 
															+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
														
 
															+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
														
 
															+                #                         else:
														
 
															+                #                             head = inner_table[i-loop_i][j][0]+":"+head
														
 
															+                #                         find_flag = True
														
 
															+                #                         temp_head = inner_table[i-loop_i][j][0]
														
 
															+                #                     else:
														
 
															+                #                         #找到表头后遇到属性值就返回
														
 
															+                #                         if find_flag:
														
 
															+                #                             break
														
 
															+                #
														
 
															+                #                 find_flag = False
														
 
															+                #                 temp_head = ""
														
 
															+                #
														
 
															+                #
														
 
															+                #
														
 
															+                #                 for loop_j in range(1,j+1):
														
 
															+                #                     if not key_direct:
														
 
															+                #                         key_values = [1,2]
														
 
															+                #                     else:
														
 
															+                #                         key_values = [2]
														
 
															+                #                     if inner_table[i][j-loop_j][1] in key_values:
														
 
															+                #                         if find_flag:
														
 
															+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
														
 
															+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
														
 
															+                #                         else:
														
 
															+                #                             head = inner_table[i][j-loop_j][0]+":"+head
														
 
															+                #                         find_flag = True
														
 
															+                #                         temp_head = inner_table[i][j-loop_j][0]
														
 
															+                #                     else:
														
 
															+                #                         if find_flag:
														
 
															+                #                             break
														
 
															+                #
														
 
															+                #                 if str(head+inner_table[i][j][0]) in text_set:
														
 
															+                #                     continue
														
 
															+                #                 if re.search(packPattern,head) is not None:
														
 
															+                #                     pack_text += head+inner_table[i][j][0]+"，"
														
 
															+                #                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题，if 改elif
														
 
															+                #                     #排名替换为同一种表达
														
 
															+                #                     rank_text += head+inner_table[i][j][0]+"，"
														
 
															+                #                     #print(rank_text)
														
 
															+                #                 elif re.search(entityPattern,head) is not None:
														
 
															+                #                     entity_text += head+inner_table[i][j][0]+"，"
														
 
															+                #                     #print(entity_text)
														
 
															+                #                 else:
														
 
															+                #                     text_line += head+inner_table[i][j][0]+"，"
														
 
															+                #                 text_set.add(str(head+inner_table[i][j][0]))
														
 
															+                #         text += pack_text+rank_text+entity_text+text_line
														
 
															+                #         text = text[:-1]+"。" if len(text)>0 else text
														
 
															+                # else:
														
 
															+                #     for j in range(width):
														
 
															+                #
														
 
															+                #         rank_text = ""
														
 
															+                #         entity_text = ""
														
 
															+                #         text_line = ""
														
 
															+                #         text_set = set()
														
 
															+                #         for i in range(head_begin,head_end):
														
 
															+                #             cell = inner_table[i][j]
														
 
															+                #             #是属性值
														
 
															+                #             if cell[1]==0 and cell[0]!="":
														
 
															+                #                 find_flag = False
														
 
															+                #                 head = ""
														
 
															+                #                 temp_head = ""
														
 
															+                #
														
 
															+                #                 for loop_j in range(1,j+1):
														
 
															+                #                     if not key_direct:
														
 
															+                #                         key_values = [1,2]
														
 
															+                #                     else:
														
 
															+                #                         key_values = [2]
														
 
															+                #                     if inner_table[i][j-loop_j][1] in key_values:
														
 
															+                #                         if find_flag:
														
 
															+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
														
 
															+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
														
 
															+                #                         else:
														
 
															+                #                             head = inner_table[i][j-loop_j][0]+":"+head
														
 
															+                #                         find_flag = True
														
 
															+                #                         temp_head = inner_table[i][j-loop_j][0]
														
 
															+                #                     else:
														
 
															+                #                         if find_flag:
														
 
															+                #                             break
														
 
															+                #                 find_flag = False
														
 
															+                #                 temp_head = ""
														
 
															+                #                 for loop_i in range(0,i+1-head_begin):
														
 
															+                #                     if not key_direct:
														
 
															+                #                         key_values = [1,2]
														
 
															+                #                     else:
														
 
															+                #                         key_values = [1]
														
 
															+                #                     if inner_table[i-loop_i][j][1] in key_values:
														
 
															+                #                         if find_flag:
														
 
															+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
														
 
															+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
														
 
															+                #                         else:
														
 
															+                #                             head = inner_table[i-loop_i][j][0]+":"+head
														
 
															+                #                         find_flag = True
														
 
															+                #                         temp_head = inner_table[i-loop_i][j][0]
														
 
															+                #                     else:
														
 
															+                #                         if find_flag:
														
 
															+                #                             break
														
 
															+                #                 if str(head+inner_table[i][j][0]) in text_set:
														
 
															+                #                     continue
														
 
															+                #                 if re.search(rankPattern,head) is not None:
														
 
															+                #                     rank_text += head+inner_table[i][j][0]+"，"
														
 
															+                #                     #print(rank_text)
														
 
															+                #                 elif re.search(entityPattern,head) is not None:
														
 
															+                #                     entity_text += head+inner_table[i][j][0]+"，"
														
 
															+                #                     #print(entity_text)
														
 
															+                #                 else:
														
 
															+                #                     text_line += head+inner_table[i][j][0]+"，"
														
 
															+                #                 text_set.add(str(head+inner_table[i][j][0]))
														
 
															+                #         text += rank_text+entity_text+text_line
														
 
															+                #         text = text[:-1]+"。" if len(text)>0 else text
														
 
															         return text
														
 
															     def removeFix(inner_table,fix_value="~~"):
														
@@ -955,31 +1072,22 @@ def segment(soup):
 
															     text = text.replace('"',"“").replace("\r","").replace("\n","，")
														
 
															     text = re.sub("\s{4,}","，",text)   
														
 
															     #替换标点
														
 
															-    while(True):
														
 
															-        #替换连续的标点
														
 
															-        punc = re.search("，(?P<punc>：|。|，|；)\s*",text)
														
 
															-        if punc is not None:
														
 
															-            text = re.sub("，"+punc.group("punc")+"\s*",punc.group("punc"),text)
														
 
															+
														
 
															+    #替换连续的标点
														
 
															+    punc_pattern = "(?P<del>[。，；：:，]+)\s*"
														
 
															+    for punc_del in re.findall(punc_pattern,text):
														
 
															+        if len(punc_del)>1:
														
 
															+            text = re.sub(punc_del+"\s*",punc_del[-1],text)
														
 
															+
														
 
															+    for punc_del in re.findall(punc_pattern,text):
														
 
															+        if len(punc_del)>1:
														
 
															+            text = re.sub(punc_del+"\s*",punc_del[-1],text)
														
 
															-        punc = re.search("(?P<punc>：|。|，|；)\s*，",text)
														
 
															-        if punc is not None:
														
 
															-            text = re.sub(punc.group("punc")+"\s*，",punc.group("punc"),text)
														
 
															-        else:
														
 
															-            #替换标点之后的空格
														
 
															-            punc = re.search("(?P<punc>：|。|，|；)\s+",text)
														
 
															-            if punc is not None:
														
 
															-                text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
														
 
															-            else:
														
 
															-                break
														
 
															+
														
 
															     #将连续的中文句号替换为一个
														
 
															     text_split = text.split("。")
														
 
															     text_split = [x for x in text_split if len(x)>0]
														
 
															-    list_text = []
														
 
															-    # for _t in text_split:
														
 
															-    #     list_text.append(re.sub("）",")",re.sub("（","(",re.sub("\s*","",_t))))
														
 
															     text = "。".join(text_split)
														
 
															-    # text = text.replace('）',")").replace("（","(").replace("\s","")
														
 
															-    #删除所有空格
														
 
															     # text过大报错
														
 
															     LOOP_LEN = 10000
														
 
															     LOOP_BEGIN = 0
														
@@ -990,10 +1098,6 @@ def segment(soup):
 
															             LOOP_BEGIN += LOOP_LEN
														
 
															     else:
														
 
															         return text
														
 
															-    # text = re.sub("\s*","",text)
														
 
															-    # #替换中文括号为英文括号
														
 
															-    # text = re.sub("（","(",text)
														
 
															-    # text = re.sub("）",")",text)
														
 
															     return _text
														
 
															 '''
														
--- a/BiddingKG/dl/interface/modelFactory.py
+++ b/BiddingKG/dl/interface/modelFactory.py
@@ -195,8 +195,8 @@ class Model_person_classify():
 
															     '''
														
 
															     def encode(self,tokens,begin_index,end_index,**kwargs):
														
 
															+        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=10),shape=(2,10,128))
														
 
															         return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
														
 
															-        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
														
 
															     def predict(self,x):
														
 
															         x = np.transpose(np.array(x),(1,0,2,3))
														
--- a/BiddingKG/maxcompute/documentDumplicate.py
+++ b/BiddingKG/maxcompute/documentDumplicate.py
@@ -190,7 +190,8 @@ class f_set_docid(BaseUDAF):
 
															                 _set_column = set()
														
 
															                 _set_tenderee = set()
														
 
															                 for j in range(_begin,i+1):
														
 
															-                    _set_tenderee.add(list_docs[j]["tenderee"])
														
 
															+                    if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
														
 
															+                        _set_tenderee.add(list_docs[j]["tenderee"])
														
 
															                     _set_column.add(list_docs[j]["defind_column"])
														
 
															                     _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})