#coding:utf-8
import json
import logging
from BiddingKG.dl.table_head.pre_process import postgresql_util

# Score per labelling user.  remove_duplicate() uses it to decide which of two
# near-identical tables to keep: the table of the higher-scored user wins and
# users missing from this mapping count as 0.
# NOTE(review): presumably a labelling-quality score -- confirm with the owner.
user_score = {
    "test": 1.,
    "test1": 0.83,
    "test11": 0.82,
    "test12": 0.74,
    "test16": 0.83,
    "test17": 0.77,
    "test19": 0.79,
    "test20": 0.82,
    "test21": 0.73,
    "test22": 0.64,
    "test25": 0.77,
    "test26": 0.80,
    "test27": 0.72,
    "test29": 0.8,
    "test3": 0.,
    "test7": 0.82,
    "test8": 0.78,
    "test9": 0.80,
}


def _parse_table_text(raw):
    """Turn the stored textual form of a table into a Python object.

    If the text starts with a double quote it is evaluated once to unwrap
    the quoted string; backslashes are then replaced by forward slashes and
    the result is evaluated.  Any exception is propagated to the caller.
    """
    # NOTE(review): eval() executes arbitrary expressions.  If this text can
    # come from an untrusted source, consider ast.literal_eval instead.
    text = eval(raw) if raw[0] == '"' else raw
    text = text.replace('\\', '/')
    return eval(text)


def get_labeled_table():
    """Load the labelled tables with status = 0 and parse them.

    Rows whose id is listed in the local ``table_not_eval.txt`` file are
    skipped, as are rows whose table text cannot be parsed (their id is
    printed and appended to the in-memory not-eval list).

    Returns:
        list: one ``[table_text, label_list, update_user, _id]`` entry per
        kept row, where ``label_list`` is the parsed ``post_label`` when it
        is truthy and the parsed ``pre_label`` otherwise.
    """
    sql = """ select id, update_user, table_text, pre_label, post_label from label_table_head_info where status = 0 """
    result_list = postgresql_util(sql, limit=1000000)
    print("len(result_list)", len(result_list))

    # NOTE(review): eval() on file content -- see _parse_table_text.
    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
        not_eval_table_list = eval(f.read())

    table_list = []
    # not_eval_table_list = []
    for table in result_list:
        _id = table[0]
        # Skip known-bad rows before spending time evaluating their labels.
        if _id in not_eval_table_list:
            continue

        update_user = table[1]
        pre_label = eval(table[3])
        post_label = eval(table[4])

        try:
            table_text = _parse_table_text(table[2])
        except Exception:
            print("无法识别table_text", _id)
            not_eval_table_list.append(_id)
            continue

        label_list = post_label if post_label else pre_label
        table_list.append([table_text, label_list, update_user, _id])

    print("len(table_list)", len(table_list))

    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "w") as f:
    #     f.write(str(not_eval_table_list))
    return table_list


def table_distance(table1, table2, thresh=0.85):
    """Decide whether two tables are near-duplicates.

    Both tables (sequences of rows) are flattened and empty-string cells are
    dropped.  The cells of ``table1`` are then scanned in order; the scan
    stops as soon as either ratio, rounded to two decimals and taken against
    the larger flattened length, crosses its limit.

    Args:
        table1: first table, an iterable of rows of cells.
        table2: second table, same shape convention.
        thresh: share of matching cells required to call the tables equal.

    Returns:
        int: 1 when the share of ``table1`` cells found in ``table2``
        reaches ``thresh``; 0 when the share of missing cells reaches
        ``1 - thresh`` first, or when neither limit is reached.
    """
    cells1 = [col for row in table1 for col in row if col != ""]
    cells2 = [col for row in table2 for col in row if col != ""]

    # Only used inside the loop, which never runs when cells1 is empty, so
    # the divisions below cannot see a zero denominator.
    denom = max(len(cells1), len(cells2))

    equal_cnt = 0
    not_equal_cnt = 0
    for cell in cells1:
        if cell in cells2:
            equal_cnt += 1
        else:
            not_equal_cnt += 1
        if round(equal_cnt / denom, 2) >= thresh:
            return 1
        if round(not_equal_cnt / denom, 2) >= 1 - thresh:
            return 0
    return 0


def _is_tiny_table(table):
    """Return True for a table with at most 2 rows whose first row has at most 2 cells.

    An empty table also counts as tiny, so it is dropped instead of raising
    IndexError on ``table[0]``.
    """
    return len(table) <= 2 and (not table or len(table[0]) <= 2)


def remove_duplicate(table_list):
    """Drop tiny tables and near-duplicate tables from ``table_list``.

    ``table_list`` holds ``[table_text, label_list, update_user, _id]``
    entries and is sorted in place by ``table_text``.  Every pair of tables
    whose row counts and first-row column counts each differ by less than 2
    is compared with table_distance(); for a matching pair the table of the
    lower-scored user (see ``user_score``) is marked for deletion, and on a
    tie the later one is.  Every 1000 outer iterations the ids collected so
    far are written to ``table_delete.txt``.

    Args:
        table_list: list of table entries as described above.

    Returns:
        list: the entries whose id was not marked for deletion.
    """
    logging.info("into remove_duplicate")
    table_list.sort(key=lambda x: x[0])

    # A set gives O(1) membership tests and makes repeated de-duplication
    # of the collected ids unnecessary.
    delete_table_ids = set()
    total = len(table_list)
    for i in range(total):
        if i % 1000 == 0:
            print("Loop", i, "len(delete_table_id_list)", len(delete_table_ids))
            logging.info("*")
            # Progress checkpoint; written as a list repr.
            with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "w") as f:
                f.write(str(list(delete_table_ids)))

        table1 = table_list[i]
        if _is_tiny_table(table1[0]):
            delete_table_ids.add(table1[3])
            continue

        row_cnt1 = len(table1[0])
        col_cnt1 = len(table1[0][0])
        score1 = user_score.get(table1[2], 0.)

        for j in range(i + 1, total):
            table2 = table_list[j]
            if _is_tiny_table(table2[0]):
                delete_table_ids.add(table2[3])
                continue

            # Skip pairs whose row counts differ by 2 or more.
            if abs(row_cnt1 - len(table2[0])) >= 2:
                continue
            # Skip pairs whose column counts differ by 2 or more.  The
            # original wrapped only the first length in abs(), so the filter
            # never fired when table2 was the wider table.
            if abs(col_cnt1 - len(table2[0][0])) >= 2:
                continue

            if table_distance(table1[0], table2[0]):
                print("equal", table1[3], table2[3])
                score2 = user_score.get(table2[2], 0.)
                if score1 >= score2:
                    delete_table_ids.add(table2[3])
                else:
                    delete_table_ids.add(table1[3])

    return [table for table in table_list if table[3] not in delete_table_ids]


def eval_table(_str):
    """Parse the textual form of a table, returning "" when it cannot be parsed.

    Uses the same parsing steps as get_labeled_table(); on any failure the
    message "无法识别table_text" is printed and an empty string is returned.
    """
    try:
        return _parse_table_text(_str)
    except Exception:
        print("无法识别table_text")
        return ""


if __name__ == '__main__':
    _list = get_labeled_table()
    _list = remove_duplicate(_list)
    # The result is stored as a JSON string wrapping the Python repr of the list.
    _str = json.dumps(str(_list))
    with open(r"C:\Users\Administrator\Desktop\table_simplify.txt", "w") as f:
        f.write(_str)

    # _str1 = "[['', '', 'Yes']]"
    # _str2 = "[['', '', 'Yes', '']]"
    # table1 = eval_table(_str1)
    # table2 = eval_table(_str2)
    #
    # print(table_distance(table1, table2))

    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
    #     not_eval_table_list = f.read()
    #     print(not_eval_table_list)
    #     not_eval_table_list = eval(not_eval_table_list)