123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188 |
- #coding:utf-8
- import json
- import logging
- from BiddingKG.dl.table_head.pre_process import postgresql_util
# Per-user score looked up by ``update_user`` when two tables are judged to be
# duplicates: the table whose user has the lower score is the one discarded.
# NOTE(review): the scores look like per-annotator quality/accuracy figures --
# confirm where they come from before editing them.
user_score = {
    "test": 1.0,
    "test1": 0.83,
    "test11": 0.82,
    "test12": 0.74,
    "test16": 0.83,
    "test17": 0.77,
    "test19": 0.79,
    "test20": 0.82,
    "test21": 0.73,
    "test22": 0.64,
    "test25": 0.77,
    "test26": 0.8,
    "test27": 0.72,
    "test29": 0.8,
    "test3": 0.0,
    "test7": 0.82,
    "test8": 0.78,
    "test9": 0.8,
}
def get_labeled_table():
    """Load labelled tables from the database and parse them into Python objects.

    Rows of ``label_table_head_info`` with ``status = 0`` are fetched through
    ``postgresql_util``. Rows whose id appears in the local "not eval" file
    are skipped, as are rows whose ``table_text`` cannot be parsed. For the
    remaining rows ``post_label`` is used when it is non-empty, otherwise
    ``pre_label``.

    Returns:
        list: one ``[table_text, label_list, update_user, _id]`` entry per
        usable row, in the order the rows were returned by the query.
    """
    sql = """
    select id, update_user, table_text, pre_label, post_label
    from label_table_head_info where status = 0
    """
    result_list = postgresql_util(sql, limit=1000000)
    print("len(result_list)", len(result_list))

    # SECURITY: eval() executes arbitrary expressions. It is kept because the
    # stored format is a Python repr, but the file and the database columns
    # must be trusted. Consider ast.literal_eval if the data is pure literals.
    with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
        not_eval_table_list = eval(f.read())

    # A set gives O(1) membership tests across the (potentially huge) result
    # list. Assumes the ids are hashable (e.g. ints or strings).
    skip_ids = set(not_eval_table_list)

    table_list = []
    for table in result_list:
        _id = table[0]
        update_user = table[1]
        table_text = table[2]

        if _id in skip_ids:
            continue

        try:
            # Some rows store the table repr wrapped in an extra pair of
            # double quotes; unwrap that layer first.
            if table_text.startswith('"'):
                table_text = eval(table_text)
            # Backslashes would be read as escape sequences by the parser.
            table_text = table_text.replace('\\', '/')
            table_text = eval(table_text)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still work.
            print("无法识别table_text", _id)
            skip_ids.add(_id)
            continue

        # Labels are parsed only for rows that are actually kept.
        pre_label = eval(table[3])
        post_label = eval(table[4])
        label_list = post_label if post_label else pre_label

        table_list.append([table_text, label_list, update_user, _id])

    print("len(table_list)", len(table_list))
    return table_list
def table_distance(table1, table2, thresh=0.85):
    """Decide whether two tables are near-duplicates by comparing their cells.

    Both tables (lists of rows) are flattened and empty-string cells are
    dropped. The cells of ``table1`` are then visited in order, counting how
    many also occur anywhere in ``table2``. Ratios are taken against the
    larger of the two flattened lengths and rounded to two decimals.

    Args:
        table1: first table, an iterable of rows of cells.
        table2: second table, an iterable of rows of cells.
        thresh: matched-cell ratio at which the tables count as equal.

    Returns:
        int: 1 when the matched ratio reaches ``thresh`` before the
        mismatched ratio reaches ``1 - thresh``; otherwise 0 (including when
        ``table1`` has no non-empty cells).
    """
    # Flatten and drop empty cells in one pass each.
    cells1 = [col for row in table1 for col in row if col != ""]
    cells2 = [col for row in table2 for col in row if col != ""]

    # Loop invariants. ``denominator`` is only used inside the loop, which is
    # skipped when cells1 is empty, so it is never zero at the division.
    denominator = max(len(cells1), len(cells2))
    # NOTE: 1 - thresh is a binary float (for 0.85 it is slightly above 0.15),
    # so a rounded mismatch ratio of exactly 0.15 does not trigger the early
    # exit. This matches the long-standing behaviour and is left unchanged.
    mismatch_limit = 1 - thresh

    equal_cnt = 0
    not_equal_cnt = 0
    for cell in cells1:
        if cell in cells2:
            equal_cnt += 1
        else:
            not_equal_cnt += 1

        if round(equal_cnt / denominator, 2) >= thresh:
            return 1
        if round(not_equal_cnt / denominator, 2) >= mismatch_limit:
            return 0
    return 0
def remove_duplicate(table_list):
    """Drop tiny tables and near-duplicate tables from ``table_list``.

    Each entry is ``[table_text, label_list, update_user, _id]``. The list is
    sorted in place by ``table_text``; every pair of tables whose row and
    column counts differ by less than 2 is then compared with
    ``table_distance``. For a duplicate pair the table whose user has the
    lower ``user_score`` is discarded (unknown users score 0; ties keep the
    earlier table). Tables with at most 2 rows and at most 2 columns in
    their first row are always discarded; a table with no rows is treated the
    same way.

    Side effects: sorts ``table_list`` in place, and every 1000 outer
    iterations prints progress and writes the ids collected so far to a
    hard-coded file.

    Args:
        table_list: list of ``[table_text, label_list, update_user, _id]``.

    Returns:
        list: the surviving entries, in sorted order.
    """

    def _is_tiny(rows):
        # ``not rows`` guards the rows[0] access for a table with no rows.
        return len(rows) <= 2 and (not rows or len(rows[0]) <= 2)

    logging.info("into remove_duplicate")
    table_list.sort(key=lambda x: x[0])

    total = len(table_list)
    # A set keeps ids unique without re-deduplicating on every iteration and
    # makes the final filter O(1) per lookup.
    delete_ids = set()

    for i, table1 in enumerate(table_list):
        if i % 1000 == 0:
            print("Loop", i, "len(delete_table_id_list)", len(delete_ids))
            logging.info("*")
            # Periodic checkpoint of the ids marked for deletion so far.
            with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "w") as f:
                f.write(str(list(delete_ids)))

        rows1 = table1[0]
        if _is_tiny(rows1):
            delete_ids.add(table1[3])
            continue

        # Invariants of the inner loop.
        row_cnt1 = len(rows1)
        col_cnt1 = len(rows1[0])
        score1 = user_score.get(table1[2], 0.)

        for j in range(i + 1, total):
            table2 = table_list[j]
            rows2 = table2[0]
            if _is_tiny(rows2):
                delete_ids.add(table2[3])
                continue

            # Ignore pairs whose row counts differ by 2 or more.
            if abs(row_cnt1 - len(rows2)) >= 2:
                continue
            # Ignore pairs whose column counts differ by 2 or more. The
            # difference must be inside abs(); otherwise the filter only
            # applies when table1 is the wider table.
            if abs(col_cnt1 - len(rows2[0])) >= 2:
                continue

            if table_distance(rows1, rows2):
                print("equal", table1[3], table2[3])
                score2 = user_score.get(table2[2], 0.)
                if score1 >= score2:
                    delete_ids.add(table2[3])
                else:
                    delete_ids.add(table1[3])

    return [table for table in table_list if table[3] not in delete_ids]
def eval_table(_str):
    """Parse the stored text form of a table into a Python object.

    If the text starts with a double quote it is first evaluated once to
    unwrap the quoted string. Backslashes are then replaced with forward
    slashes and the result is evaluated.

    Args:
        _str: text holding the repr of a table, optionally wrapped in an
            extra pair of double quotes.

    Returns:
        The evaluated object, or ``""`` when the text cannot be parsed (a
        message is printed in that case).
    """
    # SECURITY: eval() executes arbitrary expressions; only call this on
    # trusted text. Consider ast.literal_eval if inputs are pure literals.
    try:
        table_text = eval(_str) if _str.startswith('"') else _str
        # Backslashes would be read as escape sequences by the parser.
        table_text = eval(table_text.replace('\\', '/'))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still work.
        print("无法识别table_text")
        table_text = ""
    return table_text
if __name__ == '__main__':
    # Pipeline: load the labelled tables, drop tiny/duplicate ones, then
    # write the JSON-encoded string form of the surviving list to disk.
    labeled_tables = get_labeled_table()
    unique_tables = remove_duplicate(labeled_tables)
    payload = json.dumps(str(unique_tables))
    with open(r"C:\Users\Administrator\Desktop\table_simplify.txt", "w") as f:
        f.write(payload)

    # Ad-hoc debugging snippets, kept disabled for reference.
    #
    # Compare two hand-written tables:
    # _str1 = "[['', '', 'Yes']]"
    # _str2 = "[['', '', 'Yes', '']]"
    # table1 = eval_table(_str1)
    # table2 = eval_table(_str2)
    #
    # print(table_distance(table1, table2))
    #
    # Inspect the list of ids that could not be evaluated:
    # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
    #     not_eval_table_list = f.read()
    # print(not_eval_table_list)
    # not_eval_table_list = eval(not_eval_table_list)
|