# table_simplify.py (5.6 KB)
  1. #coding:utf-8
  2. import json
  3. import logging
  4. from BiddingKG.dl.table_head.pre_process import postgresql_util
  5. user_score = {
  6. "test": 1.,
  7. "test1": 0.83,
  8. "test11": 0.82,
  9. "test12": 0.74,
  10. "test16": 0.83,
  11. "test17": 0.77,
  12. "test19": 0.79,
  13. "test20": 0.82,
  14. "test21": 0.73,
  15. "test22": 0.64,
  16. "test25": 0.77,
  17. "test26": 0.80,
  18. "test27": 0.72,
  19. "test29": 0.8,
  20. "test3": 0.,
  21. "test7": 0.82,
  22. "test8": 0.78,
  23. "test9": 0.80,
  24. }
  25. def get_labeled_table():
  26. sql = """
  27. select id, update_user, table_text, pre_label, post_label
  28. from label_table_head_info where status = 0
  29. """
  30. result_list = postgresql_util(sql, limit=1000000)
  31. print("len(result_list)", len(result_list))
  32. with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
  33. not_eval_table_list = f.read()
  34. not_eval_table_list = eval(not_eval_table_list)
  35. table_list = []
  36. # not_eval_table_list = []
  37. for table in result_list:
  38. pre_label = eval(table[3])
  39. post_label = eval(table[4])
  40. _id = table[0]
  41. update_user = table[1]
  42. table_text = table[2]
  43. if _id in not_eval_table_list:
  44. continue
  45. try:
  46. if table_text[0] == '"':
  47. table_text = eval(table_text)
  48. else:
  49. table_text = table_text
  50. table_text = table_text.replace('\\', '/')
  51. table_text = eval(table_text)
  52. except:
  53. print("无法识别table_text", _id)
  54. not_eval_table_list.append(_id)
  55. continue
  56. if post_label:
  57. label_list = post_label
  58. else:
  59. label_list = pre_label
  60. table_list.append([table_text, label_list, update_user, _id])
  61. print("len(table_list)", len(table_list))
  62. # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "w") as f:
  63. # f.write(str(not_eval_table_list))
  64. return table_list
  65. def table_distance(table1, table2, thresh=0.85):
  66. # flatten
  67. table1 = [col for row in table1 for col in row]
  68. table2 = [col for row in table2 for col in row]
  69. while "" in table1:
  70. table1.remove("")
  71. while "" in table2:
  72. table2.remove("")
  73. equal_cnt = 0
  74. not_equal_cnt = 0
  75. equal_flag = 0
  76. for col1 in table1:
  77. find_flag = 0
  78. for col2 in table2:
  79. if col1 == col2:
  80. equal_cnt += 1
  81. find_flag = 1
  82. break
  83. if not find_flag:
  84. not_equal_cnt += 1
  85. # print(equal_cnt, not_equal_cnt)
  86. if round(equal_cnt / max(len(table1), len(table2)), 2) >= thresh:
  87. # print("> thresh")
  88. equal_flag = 1
  89. break
  90. if round(not_equal_cnt / max(len(table1), len(table2)), 2) >= 1-thresh:
  91. # print("> 1-thresh")
  92. equal_flag = 0
  93. break
  94. return equal_flag
  95. def remove_duplicate(table_list):
  96. logging.info("into remove_duplicate")
  97. table_list.sort(key=lambda x: x[0])
  98. delete_table_id_list = []
  99. for i in range(len(table_list)):
  100. delete_table_id_list = list(set(delete_table_id_list))
  101. if i % 1000 == 0:
  102. print("Loop", i, "len(delete_table_id_list)", len(delete_table_id_list))
  103. logging.info("*")
  104. with open(r"C:\Users\Administrator\Desktop\table_delete.txt", "w") as f:
  105. f.write(str(delete_table_id_list))
  106. table1 = table_list[i]
  107. if len(table1[0]) <= 2 and len(table1[0][0]) <= 2:
  108. delete_table_id_list.append(table1[3])
  109. continue
  110. for j in range(i+1, len(table_list)):
  111. table2 = table_list[j]
  112. if len(table2[0]) <= 2 and len(table2[0][0]) <= 2:
  113. delete_table_id_list.append(table2[3])
  114. continue
  115. # 行数相差2以上忽略
  116. if abs(len(table1[0]) - len(table2[0])) >= 2:
  117. continue
  118. # 列数相差2以上忽略
  119. if abs(len(table1[0][0])) - len(table2[0][0]) >= 2:
  120. continue
  121. if table_distance(table1[0], table2[0]):
  122. print("equal", table1[3], table2[3])
  123. score1 = user_score.get(table1[2])
  124. score2 = user_score.get(table2[2])
  125. if score1 is None:
  126. score1 = 0.
  127. if score2 is None:
  128. score2 = 0.
  129. if score1 >= score2:
  130. delete_table_id_list.append(table2[3])
  131. else:
  132. delete_table_id_list.append(table1[3])
  133. delete_table_id_list = list(set(delete_table_id_list))
  134. new_table_list = []
  135. for table in table_list:
  136. if table[3] not in delete_table_id_list:
  137. new_table_list.append(table)
  138. return new_table_list
  139. def eval_table(_str):
  140. try:
  141. if _str[0] == '"':
  142. table_text = eval(_str)
  143. else:
  144. table_text = _str
  145. table_text = table_text.replace('\\', '/')
  146. table_text = eval(table_text)
  147. except:
  148. print("无法识别table_text")
  149. table_text = ""
  150. return table_text
  151. if __name__ == '__main__':
  152. _list = get_labeled_table()
  153. _list = remove_duplicate(_list)
  154. _str = json.dumps(str(_list))
  155. with open(r"C:\Users\Administrator\Desktop\table_simplify.txt", "w") as f:
  156. f.write(_str)
  157. # _str1 = "[['', '', 'Yes']]"
  158. # _str2 = "[['', '', 'Yes', '']]"
  159. # table1 = eval_table(_str1)
  160. # table2 = eval_table(_str2)
  161. #
  162. # print(table_distance(table1, table2))
  163. # with open(r"C:\Users\Administrator\Desktop\table_not_eval.txt", "r") as f:
  164. # not_eval_table_list = f.read()
  165. # print(not_eval_table_list)
  166. # not_eval_table_list = eval(not_eval_table_list)