# getAccRecall.py (11 KB)
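'''
Created on 2018-12-20
@author: User

Compares the predictions stored in predict_validation with the hand labels
stored in articles_validation and prints precision/recall style statistics
for codes/names, roles, money amounts and contact persons.
'''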
  5. import psycopg2
  6. import re
  7. from decimal import Decimal
  8. from BiddingKG.dl.common.Utils import getUnifyMoney
  9. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  10. cursor = conn.cursor()
  11. sql = "select A.prem,A.epc,B.prem,B.code,B.name,A.code,A.name,A.doc_id from articles_validation A,predict_validation B where A.doc_id=B.doc_id order by A.doc_id"
  12. cursor.execute(sql)
  13. rows = cursor.fetchall()
  14. def getPackages(prem):
  15. '''
  16. @summary: 取得所有的包名
  17. '''
  18. result = set()
  19. for item in prem:
  20. result.add(item[0])
  21. return result
  22. def getRole_id(role_name):
  23. '''
  24. @summary: 取得角色名对应的ID
  25. '''
  26. if role_name=="tenderee":
  27. return "0"
  28. if role_name=="agency":
  29. return "1"
  30. if role_name=="win_tenderer":
  31. return "2"
  32. if role_name=="second_tenderer":
  33. return "3"
  34. if role_name=="third_tenderer":
  35. return "4"
  36. CN_sum_predict = 0
  37. CN_sum_label = 0
  38. CN_predict_label = 0
  39. role_sum_predict = 0
  40. role_sum_label = 0
  41. role_predict_label = 0
  42. person_label = 0
  43. person_predict = 0
  44. person_label_predict = 0
  45. money_label = 0
  46. money_predict = 0
  47. money_label_predict = 0
  48. only_role_sum_predict = 0
  49. only_role_sum_label = 0
  50. only_role_predict_label = 0
  51. only_money_label = 0
  52. only_money_label_all = 0
  53. only_money_predict = 0
  54. only_money_label_predict = 0
  55. only_person_label = 0
  56. only_person_predict = 0
  57. only_person_label_predict = 0
  58. role_find = 0
  59. role_find_label = 0
  60. money_find = 0
  61. money_find_label = 0
  62. person_find = 0
  63. person_find_label = 0
  64. def getFind(doc_id,type):
  65. '''
  66. @summary: 取得一篇文章某种类型的实体的数量
  67. @param:
  68. doc_id:文章的uuid
  69. type:实体类型
  70. '''
  71. result = set()
  72. sql = "select distinct entity_text from predict_entity where doc_id='"+doc_id+"' and entity_type in "+type+" "
  73. cursor.execute(sql)
  74. rows = cursor.fetchall()
  75. for row in rows:
  76. if type.find("money")>0:
  77. result.add(Decimal(row[0]))
  78. else:
  79. result.add(row[0])
  80. return result
  81. for row in rows:
  82. doc_id = row[7]
  83. label_prem = row[0]
  84. label_epc = row[1]
  85. predict_prem = row[2]
  86. list_label_prem = []
  87. #标注值
  88. for item in re.split("[;;]",label_prem):
  89. list_label_prem.append(re.split("-",item))
  90. list_predict_prem = []
  91. #预测值
  92. for item in re.split("[;;]",predict_prem):
  93. list_predict_prem.append(re.split("\$",item))
  94. #统计编号和名称
  95. hand_code = row[5]
  96. hand_name = row[6]
  97. predict_code = row[3]
  98. predict_name = row[4]
  99. set_hand_code = set()
  100. set_predict_code = set()
  101. for item in re.split("[;;]",hand_code):
  102. if item!="":
  103. set_hand_code.add(item.strip())
  104. for item in re.split("[;;]",predict_code):
  105. if item!="":
  106. set_predict_code.add(item.strip())
  107. CN_sum_label += len(set_hand_code)
  108. CN_sum_predict += len(set_predict_code)
  109. CN_predict_label += len(set_hand_code&set_predict_code)
  110. if hand_name is not None and hand_code!="":
  111. CN_sum_label += 1
  112. if predict_name is not None and predict_name!="":
  113. CN_sum_predict += 1
  114. if hand_name is not None and hand_name!="" and hand_name.strip()==predict_name.strip():
  115. CN_predict_label += 1
  116. #统计角色
  117. set_packages = getPackages(list_label_prem)
  118. set_predict = set()
  119. set_label = set()
  120. set_only_predict = set()
  121. set_only_label = set()
  122. if len(set_packages)>1:
  123. for item in re.split("[;;]",label_prem):
  124. if len(item)>1:
  125. item_split = item.split("-")
  126. set_label.add((item_split[0],item_split[1],item_split[2].strip()))
  127. set_only_label.add(item_split[2].strip())
  128. for item in list_predict_prem:
  129. if len(item)>1:
  130. set_predict.add((item[0],getRole_id(item[2]),item[3].strip()))
  131. set_only_predict.add(item[3].strip())
  132. else:
  133. for item in re.split("[;;]",label_prem):
  134. if len(item)>1:
  135. item_split = item.split("-")
  136. set_label.add((item_split[1],item_split[2]))
  137. set_only_label.add(item_split[2].strip())
  138. for item in list_predict_prem:
  139. if len(item)>1:
  140. set_predict.add((getRole_id(item[2]),item[3].strip()))
  141. set_only_predict.add(item[3].strip())
  142. role_sum_predict += len(set_predict)
  143. role_sum_label += len(set_label)
  144. role_predict_label += len(set_predict&set_label)
  145. only_role_sum_label += len(set_only_label)
  146. only_role_sum_predict += len(set_only_predict)
  147. only_role_predict_label += len(set_only_label&set_only_predict)
  148. set_find_role = getFind(doc_id,"('org','company')")
  149. role_find += len(set_find_role)
  150. role_find_label += len(set_only_label&set_find_role)
  151. #统计金额
  152. set_money_label = set()
  153. set_money_predict = set()
  154. set_only_money_label = set()
  155. set_only_money_predict = set()
  156. count_multi = 0
  157. for item in list_label_prem:
  158. if len(item)>3:
  159. if item[3]!="":
  160. count_multi_temp = 0
  161. for i in item[3].strip().split("、"):
  162. label_money = getUnifyMoney(re.sub("[,,]",'',i))
  163. if label_money>0:
  164. count_multi_temp += 1
  165. set_money_label.add((item[2].strip(),label_money))
  166. set_only_money_label.add(label_money)
  167. if count_multi_temp>0:
  168. count_multi += count_multi_temp -1
  169. money_label+=(len(set_money_label)-count_multi)
  170. only_money_label += len(set_only_money_label)-count_multi
  171. only_money_label_all += len(set_only_money_label)
  172. for item in list_predict_prem:
  173. if len(item)>2:
  174. if str(item[4])!="0":
  175. set_money_predict.add((item[3].strip(),Decimal(item[5])))
  176. set_only_money_predict.add(Decimal(item[5]))
  177. money_predict+=len(set_money_predict)
  178. money_label_predict+= len(set_money_label&set_money_predict)
  179. only_money_predict+=len(set_only_money_predict)
  180. only_money_label_predict+= len(set_only_money_label&set_only_money_predict)
  181. set_money_find = getFind(doc_id,"('money')")
  182. money_find += len(set_money_find)
  183. money_find_label += len(set_money_find&set_only_money_label)
  184. '''
  185. if len(set_money_label)-count_multi-len(set_money_label&set_money_predict)>=2:
  186. print(doc_id)
  187. print(set_money_label)
  188. print(set_money_predict)
  189. '''
  190. #统计联系人
  191. role_set = set()
  192. for item in list_label_prem:
  193. if len(item)>1:
  194. role_set.add(item[2])
  195. list_epc_label = []
  196. for item1 in re.split("[;;]",label_epc):
  197. item = re.split(",",item1)
  198. if len(item)>1:
  199. if item[0].strip() in role_set:
  200. list_epc_label.append([item[0].strip(),re.split("[/、]",item[1])])
  201. set_person_label = set()
  202. set_only_person_label = set()
  203. for item in list_epc_label:
  204. if len(item[1])>0 and item[1][0]!="":
  205. for i in item[1]:
  206. if i!="":
  207. set_person_label.add(item[0].strip()+i.strip())
  208. set_only_person_label.add(i.strip())
  209. person_label += len(set_person_label)
  210. only_person_label += len(set_only_person_label)
  211. set_person_predict = set()
  212. set_only_person_predict = set()
  213. for item1 in list_predict_prem:
  214. if len(item1)>1:
  215. if item1[7]!="":
  216. item = item1[7].split(",")
  217. for i in item:
  218. if i!="":
  219. set_person_predict.add(item1[3].strip()+i.split("/")[0].strip())
  220. set_only_person_predict.add(i.split("/")[0].strip())
  221. person_predict += len(set_person_predict)
  222. person_label_predict += len(set_person_label&set_person_predict)
  223. only_person_predict += len(set_only_person_predict)
  224. only_person_label_predict += len(set_only_person_label&set_only_person_predict)
  225. set_person_find = getFind(doc_id,"('person')")
  226. person_find += len(set_person_find)
  227. person_find_label += len(set_person_find&set_only_person_label)
  228. ''''''
  229. if abs(len(set_only_person_label)-len(set_only_person_predict))>=2:
  230. print(doc_id)
  231. print(set_person_label)
  232. print(set_person_predict)
  233. print("LinkTrueBase")
  234. print("编号名称:CN_sum_predict:%d,CN_sum_label:%d,CN_predict_label:%d,accurency:%f,recall:%f"%(CN_sum_predict,CN_sum_label,CN_predict_label,CN_predict_label/CN_sum_predict,CN_predict_label/CN_sum_label))
  235. print("角色:role_sum_predict:%d,role_sum_label:%d,role_predict_label:%d,accurency:%f,recall:%f"%(role_sum_predict,role_sum_label,role_predict_label,role_predict_label/role_sum_predict,role_predict_label/role_sum_label))
  236. print("金额:money_predict:%d,money_label:%d,money_label_predict:%d,acc:%f,recall:%f"%(money_predict,money_label,money_label_predict,money_label_predict/money_predict,money_label_predict/money_label))
  237. print("联系人:person_label:%d,person_predict:%d,person_label_predict:%d,acc:%f,recall:%f"%(person_label,person_predict,person_label_predict,person_label_predict/person_predict,person_label_predict/person_label))
  238. print("UserableBase")
  239. print("角色:only_role_sum_predict:%d,only_role_sum_label:%d,only_role_predict_label:%d,accurency:%f,recall:%f"%(only_role_sum_predict,only_role_sum_label,only_role_predict_label,only_role_predict_label/only_role_sum_predict,only_role_predict_label/only_role_sum_label))
  240. print("金额:only_money_predict:%d,only_money_label:%d,only_money_label_predict:%d,acc:%f,recall:%f"%(only_money_predict,only_money_label,only_money_label_predict,only_money_label_predict/only_money_predict,only_money_label_predict/only_money_label))
  241. print("联系人:only_person_label:%d,only_person_predict:%d,only_person_label_predict:%d,acc:%f,recall:%f"%(only_person_label,only_person_predict,only_person_label_predict,only_person_label_predict/only_person_predict,only_person_label_predict/only_person_label))
  242. ''''''
  243. print("ExtractBase")
  244. print("角色:role_find:%d,only_role_sum_label:%d,role_find_label:%d,accurency:%f,recall:%f"%(role_find,only_role_sum_label,role_find_label,role_find_label/role_find,role_find_label/only_role_sum_label))
  245. print("金额:money_find:%d,only_money_label:%d,money_find_label:%d,acc:%f,recall:%f"%(money_find,only_money_label_all,money_find_label,money_find_label/money_find,money_find_label/only_money_label_all))
  246. print("联系人:person_find:%d,only_person_predict:%d,person_find_label:%d,acc:%f,recall:%f"%(person_find,only_person_label,person_find_label,person_find_label/person_find,person_find_label/only_person_label))