123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276 |
- '''
- Created on 2018年12月20日
- @author: User
- '''
- import psycopg2
- import re
- from decimal import Decimal
- from BiddingKG.dl.common.Utils import getUnifyMoney
# Open the validation database and load, for every doc_id present in both
# tables, the hand-labelled row (A) next to the predicted row (B).
conn = psycopg2.connect(
    dbname="BiddingKG",
    user="postgres",
    password="postgres",
    host="192.168.2.101",
)
cursor = conn.cursor()
# Column order matters: the evaluation loop further down reads the result
# rows by position (0..7).
sql = "select A.prem,A.epc,B.prem,B.code,B.name,A.code,A.name,A.doc_id from articles_validation A,predict_validation B where A.doc_id=B.doc_id order by A.doc_id"
cursor.execute(sql)
rows = cursor.fetchall()
def getPackages(prem):
    """Collect all package names.

    @param prem: iterable of indexable records; element 0 of each record is
        taken as its package name
    @return: set containing the first element of every record
    """
    return {entry[0] for entry in prem}
def getRole_id(role_name):
    """Map a role name to its ID string.

    @param role_name: one of "tenderee", "agency", "win_tenderer",
        "second_tenderer", "third_tenderer"
    @return: the ID as a string ("0".."4"), or None for any other name
    """
    role_ids = {
        "tenderee": "0",
        "agency": "1",
        "win_tenderer": "2",
        "second_tenderer": "3",
        "third_tenderer": "4",
    }
    return role_ids.get(role_name)
-
# Running totals accumulated over all rows by the evaluation loop.
# Naming: *_predict = predicted items, *_label = hand-labelled items,
# *_predict_label / *_label_predict = items present in both.

# Project code and name.
CN_sum_predict = CN_sum_label = CN_predict_label = 0

# Role / money / contact person, compared together with the entity they
# are linked to.
role_sum_predict = role_sum_label = role_predict_label = 0
person_label = person_predict = person_label_predict = 0
money_label = money_predict = money_label_predict = 0

# Same three categories, compared on the bare value only.
only_role_sum_predict = only_role_sum_label = only_role_predict_label = 0
only_money_label = only_money_label_all = 0
only_money_predict = only_money_label_predict = 0
only_person_label = only_person_predict = only_person_label_predict = 0

# Entities returned by getFind() and how many of them are also labelled.
role_find = role_find_label = 0
money_find = money_find_label = 0
person_find = person_find_label = 0
def getFind(doc_id, type):
    """Return the distinct entity texts stored for one article and entity type(s).

    @param doc_id: uuid of the article
    @param type: SQL tuple literal naming the entity types, e.g.
        "('org','company')". It is spliced into the query text unchanged,
        so only pass trusted constants; because the query is executed with
        bound parameters, a literal '%' inside it would have to be
        written as '%%'.
    @return: set of entity_text values from predict_entity; each value is
        converted to Decimal when ``type`` mentions "money", otherwise it
        is kept as returned by the database
    """
    # doc_id is bound as a parameter rather than concatenated, so a quote
    # character in it can neither break nor alter the statement.
    sql = (
        "select distinct entity_text from predict_entity "
        "where doc_id=%s and entity_type in " + type
    )
    cursor.execute(sql, (doc_id,))

    # A substring test also matches "money" at index 0, which the former
    # `type.find("money") > 0` check silently missed.
    is_money = "money" in type

    result = set()
    for record in cursor.fetchall():
        result.add(Decimal(record[0]) if is_money else record[0])
    return result
# Compare the hand-labelled data of every article with the prediction and
# add the per-article counts to the module-level totals.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation was lost - confirm against the original before merging.
# NOTE(review): the character classes "[;;]" and "[,,]" list the same
# character twice; presumably one of each pair was a full-width variant -
# confirm against the original file.
for row in rows:
    doc_id = row[7]
    label_prem = row[0]
    label_epc = row[1]
    predict_prem = row[2]

    # Labelled records: ';'-separated, fields split on '-'.
    # Fields as used below: [0] package, [1] role id, [2] entity, [3] money.
    list_label_prem = [re.split("-", item) for item in re.split("[;;]", label_prem)]
    # Predicted records: ';'-separated, fields split on '$'.
    # Fields as used below: [0] package, [2] role name, [3] entity,
    # [4] skipped for money when "0", [5] money, [7] contact persons.
    # The pattern is a raw string: "\$" in a normal literal is an invalid escape.
    list_predict_prem = [re.split(r"\$", item) for item in re.split("[;;]", predict_prem)]

    # ---- project code and name ----
    hand_code = row[5]
    hand_name = row[6]
    predict_code = row[3]
    predict_name = row[4]
    set_hand_code = {item.strip() for item in re.split("[;;]", hand_code) if item != ""}
    set_predict_code = {item.strip() for item in re.split("[;;]", predict_code) if item != ""}

    CN_sum_label += len(set_hand_code)
    CN_sum_predict += len(set_predict_code)
    CN_predict_label += len(set_hand_code & set_predict_code)

    # The labelled name counts only when the *name* is non-empty (this
    # condition used to test hand_code by mistake).
    has_hand_name = hand_name is not None and hand_name != ""
    has_predict_name = predict_name is not None and predict_name != ""
    if has_hand_name:
        CN_sum_label += 1
    if has_predict_name:
        CN_sum_predict += 1
    # predict_name may be None (see the check above), so guard before strip().
    if has_hand_name and predict_name is not None and hand_name.strip() == predict_name.strip():
        CN_predict_label += 1

    # ---- roles ----
    set_packages = getPackages(list_label_prem)
    set_predict = set()
    set_label = set()
    set_only_predict = set()
    set_only_label = set()
    # With more than one labelled package the package is part of the
    # comparison key; otherwise only (role id, entity) is compared.
    multi_package = len(set_packages) > 1

    for item in re.split("[;;]", label_prem):
        if len(item) > 1:
            item_split = item.split("-")
            entity = item_split[2].strip()
            if multi_package:
                set_label.add((item_split[0], item_split[1], entity))
            else:
                # Stripped like everywhere else so that padded labels can
                # match the (stripped) prediction.
                set_label.add((item_split[1], entity))
            set_only_label.add(entity)
    for item in list_predict_prem:
        if len(item) > 1:
            role_id = getRole_id(item[2])
            entity = item[3].strip()
            if multi_package:
                set_predict.add((item[0], role_id, entity))
            else:
                set_predict.add((role_id, entity))
            set_only_predict.add(entity)

    role_sum_predict += len(set_predict)
    role_sum_label += len(set_label)
    role_predict_label += len(set_predict & set_label)
    only_role_sum_label += len(set_only_label)
    only_role_sum_predict += len(set_only_predict)
    only_role_predict_label += len(set_only_label & set_only_predict)

    set_find_role = getFind(doc_id, "('org','company')")
    role_find += len(set_find_role)
    role_find_label += len(set_only_label & set_find_role)

    # ---- money ----
    set_money_label = set()
    set_money_predict = set()
    set_only_money_label = set()
    set_only_money_predict = set()
    # Number of extra amounts in records that list several amounts
    # separated by '、'; subtracted from the label totals below.
    count_multi = 0
    for item in list_label_prem:
        if len(item) > 3 and item[3] != "":
            count_multi_temp = 0
            for i in item[3].strip().split("、"):
                label_money = getUnifyMoney(re.sub("[,,]", '', i))
                if label_money > 0:
                    count_multi_temp += 1
                    set_money_label.add((item[2].strip(), label_money))
                    set_only_money_label.add(label_money)
            if count_multi_temp > 0:
                count_multi += count_multi_temp - 1
    money_label += (len(set_money_label) - count_multi)
    only_money_label += len(set_only_money_label) - count_multi
    only_money_label_all += len(set_only_money_label)

    for item in list_predict_prem:
        if len(item) > 2:
            if str(item[4]) != "0":
                predict_money = Decimal(item[5])
                set_money_predict.add((item[3].strip(), predict_money))
                set_only_money_predict.add(predict_money)
    money_predict += len(set_money_predict)
    money_label_predict += len(set_money_label & set_money_predict)
    only_money_predict += len(set_only_money_predict)
    only_money_label_predict += len(set_only_money_label & set_only_money_predict)

    set_money_find = getFind(doc_id, "('money')")
    money_find += len(set_money_find)
    money_find_label += len(set_money_find & set_only_money_label)

    # ---- contact persons ----
    # Only contacts whose entity also appears in the labelled prem are counted.
    role_set = {item[2] for item in list_label_prem if len(item) > 1}
    list_epc_label = []
    for item1 in re.split("[;;]", label_epc):
        item = re.split(",", item1)
        if len(item) > 1:
            if item[0].strip() in role_set:
                list_epc_label.append([item[0].strip(), re.split("[/、]", item[1])])

    set_person_label = set()
    set_only_person_label = set()
    for item in list_epc_label:
        if len(item[1]) > 0 and item[1][0] != "":
            for i in item[1]:
                if i != "":
                    set_person_label.add(item[0].strip() + i.strip())
                    set_only_person_label.add(i.strip())
    person_label += len(set_person_label)
    only_person_label += len(set_only_person_label)

    set_person_predict = set()
    set_only_person_predict = set()
    for item1 in list_predict_prem:
        if len(item1) > 1:
            if item1[7] != "":
                for i in item1[7].split(","):
                    if i != "":
                        # Only the part before the first '/' is compared.
                        person = i.split("/")[0].strip()
                        set_person_predict.add(item1[3].strip() + person)
                        set_only_person_predict.add(person)
    person_predict += len(set_person_predict)
    person_label_predict += len(set_person_label & set_person_predict)
    only_person_predict += len(set_only_person_predict)
    only_person_label_predict += len(set_only_person_label & set_only_person_predict)

    set_person_find = getFind(doc_id, "('person')")
    person_find += len(set_person_find)
    person_find_label += len(set_person_find & set_only_person_label)

    # Print articles whose labelled and predicted person counts differ by
    # two or more, for manual inspection.
    if abs(len(set_only_person_label) - len(set_only_person_predict)) >= 2:
        print(doc_id)
        print(set_person_label)
        print(set_person_predict)
-
def _ratio(hit, total):
    """Return hit / total, or 0.0 when total is zero.

    Keeps the report from aborting with ZeroDivisionError when a category
    has no predicted or no labelled items (for example an empty result set).
    """
    return hit / total if total else 0.0


# Final report. In every line the first ratio is matches / predicted (or
# found) items and the second is matches / labelled items.

# Values compared together with the entity they are linked to.
print("LinkTrueBase")
print("编号名称:CN_sum_predict:%d,CN_sum_label:%d,CN_predict_label:%d,accurency:%f,recall:%f"
      % (CN_sum_predict, CN_sum_label, CN_predict_label,
         _ratio(CN_predict_label, CN_sum_predict), _ratio(CN_predict_label, CN_sum_label)))
print("角色:role_sum_predict:%d,role_sum_label:%d,role_predict_label:%d,accurency:%f,recall:%f"
      % (role_sum_predict, role_sum_label, role_predict_label,
         _ratio(role_predict_label, role_sum_predict), _ratio(role_predict_label, role_sum_label)))
print("金额:money_predict:%d,money_label:%d,money_label_predict:%d,acc:%f,recall:%f"
      % (money_predict, money_label, money_label_predict,
         _ratio(money_label_predict, money_predict), _ratio(money_label_predict, money_label)))
print("联系人:person_label:%d,person_predict:%d,person_label_predict:%d,acc:%f,recall:%f"
      % (person_label, person_predict, person_label_predict,
         _ratio(person_label_predict, person_predict), _ratio(person_label_predict, person_label)))

# Bare values only, ignoring which entity they are linked to.
print("UserableBase")
print("角色:only_role_sum_predict:%d,only_role_sum_label:%d,only_role_predict_label:%d,accurency:%f,recall:%f"
      % (only_role_sum_predict, only_role_sum_label, only_role_predict_label,
         _ratio(only_role_predict_label, only_role_sum_predict),
         _ratio(only_role_predict_label, only_role_sum_label)))
print("金额:only_money_predict:%d,only_money_label:%d,only_money_label_predict:%d,acc:%f,recall:%f"
      % (only_money_predict, only_money_label, only_money_label_predict,
         _ratio(only_money_label_predict, only_money_predict),
         _ratio(only_money_label_predict, only_money_label)))
print("联系人:only_person_label:%d,only_person_predict:%d,only_person_label_predict:%d,acc:%f,recall:%f"
      % (only_person_label, only_person_predict, only_person_label_predict,
         _ratio(only_person_label_predict, only_person_predict),
         _ratio(only_person_label_predict, only_person_label)))

# Entities returned by getFind() versus the labelled values.
print("ExtractBase")
print("角色:role_find:%d,only_role_sum_label:%d,role_find_label:%d,accurency:%f,recall:%f"
      % (role_find, only_role_sum_label, role_find_label,
         _ratio(role_find_label, role_find), _ratio(role_find_label, only_role_sum_label)))
# The printed label now names the counter that is actually shown
# (only_money_label_all, not only_money_label).
print("金额:money_find:%d,only_money_label_all:%d,money_find_label:%d,acc:%f,recall:%f"
      % (money_find, only_money_label_all, money_find_label,
         _ratio(money_find_label, money_find), _ratio(money_find_label, only_money_label_all)))
# The printed label now names the counter that is actually shown
# (only_person_label, not only_person_predict).
print("联系人:person_find:%d,only_person_label:%d,person_find_label:%d,acc:%f,recall:%f"
      % (person_find, only_person_label, person_find_label,
         _ratio(person_find_label, person_find), _ratio(person_find_label, only_person_label)))
|