from django.db.models import Q import datetime,time import iepy iepy.setup(__file__) from iepy.data.db import DocumentManager from iepy.data.models import IEDocument,LabeledIEDocument,IEDocumentMetadata,LabeledIEDocumentMetadata,Payroll from brat.models import BratAnnotation,LabeledBratAnnotation from django.db import transaction import pandas as pd from django.contrib.auth.models import User def object_to_dict(obj,class_model): ''' :param obj:对象 :param class_model:django model :return: 由对象生成的键值对 ''' _dict = {} concrete_model = class_model._meta.concrete_model for field in concrete_model._meta.local_fields: value = field.value_from_object(obj) _dict[field.name] = value return _dict class Settlement(): ''' @summary: 结算类,定义了结算者所需要执行的各种方法 ''' def makePayroll(self,list_user,begin_time,time_end): ''' :param _user: 用户名 :param time_begin: 起始时间 :param time_end: 截至时间 :return:根据用户,时间段生成用户的标注情况 ''' from django.db import connection with transaction.atomic(): cursor = connection.cursor() time_begin = "2020-08-01" for _user in list(set(list_user)): sql = 'select max(end_time) from corpus_payroll where "user"=\'%s\''%(_user) cursor.execute(sql) rows = cursor.fetchall() if rows[0][0] is not None: time_begin = rows[0][0] else: time_begin = "2020-08-01" sql = " select count(1) from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s'"%(_user,time_end,time_begin) cursor.execute(sql) doc_count = cursor.fetchall()[0][0] sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"T%") cursor.execute(sql) t_count = cursor.fetchall()[0][0] sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"R%") cursor.execute(sql) r_count = cursor.fetchall()[0][0] sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s') and value like '%s' "%(_user,time_end,time_begin,"T%") cursor.execute(sql) all_t_count = cursor.fetchall()[0][0] sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s') and value like '%s' "%(_user,time_end,time_begin,"R%") cursor.execute(sql) all_r_count = cursor.fetchall()[0][0] wage = round(0.03*t_count+0.05*r_count+(all_t_count-t_count)*0.04+(all_r_count-r_count)*0.06,2) print(doc_count,t_count,r_count,wage) payrolls = Payroll.objects.filter(Q(user=_user)& Q(begin_time=time_begin) & Q(end_time=time_end)) if len(payrolls)==0: _payroll = Payroll.objects.create(**{"user":_user,"doc_count":doc_count,"begin_time":time_begin,"end_time":time_end,"t_count":all_t_count,"r_count":all_r_count,"wage":wage}) _payroll.save() else: _payroll = payrolls[0] _payroll.doc_count = doc_count _payroll.t_count = all_t_count _payroll.r_count = all_r_count _payroll.wage = wage _payroll.save() def exportPayroll(self,begin_time,end_time): ''' :param begin_time: 导出开始时间 :param end_time: 导出结束时间 :return: ''' list_user = [] list_doc_count = [] list_t_count = [] list_r_count = [] list_wage = [] list_yield = [] list_account = [] list_begin = [] list_end = [] if begin_time is not None: payrolls = Payroll.objects.filter(Q(begin_time=begin_time) & Q(end_time=end_time)) else: payrolls = Payroll.objects.filter(Q(end_time=end_time)) for _payroll in payrolls: list_user.append(_payroll.user) list_doc_count.append(_payroll.doc_count) list_t_count.append(_payroll.t_count) list_r_count.append(_payroll.r_count) list_wage.append(_payroll.wage) list_yield.append(_payroll._yield) list_account.append(_payroll.account) list_begin.append(_payroll.begin_time) list_end.append(_payroll.end_time) df = pd.DataFrame({"用户":list_user,"开始时间":list_begin,"结束时间":list_end,"文章数":list_doc_count,"要素数":list_t_count,"关系数":list_r_count,"总价":list_wage,"合格率":list_yield,"结算价":list_account}) df.to_excel("%s-%s要素标注统计.xls"%(begin_time,end_time),columns=["用户","开始时间","结束时间","文章数","要素数","关系数","总价","合格率","结算价"]) def getAllUser(self): from django.db import connection with transaction.atomic(): list_user = [] cursor = connection.cursor() sql = "select username from auth_user where is_staff='t'" cursor.execute(sql) for row in cursor.fetchall(): list_user.append(row[0]) return list_user def makeMigrate(self,_user,time_begin,time_end): ''' :param _user: 用户名 :param time_begin: 起始时间 :param time_end: 截至时间 :return: 将用户在时间段内的数据迁移到标准表中 ''' pass # from django.db import connection # with transaction.atomic(): # cursor = connection.cursor() # sql = " select human_identifier,offsets_to_text,sentences from corpus_iedocument where edituser is null" # cursor.execute(sql) # cursor1 = connection.cursor() # _index = 0 # rows = True # while(rows): # rows=cursor.fetchmany(1000) # for row in rows: # _index += 1 # print(_index) # human_identifier,offsets_to_text,sentences = row # if sentences!="[]": # _off = offsets_to_text.split(", ")[-1][:-1] # _sen = sentences.split(", ")[-1][:-1] # print(_off,_sen) # if int(_off)!=int(_sen): # offsets_to_text = offsets_to_text[:-1]+", "+str(int(_sen))+"]" # print(offsets_to_text) # cursor1.execute("update corpus_iedocument set offsets_to_text='%s' where human_identifier='%s'"%(offsets_to_text,human_identifier)) # ieDocuments = IEDocument.objects.filter(Q(edituser=_user) & Q(edittime__range=(time_begin,time_end))) # for obj in ieDocuments: # _dict = object_to_dict(obj,IEDocument) # _dict_meta = object_to_dict(obj.metadata,IEDocumentMetadata) # labeledMeta = LabeledIEDocumentMetadata.objects.create(**_dict_meta) # labeledMeta.save() # _dict["metadata"] = labeledMeta # tmp = LabeledIEDocument.objects.create(**_dict) # tmp.save() # # bratAnnotations = BratAnnotation.objects.filter(Q(document_id=obj.human_identifier)) # for ann in bratAnnotations: # _dict_ann = object_to_dict(ann,BratAnnotation) # labeledAnn = LabeledBratAnnotation.objects.create(**_dict_ann) # labeledAnn.save() def getPercentOfPass(self,_user,time_begin,time_end): ''' :param _user:用户名 :param time_begin: 起始时间 :param time_end: 截至时间 :return: 获得用户在时间段内标注数据的合格率 ''' def makePayrolls(self,time_begin,time_end): ''' :param time_begin:起始时间 :param time_end: 截至时间 :return: 获得所有用户的工资表 ''' for _user in self.getAllUser(): self.makePayroll(_user,time_begin,time_end) self.exportPayroll(time_begin,time_end) def createUser_batch(self,batch_size=90): ''' :param batch_size: 用户个数 :return: ''' list_user = [User.objects.create_user(username="bidi%d"%(i+1),password="bidi%d"%(i+1)) for i in range(batch_size)] def exportLabels(self): groups = [[1,7],[8,14],[15,22],[23,29],[30,36],[37,43],[44,50],[51,56],[57,62],[63,71]] from django.db import connection cursor = connection.cursor() for _i in range(len(groups)): _begin,_end = groups[_i] list_username = [] list_user = [] list_label = [] list_time = [] for _j in range(_begin,_end+1): username = "bidi%d"%_j list_username.append("'%s'"%username) sql = " select edituser,human_identifier,to_char(edittime,'yyyy-mm-dd') from corpus_iedocument where edituser in(%s) order by edittime asc"%(",".join(list_username)) print(sql) cursor.execute(sql) rows = cursor.fetchall() for row in rows: list_user.append(row[0]) list_label.append(row[1]) list_time.append(row[2]) df = pd.DataFrame({"时间":list_time,"用户":list_user,"文章编号":list_label}) df.to_excel("分组_%d.xls"%(_i+1),columns=["时间","用户","文章编号"]) def filter(self): ''' 过滤拍卖公告 :return: ''' import re ieDocuments = IEDocument.objects.all() for obj in ieDocuments: if re.search("拍卖",obj.text) is not None: obj.jump_signal = 1 obj.save() print(obj.human_identifier) if __name__=="__main__": settle = Settlement() # settle.makeMigrate("test","2020-08-01","2020-08-31") # settle.makePayroll(["test3","test19","test22","test2","test9","test11","test12","test1","test7","test21","test17"],"2020-08-01","2020-12-25") # settle.exportPayroll(begin_time=None,end_time='2020-12-25') # settle.createUser_batch(batch_size=102) # settle.exportLabels() # settle.filter() from brat.models import BratAnnotation as brat_annotations from iepy.webui.corpus.models import IEDocument filename = '74800260' pre_label = IEDocument.objects.filter(human_identifier='74800260').values("pre_label") _label = "" if len(pre_label)>0: if pre_label[0]["pre_label"] is not None: _label = pre_label[0]["pre_label"] dict_T = dict() dict_R = dict() set_pre_label = set() for _i in _label.split(";"): if _i!="": set_pre_label.add(_i) set_label = set() anns = brat_annotations.objects.filter(document_id='74800260').values("value") for _str in anns: _str = _str["value"].strip() if _str != "": if _str[0]=="T": match = re.search(T_pattern,_str) if match is not None: match = match.groupdict() dict_T[match["T"]] = {"type":match["type"],"begin":match["begin"],"end":match["end"]} if _str[0]=="R": match = re.search(R_pattern,_str) if match is not None: match = match.groupdict() dict_R[match["R"]] = {"type":match["type"],"arg1":match["arg1"],"arg2":match["arg2"]} for _T,_v in dict_T.items(): set_label.add("T|%s|%d|%d"%(_v["type"],int(_v["begin"]),int(_v["end"]))) for _R,_v in dict_R.items(): set_label.add("R|%s|%d|%d|%d|%d"%(_v["type"],int(dict_T[_v["arg1"]]["begin"]),int(dict_T[_v["arg1"]]["end"]),int(dict_T[_v["arg2"]]["begin"]),int(dict_T[_v["arg2"]]["end"]))) union_set = set_pre_label&set_label deleted = len(set_pre_label)-len(union_set) added = len(set_label)-len(union_set) print(deleted,added)