settlement.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. from django.db.models import Q
  2. import datetime,time
  3. import iepy
  4. iepy.setup(__file__)
  5. from iepy.data.db import DocumentManager
  6. from iepy.data.models import IEDocument,LabeledIEDocument,IEDocumentMetadata,LabeledIEDocumentMetadata,Payroll
  7. from brat.models import BratAnnotation,LabeledBratAnnotation
  8. from django.db import transaction
  9. import pandas as pd
  10. from django.contrib.auth.models import User
  11. def object_to_dict(obj,class_model):
  12. '''
  13. :param obj:对象
  14. :param class_model:django model
  15. :return: 由对象生成的键值对
  16. '''
  17. _dict = {}
  18. concrete_model = class_model._meta.concrete_model
  19. for field in concrete_model._meta.local_fields:
  20. value = field.value_from_object(obj)
  21. _dict[field.name] = value
  22. return _dict
  23. class Settlement():
  24. '''
  25. @summary: 结算类,定义了结算者所需要执行的各种方法
  26. '''
  27. def makePayroll(self,list_user,begin_time,time_end):
  28. '''
  29. :param _user: 用户名
  30. :param time_begin: 起始时间
  31. :param time_end: 截至时间
  32. :return:根据用户,时间段生成用户的标注情况
  33. '''
  34. from django.db import connection
  35. with transaction.atomic():
  36. cursor = connection.cursor()
  37. time_begin = "2020-08-01"
  38. for _user in list(set(list_user)):
  39. sql = 'select max(end_time) from corpus_payroll where "user"=\'%s\''%(_user)
  40. cursor.execute(sql)
  41. rows = cursor.fetchall()
  42. if rows[0][0] is not None:
  43. time_begin = rows[0][0]
  44. else:
  45. time_begin = "2020-08-01"
  46. sql = " select count(1) from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s'"%(_user,time_end,time_begin)
  47. cursor.execute(sql)
  48. doc_count = cursor.fetchall()[0][0]
  49. sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"T%")
  50. cursor.execute(sql)
  51. t_count = cursor.fetchall()[0][0]
  52. sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"R%")
  53. cursor.execute(sql)
  54. r_count = cursor.fetchall()[0][0]
  55. sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s') and value like '%s' "%(_user,time_end,time_begin,"T%")
  56. cursor.execute(sql)
  57. all_t_count = cursor.fetchall()[0][0]
  58. sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s') and value like '%s' "%(_user,time_end,time_begin,"R%")
  59. cursor.execute(sql)
  60. all_r_count = cursor.fetchall()[0][0]
  61. wage = round(0.03*t_count+0.05*r_count+(all_t_count-t_count)*0.04+(all_r_count-r_count)*0.06,2)
  62. print(doc_count,t_count,r_count,wage)
  63. payrolls = Payroll.objects.filter(Q(user=_user)& Q(begin_time=time_begin) & Q(end_time=time_end))
  64. if len(payrolls)==0:
  65. _payroll = Payroll.objects.create(**{"user":_user,"doc_count":doc_count,"begin_time":time_begin,"end_time":time_end,"t_count":all_t_count,"r_count":all_r_count,"wage":wage})
  66. _payroll.save()
  67. else:
  68. _payroll = payrolls[0]
  69. _payroll.doc_count = doc_count
  70. _payroll.t_count = all_t_count
  71. _payroll.r_count = all_r_count
  72. _payroll.wage = wage
  73. _payroll.save()
  74. def exportPayroll(self,begin_time,end_time):
  75. '''
  76. :param begin_time: 导出开始时间
  77. :param end_time: 导出结束时间
  78. :return:
  79. '''
  80. list_user = []
  81. list_doc_count = []
  82. list_t_count = []
  83. list_r_count = []
  84. list_wage = []
  85. list_yield = []
  86. list_account = []
  87. list_begin = []
  88. list_end = []
  89. if begin_time is not None:
  90. payrolls = Payroll.objects.filter(Q(begin_time=begin_time) & Q(end_time=end_time))
  91. else:
  92. payrolls = Payroll.objects.filter(Q(end_time=end_time))
  93. for _payroll in payrolls:
  94. list_user.append(_payroll.user)
  95. list_doc_count.append(_payroll.doc_count)
  96. list_t_count.append(_payroll.t_count)
  97. list_r_count.append(_payroll.r_count)
  98. list_wage.append(_payroll.wage)
  99. list_yield.append(_payroll._yield)
  100. list_account.append(_payroll.account)
  101. list_begin.append(_payroll.begin_time)
  102. list_end.append(_payroll.end_time)
  103. df = pd.DataFrame({"用户":list_user,"开始时间":list_begin,"结束时间":list_end,"文章数":list_doc_count,"要素数":list_t_count,"关系数":list_r_count,"总价":list_wage,"合格率":list_yield,"结算价":list_account})
  104. df.to_excel("%s-%s要素标注统计.xls"%(begin_time,end_time),columns=["用户","开始时间","结束时间","文章数","要素数","关系数","总价","合格率","结算价"])
  105. def getAllUser(self):
  106. from django.db import connection
  107. with transaction.atomic():
  108. list_user = []
  109. cursor = connection.cursor()
  110. sql = "select username from auth_user where is_staff='t'"
  111. cursor.execute(sql)
  112. for row in cursor.fetchall():
  113. list_user.append(row[0])
  114. return list_user
  115. def makeMigrate(self,_user,time_begin,time_end):
  116. '''
  117. :param _user: 用户名
  118. :param time_begin: 起始时间
  119. :param time_end: 截至时间
  120. :return: 将用户在时间段内的数据迁移到标准表中
  121. '''
  122. pass
  123. # from django.db import connection
  124. # with transaction.atomic():
  125. # cursor = connection.cursor()
  126. # sql = " select human_identifier,offsets_to_text,sentences from corpus_iedocument where edituser is null"
  127. # cursor.execute(sql)
  128. # cursor1 = connection.cursor()
  129. # _index = 0
  130. # rows = True
  131. # while(rows):
  132. # rows=cursor.fetchmany(1000)
  133. # for row in rows:
  134. # _index += 1
  135. # print(_index)
  136. # human_identifier,offsets_to_text,sentences = row
  137. # if sentences!="[]":
  138. # _off = offsets_to_text.split(", ")[-1][:-1]
  139. # _sen = sentences.split(", ")[-1][:-1]
  140. # print(_off,_sen)
  141. # if int(_off)!=int(_sen):
  142. # offsets_to_text = offsets_to_text[:-1]+", "+str(int(_sen))+"]"
  143. # print(offsets_to_text)
  144. # cursor1.execute("update corpus_iedocument set offsets_to_text='%s' where human_identifier='%s'"%(offsets_to_text,human_identifier))
  145. # ieDocuments = IEDocument.objects.filter(Q(edituser=_user) & Q(edittime__range=(time_begin,time_end)))
  146. # for obj in ieDocuments:
  147. # _dict = object_to_dict(obj,IEDocument)
  148. # _dict_meta = object_to_dict(obj.metadata,IEDocumentMetadata)
  149. # labeledMeta = LabeledIEDocumentMetadata.objects.create(**_dict_meta)
  150. # labeledMeta.save()
  151. # _dict["metadata"] = labeledMeta
  152. # tmp = LabeledIEDocument.objects.create(**_dict)
  153. # tmp.save()
  154. #
  155. # bratAnnotations = BratAnnotation.objects.filter(Q(document_id=obj.human_identifier))
  156. # for ann in bratAnnotations:
  157. # _dict_ann = object_to_dict(ann,BratAnnotation)
  158. # labeledAnn = LabeledBratAnnotation.objects.create(**_dict_ann)
  159. # labeledAnn.save()
  160. def getPercentOfPass(self,_user,time_begin,time_end):
  161. '''
  162. :param _user:用户名
  163. :param time_begin: 起始时间
  164. :param time_end: 截至时间
  165. :return: 获得用户在时间段内标注数据的合格率
  166. '''
  167. def makePayrolls(self,time_begin,time_end):
  168. '''
  169. :param time_begin:起始时间
  170. :param time_end: 截至时间
  171. :return: 获得所有用户的工资表
  172. '''
  173. for _user in self.getAllUser():
  174. self.makePayroll(_user,time_begin,time_end)
  175. self.exportPayroll(time_begin,time_end)
  176. def createUser_batch(self,batch_size=90):
  177. '''
  178. :param batch_size: 用户个数
  179. :return:
  180. '''
  181. list_user = [User.objects.create_user(username="bidi%d"%(i+1),password="bidi%d"%(i+1)) for i in range(batch_size)]
  182. def exportLabels(self):
  183. groups = [[1,7],[8,14],[15,22],[23,29],[30,36],[37,43],[44,50],[51,56],[57,62],[63,71]]
  184. from django.db import connection
  185. cursor = connection.cursor()
  186. for _i in range(len(groups)):
  187. _begin,_end = groups[_i]
  188. list_username = []
  189. list_user = []
  190. list_label = []
  191. list_time = []
  192. for _j in range(_begin,_end+1):
  193. username = "bidi%d"%_j
  194. list_username.append("'%s'"%username)
  195. sql = " select edituser,human_identifier,to_char(edittime,'yyyy-mm-dd') from corpus_iedocument where edituser in(%s) order by edittime asc"%(",".join(list_username))
  196. print(sql)
  197. cursor.execute(sql)
  198. rows = cursor.fetchall()
  199. for row in rows:
  200. list_user.append(row[0])
  201. list_label.append(row[1])
  202. list_time.append(row[2])
  203. df = pd.DataFrame({"时间":list_time,"用户":list_user,"文章编号":list_label})
  204. df.to_excel("分组_%d.xls"%(_i+1),columns=["时间","用户","文章编号"])
  205. def filter(self):
  206. '''
  207. 过滤拍卖公告
  208. :return:
  209. '''
  210. import re
  211. ieDocuments = IEDocument.objects.all()
  212. for obj in ieDocuments:
  213. if re.search("拍卖",obj.text) is not None:
  214. obj.jump_signal = 1
  215. obj.save()
  216. print(obj.human_identifier)
  217. if __name__=="__main__":
  218. settle = Settlement()
  219. # settle.makeMigrate("test","2020-08-01","2020-08-31")
  220. # settle.makePayroll(["test3","test19","test22","test2","test9","test11","test12","test1","test7","test21","test17"],"2020-08-01","2020-12-25")
  221. # settle.exportPayroll(begin_time=None,end_time='2020-12-25')
  222. # settle.createUser_batch(batch_size=102)
  223. # settle.exportLabels()
  224. # settle.filter()
  225. from brat.models import BratAnnotation as brat_annotations
  226. from iepy.webui.corpus.models import IEDocument
  227. filename = '74800260'
  228. pre_label = IEDocument.objects.filter(human_identifier='74800260').values("pre_label")
  229. _label = ""
  230. if len(pre_label)>0:
  231. if pre_label[0]["pre_label"] is not None:
  232. _label = pre_label[0]["pre_label"]
  233. dict_T = dict()
  234. dict_R = dict()
  235. set_pre_label = set()
  236. for _i in _label.split(";"):
  237. if _i!="":
  238. set_pre_label.add(_i)
  239. set_label = set()
  240. anns = brat_annotations.objects.filter(document_id='74800260').values("value")
  241. for _str in anns:
  242. _str = _str["value"].strip()
  243. if _str != "":
  244. if _str[0]=="T":
  245. match = re.search(T_pattern,_str)
  246. if match is not None:
  247. match = match.groupdict()
  248. dict_T[match["T"]] = {"type":match["type"],"begin":match["begin"],"end":match["end"]}
  249. if _str[0]=="R":
  250. match = re.search(R_pattern,_str)
  251. if match is not None:
  252. match = match.groupdict()
  253. dict_R[match["R"]] = {"type":match["type"],"arg1":match["arg1"],"arg2":match["arg2"]}
  254. for _T,_v in dict_T.items():
  255. set_label.add("T|%s|%d|%d"%(_v["type"],int(_v["begin"]),int(_v["end"])))
  256. for _R,_v in dict_R.items():
  257. set_label.add("R|%s|%d|%d|%d|%d"%(_v["type"],int(dict_T[_v["arg1"]]["begin"]),int(dict_T[_v["arg1"]]["end"]),int(dict_T[_v["arg2"]]["begin"]),int(dict_T[_v["arg2"]]["end"])))
  258. union_set = set_pre_label&set_label
  259. deleted = len(set_pre_label)-len(union_set)
  260. added = len(set_label)-len(union_set)
  261. print(deleted,added)