# exportData.py

#coding:utf8
from dataSource.source import getConnection_testmysql,getConnection_postgres
from utils.Utils import save,getCurrent_date

def exportMergeTrainData():
    """Dump the merge-probability training pairs from MySQL to pickle files, 1,000,000 rows per part."""
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    for _type in ["is null","is not null"]:
        for i in range(20):
            _limit = 1000000
            _offset = i*_limit
            sql = "select docid_less,docid_greater,json_matrix,prob from merge_probability_pairs_featurematrix_train where prob %s limit %d offset %d"%(_type,_limit,_offset)
            cursor.execute(sql)
            row_name = cursor.description
            print(row_name)
            rows = cursor.fetchall()
            list_data = []
            for row in rows:
                # turn each row into a dict keyed by column name
                _dict = dict()
                for _n,_v in zip(row_name,row):
                    _dict[_n[0]] = _v
                list_data.append(_dict)
            save(list_data,"../data/%s-mergeTrain_%s_part%d.pk"%(getCurrent_date("%Y-%m-%d"),_type.replace(" ",""),i))

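# Note: assuming utils.Utils.save pickles its first argument (the ".pk" suffix suggests it, but this
# file does not show the implementation), each exported part can be read back with pickle.load(open(path,"rb")).
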
import pandas as pd
import re

def labelTime():
    """Add a reg_count column to the 延期 (deferment) annotation sheet:
    "-1" if the row already has a manual new_label, "1" if the left context ends
    with a deadline prefix (截止时间:/截止日期:), otherwise "0"."""
    filename = "../data/延期数据.xlsx"
    df = pd.read_excel(filename)
    columns = ["docid","entity_text","label","sentence_left","sentence_right","context_left","context_right","new_label"]
    df_data = {}
    for _c in columns:
        df_data[_c] = df[_c]
    append_columns = ["reg_count"]
    columns.extend(append_columns)
    for _c in append_columns:
        df_data[_c] = []
    for _left,_new_label,_text in zip(df["sentence_left"],df["new_label"],df["entity_text"]):
        _line = _left[-7:]
        if str(_new_label)!='nan' and _new_label is not None and _new_label!="":
            df_data["reg_count"].append("-1")
        else:
            # if re.search(".*月.*日",_text) is not None or re.search("^\d+\-\d+\-",_text) is not None:
            #     df_data["reg_count"].append("1")
            # else:
            #     df_data["reg_count"].append("0")
            if re.search("截止(时间|日期):$",_line) is not None:
                df_data["reg_count"].append("1")
            else:
                df_data["reg_count"].append("0")
    new_df = pd.DataFrame(df_data)
    new_df.to_excel(filename,columns=columns)

import time

def getAttachmentProcessTime():
    """Parse flow_attachment.log for the "thread status alive:N" lines and accumulate
    thread-seconds: _sum1 adds (40 - alive)*interval whenever some of the 40 worker
    threads are missing, _sum2 adds 40*interval when all of them are alive."""
    list_line = []
    with open("flow_attachment.log","r",encoding="utf8") as f:
        list_line = f.readlines()
    # _regrex = "filemd5:(?P<filemd5>[a-z0-9]+) of type:(?P<filetype>[a-z0-9]+) with size:(?P<filesize>\d+(\.\d+)?)M download:(?P<dowload>\d+(\.\d+)?)s,recognize takes (?P<recognize>\d+(\.\d+)?)s,ret_size:(?P<ret_size>\d+(\.\d+)?)"
    # _regrex = "process filemd5:(?P<filemd5>[a-z0-9]+) of type:(?P<filetype>[a-z0-9]+) with size:(?P<filesize>\d+(\.\d+)?)M download:(?P<dowload>\d+(\.\d+)?)s recognize takes (?P<recognize>\d+(\.\d+)?)s,ret_size:(?P<ret_size>\d+(\.\d+)?)"
    # # _regrex = "filemd5:(?P<filemd5>[a-z0-9]+) of type:(?P<filetype>[a-z0-9]+) download:(?P<dowload>\d+(\.\d+)?)s recognize:(?P<recognize>\d+(\.\d+)?)s result:True rec_size:(?P<ret_size>\d+(\.\d+)?)"
    # keys = ["filemd5","filetype","filesize","dowload","recognize","ret_size"]
    _regrex = r"(?P<process_time>.+),\d+ - root - INFO - thread status alive:(?P<alive_count>\d+) restart:0 total:40"
    keys = ["process_time","alive_count"]
    df_data = {}
    for k in keys:
        df_data[k] = []
    for _line in list_line:
        _search = re.search(_regrex,_line)
        if _search is not None:
            gd = _search.groupdict()
            for k in keys:
                df_data[k].append(gd.get(k,""))
    _sum1 = 0
    _sum2 = 0
    last_time = None
    for _time,_count in zip(df_data["process_time"],df_data["alive_count"]):
        if last_time is None:
            last_time = time.mktime(time.strptime(_time,"%Y-%m-%d %H:%M:%S"))
        current_time = time.mktime(time.strptime(_time,"%Y-%m-%d %H:%M:%S"))
        if int(_count)!=40:
            # weight the interval by the number of threads that were not alive
            _sum1 += (40-int(_count))*(current_time-last_time)
        else:
            _sum2 += 40*(current_time-last_time)
        last_time = current_time
    print("sum1",_sum1)
    print("sum2",_sum2)
    df = pd.DataFrame(df_data)
    df.to_excel("attachmentProcessTime3.xlsx",columns=keys)

_exportID = 0
def getExportID():
    # shared, monotonically increasing id used for exported documents, entities and relations
    global _exportID
    _exportID += 1
    return _exportID

def getTypeName():
    """Return the mapping from annotation type codes to their Chinese display names."""
    _typestr = '''
    code | 项目编号
    name | 项目名称
    org | 组织
    company | 公司
    job | 职业
    person | 人名
    time | 时间
    location | 地址
    package | 包号
    phone | 电话
    money | 金额
    money_tendereeMoney | 招标金额
    money_tendererMoney | 中投标金额
    org_tenderee | 招标人
    org_agency | 代理人
    org_tenderer | 中标人
    org_secondTenderer | 第二候选人
    org_thirdTenderer | 第三候选人
    company_tenderee | 招标人
    company_agency | 代理人
    company_tenderer | 中标人
    company_secondTenderer | 第二候选人
    company_thirdTenderer | 第三候选人
    person_tendereePerson | 招标联系人
    person_agencyPerson | 代理联系人
    person_person | 联系人
    rel_tendererMoney | 中投标金额
    rel_tendereeMoney | 招标金额
    rel_person | 联系人
    rel_pack | 所属包
    rel_address | 地址
    rel_phone | 联系电话
    rel_pack_code | 包件编号
    rel_pack_name | 包件名称
    person_review | 评审专家
    time_release | 发布时间
    time_bidopen | 开标时间
    time_bidclose | 截标时间
    moneysource | 资金来源
    bidway | 招标方式
    serviceTime | 服务期限
    product | 产品
    abandon_reason | 失败原因
    '''
    dict_type_name = {}
    for _s_n in _typestr.split("\n"):
        _s_n = _s_n.strip()
        if _s_n=="":
            continue
        _s_t = _s_n.split("|")
        if len(_s_t)==2:
            _type = _s_t[0].strip()
            _name = _s_t[1].strip()
            dict_type_name[_type] = _name
    return dict_type_name

dict_type_name = getTypeName()
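# e.g. dict_type_name["code"] == "项目编号" and dict_type_name["org_tenderee"] == "招标人"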
import json

def toJson(list_anno,content):
    """Convert the brat annotation rows of one document ("T..." entity lines and
    "R..." relation lines) into a single JSON string with "entities" and "relations"."""
    json_dict = {}
    json_dict["id"] = getExportID()
    json_dict["text"] = content
    dict_anno = {}
    # first pass: parse the raw brat lines
    for _anno in list_anno:
        value = _anno["value"]
        _split = value.split("\t")
        if _split[0][0]=="T":
            _type,_begin,_end = _split[1].split(" ")
            dict_anno[_split[0]] = {"id":_split[0],"type":_type,"text":_split[2],"begin":int(_begin),"end":int(_end)}
        elif _split[0][0]=="R":
            _type,arg1,arg2 = _split[1].split(" ")
            dict_anno[_split[0]] = {"id":_split[0],"type":_type,"arg1":arg1.split(":")[1],"arg2":arg2.split(":")[1]}
    # assign new ids and readable labels to entities first, so relations can reference them
    for k,v in dict_anno.items():
        if v["id"][0]=="T":
            v["new_id"] = getExportID()
            v["label"] = dict_type_name[v["type"]]
            v["start_offset"] = v["begin"]
            v["end_offset"] = v["end"]
    for k,v in dict_anno.items():
        if v["id"][0]=="R":
            v["new_id"] = getExportID()
            v["type"] = dict_type_name[v["type"]]
            v["from_id"] = dict_anno[v["arg1"]]["new_id"]
            v["to_id"] = dict_anno[v["arg2"]]["new_id"]
    list_entitys = []
    list_relations = []
    for k,v in dict_anno.items():
        if v["id"][0]=="T":
            _dict = {"id":v["new_id"],
                     "label":v["label"],
                     "start_offset":v["start_offset"],
                     "end_offset":v["end_offset"]}
            list_entitys.append(_dict)
        if v["id"][0]=="R":
            _dict = {"id":v["new_id"],
                     "type":v["type"],
                     "from_id":v["from_id"],
                     "to_id":v["to_id"]}
            list_relations.append(_dict)
    json_dict["entities"] = list_entitys
    json_dict["relations"] = list_relations
    return json.dumps(json_dict,ensure_ascii=False)

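# Illustrative example only (the annotation values below are hypothetical, not real data): brat rows such as
#   "T1\tperson_person 0 2\t张三" and "R1\trel_person Arg1:T2 Arg2:T1"
# are looked up in dict_type_name and flattened into one JSON line of the form
#   {"id":..., "text":..., "entities":[{"id","label","start_offset","end_offset"},...],
#    "relations":[{"id","type","from_id","to_id"},...]}
# which looks like a doccano-style JSONL record (an assumption; the target tool is not named in this file).
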
def exportIepyLabel():
    """Export the most recent brat annotations (grouped per payroll period and user)
    from the iepy postgres database as one JSON line per document."""
    conn = getConnection_postgres("iepy")
    cursor = conn.cursor()
    sql = ' select begin_time,end_time,"user",doc_count from corpus_payroll where end_time<=\'2021-07-25\' order by end_time desc limit 20'
    cursor.execute(sql)
    list_diff = []
    rows_payroll = cursor.fetchall()
    list_json = []
    for _payroll in rows_payroll:
        _begin_time = _payroll[0]
        _end_time = _payroll[1]
        _user = _payroll[2]
        doc_count = _payroll[3]
        print(_user,_begin_time,_end_time,doc_count)
        _sql = "select document_id,value from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' limit 100) order by document_id"%(_user,_begin_time,_end_time)
        cursor.execute(_sql)
        rows = cursor.fetchall()
        if len(rows)>0:
            current_docid = rows[0][0]
            _index = -1
            list_values = []
            while _index<len(rows)-1:
                _index += 1
                row = rows[_index]
                document_id = row[0]
                value = row[1]
                if document_id!=current_docid:
                    # a new document starts: dump the annotations collected for the previous one
                    print(current_docid)
                    sql = "select text from corpus_iedocument where human_identifier='%s'"%(str(current_docid))
                    cursor.execute(sql)
                    content = cursor.fetchall()[0][0]
                    _json = toJson(list_values,content)
                    list_json.append(_json)
                    _index -= 1  # reprocess the current row as the first row of the next document
                    current_docid = document_id
                    list_values = []
                else:
                    list_values.append({"document_id":document_id,"value":value})
            if len(list_values)>0:
                # flush the last document of the batch, which has no following document to trigger the dump above
                sql = "select text from corpus_iedocument where human_identifier='%s'"%(str(current_docid))
                cursor.execute(sql)
                content = cursor.fetchall()[0][0]
                list_json.append(toJson(list_values,content))
    print("length:",len(list_json))
    _count = 0
    with open("iepy_export.json","w",encoding="utf8") as f:
        for _json in list_json:
            # keep the export small: only short JSON lines, at most 100 of them
            if len(_json)<600:
                f.write(_json)
                f.write("\n")
                _count += 1
                if _count>=100:
                    break
    print(_count)

def exportMergeWrongData():
    """Scan the delete log for documents removed by the 项目合并 (project merge) rule
    and write the distinct docids to mergeWrong.xlsx, using 30 worker threads."""
    from dataSource.source import getConnection_mysql
    from dataSource.pool import ConnectorPool
    from utils.multiThread import MultiThreadHandler
    from queue import Queue
    import datetime
    min_id = 1850000
    # max_id = 2000000
    max_id = 328426202+1
    thread_count = 30
    every_count = (max_id-min_id)//thread_count+1
    # split the id range into one [start,end) slice per thread
    list_dis = []
    for _i in range(thread_count):
        dis = [min_id+_i*every_count,min_id+(_i+1)*every_count]
        list_dis.append(dis)
    task_queue = Queue()
    result_queue = Queue()
    for _dis in list_dis:
        task_queue.put(_dis)
    def _handle(item,result_queue):
        conn = getConnection_mysql()
        cursor = conn.cursor()
        start,end = item
        set_docid = set()
        for _id in range(start,end):
            sql = "select rule,docid,operate_time from bxkc.bxkc_delete_document_log where id=%d"%(_id)
            cursor.execute(sql)
            rows = cursor.fetchall()
            if len(rows)>0:
                rule,docid,operate_time = rows[0]
                if docid in set_docid:
                    continue
                if str(rule)[:4]=="项目合并":
                    set_docid.add(docid)
                    print(len(set_docid),_id,end,end-_id)
                # stop once the log entries fall before 2022-10-14
                if operate_time.strftime("%Y-%m-%d")<"2022-10-14":
                    break
        result_queue.put(list(set_docid))
    mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count)
    mt.run()
    list_data = []
    while 1:
        try:
            list_docid = result_queue.get(True,1)
            list_data.extend(list_docid)
        except Exception as e:
            break
    list_data = list(set(list_data))
    df = pd.DataFrame({"docid":list_data})
    df.to_excel("mergeWrong.xlsx")

if __name__=="__main__":
    # exportMergeTrainData()
    # labelTime()
    # getAttachmentProcessTime()
    # exportIepyLabel()
    exportMergeWrongData()