# data_util.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/1/13 14:19
import re
import math
import json
import random
import numpy as np
import pandas as pd
from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_word, viterbi_decode

tag2index = {'S': 0, 'B-pro': 1, 'I-pro': 2, 'E-pro': 3, 'B-rea': 4, 'I-rea': 5, 'E-rea': 6}
id_to_tag = {v: k for k, v in tag2index.items()}
# id_to_tag = {0:'O',1:'B',2:'I',3:'E'}

word_model = getModel_word()
vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
word2id = {k: v for v, k in enumerate(vocab)}
max_id = len(vocab)
# vocab = ["<pad>"] + word_model.index2word + ["<unk>"]
# matrix = np.zeros((len(vocab), 60))
# for i in range(1, len(vocab)-1):
#     matrix[i] = word_model[vocab[i]]
# max_id = len(vocab)
# word2id = {k: v for v, k in enumerate(vocab)}

def df2data(df):
    datas = []
    for idx in df.index:
        docid = df.loc[idx, 'docid']
        text = df.loc[idx, 'text']
        # string = list(text)
        tags = [0] * len(text)
        labels = json.loads(df.loc[idx, 'label'])
        for label in labels:
            # brat annotation line: 'T1 product <begin> <end> <mention>'
            _, _, begin, end, _ = re.split(r'\s', label)
            begin = int(begin)
            end = int(end)
            if end - begin >= 2:
                tags[begin] = 1
                tags[end - 1] = 3
                for i in range(begin + 1, end - 1):
                    tags[i] = 2
        # datas.append([string, tags])
        text_sentence = []
        ids_sentence = []
        tag_sentence = []
        for i in range(len(text)):
            text_sentence.append(text[i])
            ids_sentence.append(word2id.get(text[i], max_id))
            tag_sentence.append(tags[i])
            if text[i] in ['。', '!']:
                if text_sentence:
                    # if len(text_sentence) > 100:
                    if 5 < len(text_sentence) < 1000:
                        datas.append([text_sentence, ids_sentence, tag_sentence])
                    else:
                        print('Sentence shorter than 5 or longer than 1000; length: %d, doc ID: %s' % (len(text_sentence), docid))
                text_sentence = []
                ids_sentence = []
                tag_sentence = []
        if text_sentence:
            # if len(text_sentence) > 5:
            if 5 < len(text_sentence) < 1000:
                datas.append([text_sentence, ids_sentence, tag_sentence])
            else:
                print('Sentence shorter than 5 or longer than 1000; length: %d, doc ID: %s' % (len(text_sentence), docid))
    return datas
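
# A minimal usage sketch for df2data. The expected input (inferred from the code above)
# is a DataFrame with 'docid', 'text' and 'label' columns, where 'label' is a JSON list
# of brat-style strings 'T1 product <begin> <end> <mention>'; the sample row below is
# illustrative only.
# _df = pd.DataFrame([{'docid': '1', 'text': '采购机床设备一批。',
#                      'label': json.dumps(['T1 product 2 6 机床设备'])}])
# for _sen, _ids, _tags in df2data(_df):
#     print(_sen, _tags)  # tags follow tag2index above: 1=B-pro, 2=I-pro, 3=E-pro
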
def find_kw_from_text(kw, s):
    '''
    Given a keyword and a piece of text, return all positions where the keyword occurs.
    :param kw: keyword
    :param s: text
    :return: list of (begin, end) offset pairs
    '''
    begin = s.find(kw, 0)
    kws = []
    while begin != -1:
        end = begin + len(kw)
        # print(s[begin:end])
        kws.append((begin, end))
        begin = s.find(kw, end)
    return kws
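
# Illustrative check for find_kw_from_text (sample strings are hypothetical):
# find_kw_from_text('设备', '设备采购,设备安装')  ->  [(0, 2), (5, 7)]
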
def get_feature(text, lbs):
    '''
    Given the preprocessed text of an article and a list of product names, return the
    sentence list, the digitized sentence list and the digitized tag list.
    :param text: text content
    :param lbs: list of product names
    :return:
    '''
    lbs = sorted(set(lbs), key=lambda x: len(x), reverse=True)  # match longer names first
    sentences = []
    ids_list = []
    tags_list = []
    for sentence in text.split('。'):
        if len(sentence) < 5:
            continue
        if len(sentence) > 1000:
            sentence = sentence[:1000]
        tags = [0] * len(sentence)
        ids = [word2id.get(word, max_id) for word in sentence]
        for lb in lbs:
            kw_indexs = find_kw_from_text(lb, sentence)
            for indexs in kw_indexs:
                b, e = indexs
                if tags[b] == 0 and tags[e - 1] == 0:  # don't overwrite an existing span
                    tags[b] = 1
                    tags[e - 1] = 3
                    for i in range(b + 1, e - 1):
                        tags[i] = 2
        sentences.append(list(sentence))
        ids_list.append(ids)
        tags_list.append(tags)
    return sentences, ids_list, tags_list
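
# A minimal sketch of get_feature: text is split on '。' and every occurrence of a
# product name is tagged B/I/E (1/2/3). Sample values are illustrative only.
# _sens, _ids, _tags = get_feature('采购机床设备一批。交货期三十天。', ['机床设备'])
# print(_tags[0])  # -> [0, 0, 1, 2, 2, 3, 0, 0]
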
def dfsearchlb(df):
    datas = []
    for i in df.index:
        text = df.loc[i, 'text']
        lbs = json.loads(df.loc[i, 'lbset'])
        sentences, ids_list, tags_list = get_feature(text, lbs)
        for sen, ids, tags in zip(sentences, ids_list, tags_list):
            datas.append([sen, ids, tags])
    return datas
def get_label_data():
    import psycopg2
    conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
    cursor = conn.cursor()
    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
          and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
    cursor.execute(sql)
    writer = open('label_data.txt', 'w', encoding='utf-8')
    datas = []
    for row in cursor.fetchall():
        docid = row[0]
        text = row[1]
        # string = list(text)
        tags = [0] * len(text)
        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
        cursor.execute(sql_lb)
        for row_lb in cursor.fetchall():
            label = row_lb[0]
            _, _, begin, end, _ = re.split(r'\s', label)
            begin = int(begin)
            end = int(end)
            if end - begin >= 2:
                tags[begin] = 1
                tags[end - 1] = 3
                for i in range(begin + 1, end - 1):
                    tags[i] = 2
        # datas.append([string, tags])
        text_sentence = []
        ids_sentence = []
        tag_sentence = []
        for i in range(len(text)):
            text_sentence.append(text[i])
            ids_sentence.append(word2id.get(text[i], max_id))
            tag_sentence.append(tags[i])
            writer.write("%s\t%s\n" % (text[i], tags[i]))
            if text[i] in ['。', '?', '!', ';']:
                writer.write('\n')
                if text_sentence:
                    if len(text_sentence) > 100:
                        # if len(text_sentence)>5 and len(text_sentence)<1000:
                        datas.append([text_sentence, ids_sentence, tag_sentence])
                    elif len(text_sentence) > 5:
                        # note: 'continue' skips the resets below, so sentences of
                        # length 6-100 are merged into the following sentence
                        continue
                    else:
                        print('Sentence shorter than 5 or longer than 100; length: %d, doc ID: %s' % (len(text_sentence), docid))
                text_sentence = []
                ids_sentence = []
                tag_sentence = []
        if text_sentence:
            if len(text_sentence) > 5:
                # if len(text_sentence) > 5 and len(text_sentence) < 1000:
                datas.append([text_sentence, ids_sentence, tag_sentence])
            else:
                print('Sentence shorter than 5 or longer than 100; length: %d, doc ID: %s' % (len(text_sentence), docid))
    writer.close()
    return datas
def input_from_line(line):
    string = list(line)
    ids = [word2id.get(k, max_id) for k in string]
    tags = []
    return [[string], [ids], [tags]]
def process_data(sentences):
    '''
    Digitize strings and pad them to a uniform length.
    :param sentences: list of sentence strings from an article, e.g. ['招标公告','招标代理']
    :return: digitized char ids padded to a uniform length
    '''
    maxLen = max([len(sentence) for sentence in sentences])
    ids = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]  # renamed from 'tags': these are char ids, not tags
    pad_ids = [id_[:maxLen] + [0] * (maxLen - len(id_)) for id_ in ids]
    return pad_ids
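
# A minimal sketch of process_data: both strings are mapped to char ids and the shorter
# one is right-padded with 0 to the longer one's length.
# _padded = process_data(['招标公告', '招标'])
# print(len(_padded[0]), len(_padded[1]))  # -> 4 4
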
def get_ner(BIE_tag):
    ner = set()
    for it in re.finditer('BI*E', BIE_tag):
        ner.add((it.start(), it.end()))
    return ner
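
# Illustrative check for get_ner: entity spans are the (start, end) offsets of 'BI*E'
# runs in a tag string.
# get_ner('OBIEOBE')  ->  {(1, 4), (5, 7)}
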
def decode(logits, lengths, matrix):
    paths = []
    small = -1000.0
    # width must be num_tags + 1 to match the padded scores below; the original
    # hardcoded 4 here, which only fits the older 4-tag scheme commented out above
    start = np.asarray([[small] * len(tag2index) + [0]])
    for score, length in zip(logits, lengths):
        score = score[:length]
        pad = small * np.ones([length, 1])
        padded_logits = np.concatenate([score, pad], axis=1)  # renamed to avoid shadowing the 'logits' argument
        padded_logits = np.concatenate([start, padded_logits], axis=0)
        path, _ = viterbi_decode(padded_logits, matrix)
        paths.append(path[1:])  # drop the artificial start step
    return paths
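
# Shape sketch for decode (an assumption, not confirmed by this file): logits is a
# [batch, max_len, num_tags] array of per-char tag scores, lengths holds the true
# sentence lengths, and matrix is the [num_tags+1, num_tags+1] transition matrix that
# viterbi_decode expects; the extra row/column belongs to the artificial start tag.
# _n = len(tag2index)
# _paths = decode(np.random.rand(1, 4, _n), [4], np.zeros((_n + 1, _n + 1)))
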
def result_to_json(line, tags):
    result = []
    ner = []
    tags = ''.join([str(it) for it in tags])
    # for it in re.finditer("12*3", tags):
    #     start = it.start()
    #     end = it.end()
    #     ner.append([line[start:end], (start, end)])
    for it in re.finditer("45*6", tags):
        start = it.start()
        end = it.end()
        ner.append([line[start:end], (start, end)])
    result.append([line, ner])
    # print(tags)
    return result
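
# Illustrative check for result_to_json: with tag2index above, '45*6' runs are
# B-rea/I-rea/E-rea spans; the commented-out '12*3' pattern would extract product spans.
# result_to_json('因雨延期', [4, 5, 5, 6])  ->  [['因雨延期', [['因雨延期', (0, 4)]]]]
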
class BatchManager(object):
    def __init__(self, data, batch_size):
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        print('Min sentence length: %d; max sentence length: %d' % (len(sorted_data[0][0]), len(sorted_data[-1][0])))  # temporary logging of sentence lengths
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad_data(sorted_data[i * int(batch_size):(i + 1) * int(batch_size)]))
        return batch_data

    @staticmethod
    def pad_data(data):
        strings = []
        chars = []
        targets = []
        max_length = max([len(sentence[0]) for sentence in data])
        for line in data:
            string, char, target = line
            padding = [0] * (max_length - len(string))
            strings.append(string + padding)
            chars.append(char + padding)
            targets.append(target + padding)
        return [strings, chars, targets]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]
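
# A minimal usage sketch for BatchManager, assuming data items are the
# [chars, ids, tags] triples produced by df2data / get_feature (sample values below
# are illustrative only):
# _data = [[['招', '标'], [3, 4], [0, 0]], [['公', '告', '。'], [5, 6, 7], [0, 0, 0]]]
# _bm = BatchManager(_data, batch_size=2)
# for _strings, _chars, _targets in _bm.iter_batch(shuffle=True):
#     pass  # each batch is padded to its own max length
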
def 获取原始标注数据():
    import psycopg2
    conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.103')
    cursor = conn.cursor()
    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 ;"
    cursor.execute(sql)
    datas = []
    for row in cursor.fetchall():
        docid = row[0]
        text = row[1]
        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
        cursor.execute(sql_lb)
        rows = cursor.fetchall()
        print('len(rows)', len(rows))
        datas.append((docid, text, json.dumps(rows, ensure_ascii=False), len(rows)))
    df = pd.DataFrame(datas, columns=['docid', 'text', 'rows', 'product_num'])
    df.to_excel('data/产品数据自己人标注的原始数据.xlsx')
if __name__ == "__main__":
    # import os
    import pickle
    # with open('data/dev_data2.pkl', 'rb') as f:
    #     dev_data = pickle.load(f)
    # print(len(dev_data))
    # print(os.path.exists('data/testdata.xlsx'))
    # df = pd.read_excel('data/testdata.xlsx')
    # print(len(df))
    # data_test = df2data(df)
    # print(len(data_test), len(data_test[0][0]))
    # 获取原始标注数据()
    df = pd.read_excel('data/产品数据自己人标注的原始数据.xlsx')
    with open('data/dev_data2.pkl', 'rb') as f:
        dev_data = pickle.load(f)
    print(len(set(df['docid'])))
    print('')