data_util.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/1/13 0013 14:19
import re
import os
import math
import json
import random
import numpy as np
import pandas as pd
from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_word, viterbi_decode, load

tag2index = {'S': 0, 'B-pro': 1, 'I-pro': 2, 'E-pro': 3, 'B-rea': 4, 'I-rea': 5, 'E-rea': 6}
id_to_tag = {v: k for k, v in tag2index.items()}
# id_to_tag = {0:'O',1:'B',2:'I',3:'E'}

word_model = getModel_word()
vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
word2id = {k: v for v, k in enumerate(vocab)}
max_id = len(vocab)

# Alternative: load a previously saved vocabulary and embedding matrix.
# path1 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + "/interface/codename_vocab.pk"
# path2 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + "/interface/codename_w2v_matrix.pk"
# vocab = load(path1)
# matrix = load(path2)
# max_id = len(vocab)
# word2id = {k: v for v, k in enumerate(vocab)}

# Alternative: build the vocabulary and matrix directly from the word model.
# vocab = ["<pad>"] + word_model.index2word + ["<unk>"]
# matrix = np.zeros((len(vocab), 60))
# for i in range(1, len(vocab) - 1):
#     matrix[i] = word_model[vocab[i]]
# max_id = len(vocab)
# word2id = {k: v for v, k in enumerate(vocab)}
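# Illustrative sketch of how the mappings above are used: characters are looked
# up in word2id, with out-of-vocabulary characters falling back to the '<unk>'
# id (the sample string is just an assumed example):
#
#     sample_ids = [word2id.get(ch, word2id.get('<unk>')) for ch in '招标公告']
#     sample_tags = [id_to_tag[t] for t in [1, 2, 2, 3]]  # ['B-pro', 'I-pro', 'I-pro', 'E-pro']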
def df2data(df):
    '''
    Convert a labeled DataFrame (docid / text / label columns) into per-sentence
    [characters, ids, tags] training samples.
    '''
    datas = []
    for idx in df.index:
        docid = df.loc[idx, 'docid']
        text = df.loc[idx, 'text']
        # string = list(text)
        tags = [0] * len(text)
        labels = json.loads(df.loc[idx, 'label'])
        for label in labels:
            _, _, begin, end, _ = re.split(r'\s', label)
            begin = int(begin)
            end = int(end)
            if end - begin >= 2:
                tags[begin] = 1
                tags[end - 1] = 3
                for i in range(begin + 1, end - 1):
                    tags[i] = 2
        # datas.append([string, tags])
        text_sentence = []
        ids_sentence = []
        tag_sentence = []
        for i in range(len(text)):
            text_sentence.append(text[i])
            # ids_sentence.append(word2id.get(text[i], max_id))
            ids_sentence.append(word2id.get(text[i], word2id.get('<unk>')))
            tag_sentence.append(tags[i])
            if text[i] in ['。', '!']:
                if text_sentence:
                    # if len(text_sentence) > 100:
                    if 5 < len(text_sentence) < 1000:
                        datas.append([text_sentence, ids_sentence, tag_sentence])
                    else:
                        print('Sentence length <= 5 or >= 1000 dropped; length: %d, docid: %s' % (len(text_sentence), docid))
                    text_sentence = []
                    ids_sentence = []
                    tag_sentence = []
        if text_sentence:
            # if len(text_sentence) > 5:
            if 5 < len(text_sentence) < 1000:
                datas.append([text_sentence, ids_sentence, tag_sentence])
            else:
                print('Sentence length <= 5 or >= 1000 dropped; length: %d, docid: %s' % (len(text_sentence), docid))
    return datas
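# Usage sketch for df2data (hypothetical data; the label format
# 'T<id> <type> <begin> <end> <text>' mirrors the re.split(r'\s', label)
# unpacking above):
#
#     sample_df = pd.DataFrame([{'docid': 1,
#                                'text': '本次采购中央空调一批,详见附件。',
#                                'label': json.dumps(['T1 product 4 8 中央空调'])}])
#     samples = df2data(sample_df)
#     # one sentence sample; its tags mark positions 4-7 as 1, 2, 2, 3 (B/I/I/E)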
def find_kw_from_text(kw, s):
    '''
    Given a keyword and a sentence, return every position where the keyword occurs in the sentence.
    :param kw: keyword
    :param s: text
    :return: list of (begin, end) index pairs
    '''
    begin = s.find(kw, 0)
    kws = []
    while begin != -1:
        end = begin + len(kw)
        # print(s[begin:end])
        kws.append((begin, end))
        begin = s.find(kw, end)
    return kws
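# Example: all non-overlapping occurrences are returned, e.g.
#     find_kw_from_text('空调', '空调维修及空调采购')  # -> [(0, 2), (5, 7)]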
def get_feature(text, lbs):
    '''
    Given the preprocessed text of an article and a list of product names, return the
    sentence list, the digitized sentence list and the digitized tag list.
    :param text: text content
    :param lbs: list of product names
    :return: sentences, ids_list, tags_list
    '''
    lbs = sorted(set(lbs), key=lambda x: len(x), reverse=True)
    sentences = []
    ids_list = []
    tags_list = []
    for sentence in text.split('。'):
        if len(sentence) < 5:
            continue
        if len(sentence) > 1000:
            sentence = sentence[:1000]
        tags = [0] * len(sentence)
        # ids = [word2id.get(word, max_id) for word in sentence]
        ids = [word2id.get(word, word2id.get('<unk>')) for word in sentence]
        for lb in lbs:
            kw_indexs = find_kw_from_text(lb, sentence)
            for indexs in kw_indexs:
                b, e = indexs
                if tags[b] == 0 and tags[e - 1] == 0:
                    tags[b] = 1
                    tags[e - 1] = 3
                    for i in range(b + 1, e - 1):
                        tags[i] = 2
        sentences.append(list(sentence))
        ids_list.append(ids)
        tags_list.append(tags)
    return sentences, ids_list, tags_list
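# Usage sketch for get_feature (assumed product list; sentences shorter than
# 5 characters are skipped by the length filter above):
#
#     sens, ids, tags = get_feature('本次采购中央空调一批。质保期一年。', ['中央空调'])
#     # sens[0] == list('本次采购中央空调一批')
#     # tags[0] == [0, 0, 0, 0, 1, 2, 2, 3, 0, 0]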
def dfsearchlb(df):
    datas = []
    for i in df.index:
        text = df.loc[i, 'text']
        lbs = json.loads(df.loc[i, 'lbset'])
        sentences, ids_list, tags_list = get_feature(text, lbs)
        for sen, ids, tags in zip(sentences, ids_list, tags_list):
            datas.append([sen, ids, tags])
    return datas
def get_label_data():
    '''
    Pull annotated documents from the iepy PostgreSQL database and convert them into
    per-sentence [characters, ids, tags] samples; also dump character/tag pairs to label_data.txt.
    '''
    import psycopg2
    conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
    cursor = conn.cursor()
    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
    and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
    cursor.execute(sql)
    writer = open('label_data.txt', 'w', encoding='utf-8')
    datas = []
    for row in cursor.fetchall():
        docid = row[0]
        text = row[1]
        # string = list(text)
        tags = [0] * len(text)
        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
        cursor.execute(sql_lb)
        for row_lb in cursor.fetchall():
            label = row_lb[0]
            _, _, begin, end, _ = re.split(r'\s', label)
            begin = int(begin)
            end = int(end)
            if end - begin >= 2:
                tags[begin] = 1
                tags[end - 1] = 3
                for i in range(begin + 1, end - 1):
                    tags[i] = 2
        # datas.append([string, tags])
        text_sentence = []
        ids_sentence = []
        tag_sentence = []
        for i in range(len(text)):
            text_sentence.append(text[i])
            # ids_sentence.append(word2id.get(text[i], max_id))
            ids_sentence.append(word2id.get(text[i], word2id.get('<unk>')))
            tag_sentence.append(tags[i])
            writer.write("%s\t%s\n" % (text[i], tags[i]))
            if text[i] in ['。', '?', '!', ';']:
                writer.write('\n')
                if text_sentence:
                    # Keep only segments longer than 100 characters; segments of
                    # length 6-100 keep accumulating into the next segment.
                    if len(text_sentence) > 100:
                        # if len(text_sentence)>5 and len(text_sentence)<1000:
                        datas.append([text_sentence, ids_sentence, tag_sentence])
                    elif len(text_sentence) > 5:
                        continue
                    else:
                        print('Sentence length <= 5 dropped; length: %d, docid: %s' % (len(text_sentence), docid))
                    text_sentence = []
                    ids_sentence = []
                    tag_sentence = []
        if text_sentence:
            if len(text_sentence) > 5:
                # if len(text_sentence) > 5 and len(text_sentence) < 1000:
                datas.append([text_sentence, ids_sentence, tag_sentence])
            else:
                print('Sentence length <= 5 dropped; length: %d, docid: %s' % (len(text_sentence), docid))
    writer.close()
    return datas
def input_from_line(line):
    string = list(line)
    # ids = [word2id.get(k, max_id) for k in string]
    ids = [word2id.get(k, word2id.get('<unk>')) for k in string]
    tags = []
    return [[string], [ids], [tags]]
def process_data(sentences):
    '''
    Digitize strings and pad them to a common length.
    :param sentences: list of sentence strings, e.g. ['招标公告', '招标代理']
    :return: padded id sequences of uniform length
    '''
    maxLen = max([len(sentence) for sentence in sentences])
    # Note: unknown characters fall back to max_id here, unlike the '<unk>' fallback used elsewhere in this file.
    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
    # tags = [[word2id.get(k, word2id.get('<unk>')) for k in sentence] for sentence in sentences]
    pad_tags = [tag[:maxLen] + [0] * (maxLen - len(tag)) for tag in tags]
    return pad_tags
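# Example: every sentence is padded with 0 up to the batch maximum (the id
# values themselves depend on the loaded vocabulary):
#
#     padded = process_data(['招标', '招标公告'])
#     # len(padded[0]) == len(padded[1]) == 4; padded[0][2:] == [0, 0]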
def get_ner(BIE_tag):
    ner = set()
    for it in re.finditer('BI*E', BIE_tag):
        ner.add((it.start(), it.end()))
    return ner
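# Example: spans are extracted from a letter-coded tag string:
#     get_ner('OBIIEO')  # -> {(1, 5)}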
def decode(logits, lengths, matrix):
    paths = []
    small = -1000.0
    start = np.asarray([[small] * 4 + [0]])
    # start = np.asarray([[small]*7+[0]])
    for score, length in zip(logits, lengths):
        score = score[:length]
        pad = small * np.ones([length, 1])
        score = np.concatenate([score, pad], axis=1)
        score = np.concatenate([start, score], axis=0)
        path, _ = viterbi_decode(score, matrix)
        paths.append(path[1:])
    return paths
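# Usage sketch for decode (hypothetical shapes): model logits for two sentences
# of lengths 6 and 9 with num_tags=4 per step, plus a transition matrix sized
# for the extra start state ([5, 5] to match the 4-tag start vector above);
# path[1:] drops the synthetic start step:
#
#     batch_logits = np.random.rand(2, 9, 4)
#     batch_paths = decode(batch_logits, [6, 9], np.random.rand(5, 5))
#     # batch_paths[0] holds 6 tag ids, batch_paths[1] holds 9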
def result_to_json(line, tags):
    result = []
    ner = []
    tags = ''.join([str(it) for it in tags])
    for it in re.finditer("12*3", tags):
        start = it.start()
        end = it.end()
        ner.append([line[start:end], (start, end)])
    # for it in re.finditer("45*6", tags):
    #     start = it.start()
    #     end = it.end()
    #     ner.append([line[start:end], (start, end)])
    result.append([line, ner])
    # print(tags)
    return result
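# Example: tag ids 1/2/3 (B/I/E of a product) are joined into a digit string and
# the regex '12*3' recovers each entity span:
#
#     result_to_json('中央空调采购', [1, 2, 2, 3, 0, 0])
#     # -> [['中央空调采购', [['中央空调', (0, 4)]]]]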
class BatchManager(object):
    def __init__(self, data, batch_size):
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        print('Min sentence length: %d; max sentence length: %d' % (len(sorted_data[0][0]), len(sorted_data[-1][0])))  # temporary logging of sentence lengths
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad_data(sorted_data[i * int(batch_size):(i + 1) * int(batch_size)]))
        return batch_data

    @staticmethod
    def pad_data(data):
        strings = []
        chars = []
        targets = []
        max_length = max([len(sentence[0]) for sentence in data])
        for line in data:
            string, char, target = line
            padding = [0] * (max_length - len(string))
            strings.append(string + padding)
            chars.append(char + padding)
            targets.append(target + padding)
        return [strings, chars, targets]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]
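# Usage sketch for BatchManager (hypothetical samples in the [chars, ids, tags]
# layout produced by df2data / get_feature; id values are made up):
#
#     data = [[list('招标公告'), [2, 3, 4, 5], [0, 0, 0, 0]],
#             [list('空调采购'), [6, 7, 8, 9], [1, 3, 0, 0]]]
#     bm = BatchManager(data, batch_size=2)
#     for strings, chars, targets in bm.iter_batch(shuffle=True):
#         pass  # each batch element is padded to the longest sentence in it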
def 获取原始标注数据():
    '''
    Dump the raw brat product annotations of every edited document to an Excel file.
    '''
    import psycopg2
    conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.103')
    cursor = conn.cursor()
    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 ;"
    cursor.execute(sql)
    datas = []
    for row in cursor.fetchall():
        docid = row[0]
        text = row[1]
        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
        cursor.execute(sql_lb)
        rows = cursor.fetchall()
        print('len(rows)', len(rows))
        datas.append((docid, text, json.dumps(rows, ensure_ascii=False), len(rows)))
    df = pd.DataFrame(datas, columns=['docid', 'text', 'rows', 'product_num'])
    df.to_excel('data/产品数据自己人标注的原始数据.xlsx')
if __name__ == "__main__":
    # import os
    import pickle
    # with open('data/dev_data2.pkl', 'rb') as f:
    #     dev_data = pickle.load(f)
    # print(len(dev_data))
    # print(os.path.exists('data/testdata.xlsx'))
    # df = pd.read_excel('data/testdata.xlsx')
    # print(len(df))
    # data_test = df2data(df)
    # print(len(data_test), len(data_test[0][0]))
    # 获取原始标注数据()
    df = pd.read_excel('data/产品数据自己人标注的原始数据.xlsx')
    with open('data/dev_data2.pkl', 'rb') as f:
        dev_data = pickle.load(f)
    print(len(set(df['docid'])))
    print('')