#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/1/13 0013 14:19
import re
import math
import random
import psycopg2
import numpy as np
from tensorflow.contrib.crf import viterbi_decode
from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_word

# BIE tagging scheme: O = outside, B = entity begin, I = inside, E = entity end
id_to_tag = {0: 'O', 1: 'B', 2: 'I', 3: 'E'}
word_model = getModel_word()
vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
word2id = {k: v for v, k in enumerate(vocab)}
max_id = len(vocab)  # id reserved for out-of-vocabulary characters
conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
cursor = conn.cursor()


def get_label_data():
    """Fetch manually edited documents and turn brat 'product' annotations into BIE tag sequences."""
    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
    and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
    cursor.execute(sql)
    writer = open('label_data.txt', 'w', encoding='utf-8')
    datas = []
    for row in cursor.fetchall():
        docid = row[0]
        text = row[1]
        tags = [0] * len(text)
        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
        cursor.execute(sql_lb)
        for row_lb in cursor.fetchall():
            label = row_lb[0]
            # brat annotation value: "<id> <type> <begin> <end> <mention text>";
            # maxsplit=4 keeps a mention text containing whitespace in one field
            _, _, begin, end, _ = re.split(r'\s', label, maxsplit=4)
            begin = int(begin)
            end = int(end)
            if end - begin >= 2:
                tags[begin] = 1
                tags[end - 1] = 3
                for i in range(begin + 1, end - 1):
                    tags[i] = 2
        text_sentence = []
        ids_sentence = []
        tag_sentence = []
        for i in range(len(text)):
            text_sentence.append(text[i])
            ids_sentence.append(word2id.get(text[i], max_id))
            tag_sentence.append(tags[i])
            writer.write("%s\t%s\n" % (text[i], tags[i]))
            if text[i] in ['。', '?', '!', ';']:
                writer.write('\n')
                if text_sentence:
                    if len(text_sentence) > 100:
                        datas.append([text_sentence, ids_sentence, tag_sentence])
                    elif len(text_sentence) > 5:
                        # buffer still short: keep accumulating sentences
                        # until the chunk exceeds 100 characters
                        continue
                    else:
                        print('dropped short sentence, length: %d, docid: %s' % (len(text_sentence), docid))
                    text_sentence = []
                    ids_sentence = []
                    tag_sentence = []
        # flush whatever remains after the last sentence delimiter
        if text_sentence:
            if len(text_sentence) > 5:
                datas.append([text_sentence, ids_sentence, tag_sentence])
            else:
                print('dropped short trailing sentence, length: %d, docid: %s' % (len(text_sentence), docid))
    writer.close()
    return datas


def input_from_line(line):
    """Wrap a raw string as a single-sample batch; tags are empty at inference time."""
    string = list(line)
    ids = [word2id.get(k, max_id) for k in string]
    tags = []
    return [[string], [ids], [tags]]


def process_data(sentences):
    """
    Digitize sentences and pad them to a common length.
    :param sentences: list of sentence strings, e.g. ['招标公告', '招标代理']
    :return: id sequences padded to the length of the longest sentence
    """
    maxLen = max([len(sentence) for sentence in sentences])
    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
    pad_tags = [tag[:maxLen] + [0] * (maxLen - len(tag)) for tag in tags]
    return pad_tags


def get_ner(BIE_tag):
    """Extract (start, end) spans of entities from a string of B/I/E letters."""
    ner = set()
    for it in re.finditer('BI*E', BIE_tag):
        ner.add((it.start(), it.end()))
    return ner


def decode(logits, lengths, matrix):
    """Viterbi-decode emission scores with the CRF transition matrix (one extra start state appended)."""
    paths = []
    small = -1000.0
    start = np.asarray([[small] * 4 + [0]])  # force decoding to begin in the start state
    for score, length in zip(logits, lengths):
        score = score[:length]
        pad = small * np.ones([length, 1])
        padded_logits = np.concatenate([score, pad], axis=1)
        padded_logits = np.concatenate([start, padded_logits], axis=0)
        path, _ = viterbi_decode(padded_logits, matrix)
        paths.append(path[1:])  # drop the artificial start step
    return paths


def result_to_json(line, tags):
    """Collect entity mentions by matching the B(1) I(2)* E(3) pattern over the tag sequence."""
    result = []
    ner = []
    tags = ''.join([str(it) for it in tags])
    for it in re.finditer("12*3", tags):
        start = it.start()
        end = it.end()
        ner.append([line[start:end], (start, end)])
    result.append([line, ner])
    print(tags)
    return result
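

# The sketch below is added for illustration and is not part of the original
# script: it shows how decode() and result_to_json() fit together at
# inference time. The emission logits and transition matrix are random
# stand-ins for a trained BiLSTM-CRF's dense-layer outputs and its learned
# CRF transition parameters.
def _demo_decode_usage(line='测试句子。'):
    num_tags = len(id_to_tag)                                # 4 real tags: O/B/I/E
    fake_logits = np.random.rand(1, len(line), num_tags)     # [batch, steps, tags]
    fake_trans = np.random.rand(num_tags + 1, num_tags + 1)  # +1 for the start state
    paths = decode(fake_logits, [len(line)], fake_trans)
    return result_to_json(line, paths[0])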
class BatchManager(object):
    """Buckets sentences by length into padded batches and iterates over them."""

    def __init__(self, data, batch_size):
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        # sort by sentence length so each batch needs minimal padding
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        print('min sentence length: %d; max sentence length: %d'
              % (len(sorted_data[0][0]), len(sorted_data[-1][0])))  # temporary: log the length range
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad_data(sorted_data[i * int(batch_size):(i + 1) * int(batch_size)]))
        return batch_data

    @staticmethod
    def pad_data(data):
        strings = []
        chars = []
        targets = []
        max_length = max([len(sentence[0]) for sentence in data])
        for line in data:
            string, char, target = line
            padding = [0] * (max_length - len(string))
            strings.append(string + padding)
            chars.append(char + padding)
            targets.append(target + padding)
        return [strings, chars, targets]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]
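

# Hedged end-to-end sketch, added for illustration (not in the original
# script): pull labelled sentences from the database, bucket them into
# padded batches, and peek at one batch. batch_size=50 is an arbitrary
# illustrative value.
if __name__ == '__main__':
    train_data = get_label_data()                  # [[chars, ids, tags], ...]
    manager = BatchManager(train_data, batch_size=50)
    for strings, char_ids, tag_ids in manager.iter_batch(shuffle=True):
        # each batch holds equal-length (padded) sequences, ready to feed
        # a BiLSTM-CRF trainer
        print('batch size: %d, padded length: %d' % (len(strings), len(char_ids[0])))
        break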