# data_util.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/1/13 14:19
import re
import math
import json
import random
import numpy as np
import pandas as pd
from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_word, viterbi_decode

tag2index = {'S': 0, 'B-pro': 1, 'I-pro': 2, 'E-pro': 3, 'B-rea': 4, 'I-rea': 5, 'E-rea': 6}
id_to_tag = {v: k for k, v in tag2index.items()}
# id_to_tag = {0:'O',1:'B',2:'I',3:'E'}

word_model = getModel_word()
vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
word2id = {k: v for v, k in enumerate(vocab)}
max_id = len(vocab)
# vocab = ["<pad>"] + word_model.index2word + ["<unk>"]
# matrix = np.zeros((len(vocab), 60))
# for i in range(1, len(vocab)-1):
#     matrix[i] = word_model[vocab[i]]
# max_id = len(vocab)
# word2id = {k: v for v, k in enumerate(vocab)}

def df2data(df):
    datas = []
    for idx in df.index:
        docid = df.loc[idx, 'docid']
        text = df.loc[idx, 'text']
        # string = list(text)
        tags = [0] * len(text)
        labels = json.loads(df.loc[idx, 'label'])
        for label in labels:
            # brat annotation line: 'T1 product <begin> <end> <mention>'
            _, _, begin, end, _ = re.split(r'\s', label)
            begin = int(begin)
            end = int(end)
            if end - begin >= 2:
                tags[begin] = 1
                tags[end - 1] = 3
                for i in range(begin + 1, end - 1):
                    tags[i] = 2
        # datas.append([string, tags])
        text_sentence = []
        ids_sentence = []
        tag_sentence = []
        for i in range(len(text)):
            text_sentence.append(text[i])
            ids_sentence.append(word2id.get(text[i], max_id))
            tag_sentence.append(tags[i])
            if text[i] in ['。', '!']:
                if text_sentence:
                    # if len(text_sentence) > 100:
                    if 5 < len(text_sentence) < 1000:
                        datas.append([text_sentence, ids_sentence, tag_sentence])
                    else:
                        print('Sentence shorter than 5 or longer than 1000; length: %d, doc ID: %s' % (len(text_sentence), docid))
                text_sentence = []
                ids_sentence = []
                tag_sentence = []
        if text_sentence:
            # if len(text_sentence) > 5:
            if 5 < len(text_sentence) < 1000:
                datas.append([text_sentence, ids_sentence, tag_sentence])
            else:
                print('Sentence shorter than 5 or longer than 1000; length: %d, doc ID: %s' % (len(text_sentence), docid))
    return datas
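
# A minimal usage sketch for df2data. The expected input (inferred from the code above)
# is a DataFrame with 'docid', 'text' and 'label' columns, where 'label' is a JSON list
# of brat-style strings 'T1 product <begin> <end> <mention>'; the sample row below is
# illustrative only.
# _df = pd.DataFrame([{'docid': '1', 'text': '采购机床设备一批。',
#                      'label': json.dumps(['T1 product 2 6 机床设备'])}])
# for _sen, _ids, _tags in df2data(_df):
#     print(_sen, _tags)  # tags follow tag2index above: 1=B-pro, 2=I-pro, 3=E-pro
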
def find_kw_from_text(kw, s):
    '''
    Given a keyword and a piece of text, return all positions where the keyword occurs.
    :param kw: keyword
    :param s: text
    :return: list of (begin, end) offset pairs
    '''
    begin = s.find(kw, 0)
    kws = []
    while begin != -1:
        end = begin + len(kw)
        # print(s[begin:end])
        kws.append((begin, end))
        begin = s.find(kw, end)
    return kws
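
# Illustrative check for find_kw_from_text (sample strings are hypothetical):
# find_kw_from_text('设备', '设备采购,设备安装')  ->  [(0, 2), (5, 7)]
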
def get_feature(text, lbs):
    '''
    Given the preprocessed text of an article and a list of product names, return the
    sentence list, the digitized sentence list and the digitized tag list.
    :param text: text content
    :param lbs: list of product names
    :return:
    '''
    lbs = sorted(set(lbs), key=lambda x: len(x), reverse=True)  # match longer names first
    sentences = []
    ids_list = []
    tags_list = []
    for sentence in text.split('。'):
        if len(sentence) < 5:
            continue
        if len(sentence) > 1000:
            sentence = sentence[:1000]
        tags = [0] * len(sentence)
        ids = [word2id.get(word, max_id) for word in sentence]
        for lb in lbs:
            kw_indexs = find_kw_from_text(lb, sentence)
            for indexs in kw_indexs:
                b, e = indexs
                if tags[b] == 0 and tags[e - 1] == 0:  # don't overwrite an existing span
                    tags[b] = 1
                    tags[e - 1] = 3
                    for i in range(b + 1, e - 1):
                        tags[i] = 2
        sentences.append(list(sentence))
        ids_list.append(ids)
        tags_list.append(tags)
    return sentences, ids_list, tags_list
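
# A minimal sketch of get_feature: text is split on '。' and every occurrence of a
# product name is tagged B/I/E (1/2/3). Sample values are illustrative only.
# _sens, _ids, _tags = get_feature('采购机床设备一批。交货期三十天。', ['机床设备'])
# print(_tags[0])  # -> [0, 0, 1, 2, 2, 3, 0, 0]
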
def dfsearchlb(df):
    datas = []
    for i in df.index:
        text = df.loc[i, 'text']
        lbs = json.loads(df.loc[i, 'lbset'])
        sentences, ids_list, tags_list = get_feature(text, lbs)
        for sen, ids, tags in zip(sentences, ids_list, tags_list):
            datas.append([sen, ids, tags])
    return datas
def get_label_data():
    import psycopg2
    conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
    cursor = conn.cursor()
    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
          and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
    cursor.execute(sql)
    writer = open('label_data.txt', 'w', encoding='utf-8')
    datas = []
    for row in cursor.fetchall():
        docid = row[0]
        text = row[1]
        # string = list(text)
        tags = [0] * len(text)
        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
        cursor.execute(sql_lb)
        for row_lb in cursor.fetchall():
            label = row_lb[0]
            _, _, begin, end, _ = re.split(r'\s', label)
            begin = int(begin)
            end = int(end)
            if end - begin >= 2:
                tags[begin] = 1
                tags[end - 1] = 3
                for i in range(begin + 1, end - 1):
                    tags[i] = 2
        # datas.append([string, tags])
        text_sentence = []
        ids_sentence = []
        tag_sentence = []
        for i in range(len(text)):
            text_sentence.append(text[i])
            ids_sentence.append(word2id.get(text[i], max_id))
            tag_sentence.append(tags[i])
            writer.write("%s\t%s\n" % (text[i], tags[i]))
            if text[i] in ['。', '?', '!', ';']:
                writer.write('\n')
                if text_sentence:
                    if len(text_sentence) > 100:
                        # if len(text_sentence)>5 and len(text_sentence)<1000:
                        datas.append([text_sentence, ids_sentence, tag_sentence])
                    elif len(text_sentence) > 5:
                        # note: 'continue' skips the resets below, so sentences of
                        # length 6-100 are merged into the following sentence
                        continue
                    else:
                        print('Sentence shorter than 5 or longer than 100; length: %d, doc ID: %s' % (len(text_sentence), docid))
                text_sentence = []
                ids_sentence = []
                tag_sentence = []
        if text_sentence:
            if len(text_sentence) > 5:
                # if len(text_sentence) > 5 and len(text_sentence) < 1000:
                datas.append([text_sentence, ids_sentence, tag_sentence])
            else:
                print('Sentence shorter than 5 or longer than 100; length: %d, doc ID: %s' % (len(text_sentence), docid))
    writer.close()
    return datas
def input_from_line(line):
    string = list(line)
    ids = [word2id.get(k, max_id) for k in string]
    tags = []
    return [[string], [ids], [tags]]
def process_data(sentences):
    '''
    Digitize strings and pad them to a uniform length.
    :param sentences: list of sentence strings from an article, e.g. ['招标公告','招标代理']
    :return: digitized char ids padded to a uniform length
    '''
    maxLen = max([len(sentence) for sentence in sentences])
    ids = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]  # renamed from 'tags': these are char ids, not tags
    pad_ids = [id_[:maxLen] + [0] * (maxLen - len(id_)) for id_ in ids]
    return pad_ids
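
# A minimal sketch of process_data: both strings are mapped to char ids and the shorter
# one is right-padded with 0 to the longer one's length.
# _padded = process_data(['招标公告', '招标'])
# print(len(_padded[0]), len(_padded[1]))  # -> 4 4
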
def get_ner(BIE_tag):
    ner = set()
    for it in re.finditer('BI*E', BIE_tag):
        ner.add((it.start(), it.end()))
    return ner
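
# Illustrative check for get_ner: entity spans are the (start, end) offsets of 'BI*E'
# runs in a tag string.
# get_ner('OBIEOBE')  ->  {(1, 4), (5, 7)}
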
def decode(logits, lengths, matrix):
    paths = []
    small = -1000.0
    # width must be num_tags + 1 to match the padded scores below; the original
    # hardcoded 4 here, which only fits the older 4-tag scheme commented out above
    start = np.asarray([[small] * len(tag2index) + [0]])
    for score, length in zip(logits, lengths):
        score = score[:length]
        pad = small * np.ones([length, 1])
        padded_logits = np.concatenate([score, pad], axis=1)  # renamed to avoid shadowing the 'logits' argument
        padded_logits = np.concatenate([start, padded_logits], axis=0)
        path, _ = viterbi_decode(padded_logits, matrix)
        paths.append(path[1:])  # drop the artificial start step
    return paths
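
# Shape sketch for decode (an assumption, not confirmed by this file): logits is a
# [batch, max_len, num_tags] array of per-char tag scores, lengths holds the true
# sentence lengths, and matrix is the [num_tags+1, num_tags+1] transition matrix that
# viterbi_decode expects; the extra row/column belongs to the artificial start tag.
# _n = len(tag2index)
# _paths = decode(np.random.rand(1, 4, _n), [4], np.zeros((_n + 1, _n + 1)))
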
def result_to_json(line, tags):
    result = []
    ner = []
    tags = ''.join([str(it) for it in tags])
    # for it in re.finditer("12*3", tags):
    #     start = it.start()
    #     end = it.end()
    #     ner.append([line[start:end], (start, end)])
    for it in re.finditer("45*6", tags):
        start = it.start()
        end = it.end()
        ner.append([line[start:end], (start, end)])
    result.append([line, ner])
    # print(tags)
    return result
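
# Illustrative check for result_to_json: with tag2index above, '45*6' runs are
# B-rea/I-rea/E-rea spans; the commented-out '12*3' pattern would extract product spans.
# result_to_json('因雨延期', [4, 5, 5, 6])  ->  [['因雨延期', [['因雨延期', (0, 4)]]]]
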
class BatchManager(object):
    def __init__(self, data, batch_size):
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        print('Min sentence length: %d; max sentence length: %d' % (len(sorted_data[0][0]), len(sorted_data[-1][0])))  # temporary logging of sentence lengths
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad_data(sorted_data[i * int(batch_size):(i + 1) * int(batch_size)]))
        return batch_data

    @staticmethod
    def pad_data(data):
        strings = []
        chars = []
        targets = []
        max_length = max([len(sentence[0]) for sentence in data])
        for line in data:
            string, char, target = line
            padding = [0] * (max_length - len(string))
            strings.append(string + padding)
            chars.append(char + padding)
            targets.append(target + padding)
        return [strings, chars, targets]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]
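
# A minimal usage sketch for BatchManager, assuming data items are the
# [chars, ids, tags] triples produced by df2data / get_feature (sample values below
# are illustrative only):
# _data = [[['招', '标'], [3, 4], [0, 0]], [['公', '告', '。'], [5, 6, 7], [0, 0, 0]]]
# _bm = BatchManager(_data, batch_size=2)
# for _strings, _chars, _targets in _bm.iter_batch(shuffle=True):
#     pass  # each batch is padded to its own max length
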
def 获取原始标注数据():
    import psycopg2
    conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.103')
    cursor = conn.cursor()
    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 ;"
    cursor.execute(sql)
    datas = []
    for row in cursor.fetchall():
        docid = row[0]
        text = row[1]
        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
        cursor.execute(sql_lb)
        rows = cursor.fetchall()
        print('len(rows)', len(rows))
        datas.append((docid, text, json.dumps(rows, ensure_ascii=False), len(rows)))
    df = pd.DataFrame(datas, columns=['docid', 'text', 'rows', 'product_num'])
    df.to_excel('data/产品数据自己人标注的原始数据.xlsx')
if __name__ == "__main__":
    # import os
    import pickle
    # with open('data/dev_data2.pkl', 'rb') as f:
    #     dev_data = pickle.load(f)
    # print(len(dev_data))
    # print(os.path.exists('data/testdata.xlsx'))
    # df = pd.read_excel('data/testdata.xlsx')
    # print(len(df))
    # data_test = df2data(df)
    # print(len(data_test), len(data_test[0][0]))
    # 获取原始标注数据()
    df = pd.read_excel('data/产品数据自己人标注的原始数据.xlsx')
    with open('data/dev_data2.pkl', 'rb') as f:
        dev_data = pickle.load(f)
    print(len(set(df['docid'])))
    print('')