#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/12/13 18:28
import tensorflow as tf
import pandas as pd
import random
import json
import os
import re
import collections
from BiddingKG.dl.product.data_util import word2id, max_id

max_len = 500
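# max_len caps the character length of every serialized example;
# longer documents are split/windowed in the instance builders below.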


def create_int_feature(values):
    # wrap an int sequence as a tf.train.Feature for TFRecord serialization
    feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return feature


class TrainingInstance(object):
    # a single training sentence: parallel character (word_list) and tag (tag_list) sequences
    def __init__(self, word_list, tag_list):
        self.word_list = word_list
        self.tag_list = tag_list
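

# Tag scheme: 'S' marks characters outside any labeled span; product mentions
# are tagged B-pro/I-pro/E-pro and bid-failure reason spans B-rea/I-rea/E-rea
# (begin/inside/end), one tag per character.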
def fix_label_ner_句号分开(sentence, product_list, reasons_list):
    tag_list = ['S'] * len(sentence)
    word_list = list(sentence)
    for product in product_list:
        b = sentence.find(product)
        while b != -1:
            e = b + len(product)
            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
                tag_list[b] = 'B-pro'
                tag_list[e - 1] = 'E-pro'
                for i in range(b + 1, e - 1):
                    tag_list[i] = 'I-pro'
            b = sentence.find(product, e)
    for reason in reasons_list:
        if '。' in reason:
            for reason in reason.split('。'):
                # print('split-sentence reason: ', reason)
                b = sentence.find(reason)
                while b != -1:
                    e = b + len(reason)
                    if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
                        tag_list[b] = 'B-rea'
                        tag_list[e - 1] = 'E-rea'
                        for i in range(b + 1, e - 1):
                            tag_list[i] = 'I-rea'
                    b = sentence.find(reason, e)
        else:
            b = sentence.find(reason)
            while b != -1:
                e = b + len(reason)
                if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
                    tag_list[b] = 'B-rea'
                    tag_list[e - 1] = 'E-rea'
                    for i in range(b + 1, e - 1):
                        tag_list[i] = 'I-rea'
                b = sentence.find(reason, e)
    return tag_list, word_list
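

# Variant instance builder that first forces a split on '。' (the "句号分开"
# suffix means "split on full stops"); kept alongside the main
# create_instances_from_document below.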
def create_instances_from_document_句号分开(docid, document_text, product_list, reasons_list):
    # promote ',一、' ... ',十五、' section markers to sentence breaks
    for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
        document_text = document_text.replace(',%s、'%it, '。%s、'%it)

    if docid in ['docid']:  # no-op placeholder for inspecting specific docids
        pass
    # match longer names first so they are not shadowed by their substrings
    product_list = sorted(product_list, key=lambda x: len(x), reverse=True)
    reasons_list = sorted(reasons_list, key=lambda x: len(x), reverse=True)
    kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?原因', document_text)
    if reasons_list == [] and kw_re:
        # no labeled reason but a reason heading exists: truncate before it
        kw = kw_re.group(0)
        idx = document_text.find(kw)
        if idx != -1:
            document_text = document_text[:idx]
    # instances = []
    pos = []
    neg = []
    for sentence in document_text.split('。'):
        if len(sentence) < 2:
            # print("sentence shorter than 2 characters")
            # print(sentence)
            continue
        if len(sentence) >= 2*max_len:  # over-long sentences: re-split on commas and window to max_len
            sentences = sentence.split(',')
            i = 0
            sentence = ""
            while i < len(sentences):
                if len(sentences[i]) > max_len:
                    sentence = sentences[i][:max_len]
                    print('len(sentences[i]) > max_len:', len(sentence))
                    tag_list, word_list = fix_label_ner(sentence, product_list, reasons_list)
                    instance = TrainingInstance(word_list, tag_list)
                    if 'B-pro' in tag_list or 'B-rea' in tag_list:
                        pos.append(instance)
                    else:
                        neg.append(instance)
                    sentence = ""
                elif len(sentence) < max_len*0.8:
                    sentence += sentences[i] + ','
                else:
                    sentence = sentence[:max_len]
                    print('else: ', len(sentence))
                    tag_list, word_list = fix_label_ner(sentence, product_list, reasons_list)
                    instance = TrainingInstance(word_list, tag_list)
                    if 'B-pro' in tag_list or 'B-rea' in tag_list:
                        pos.append(instance)
                    else:
                        neg.append(instance)
                    sentence = sentences[i] + ','
                i += 1
            if len(sentence) >= 10:  # flush the remaining buffer
                sentence = sentence[:max_len]
                tag_list, word_list = fix_label_ner(sentence, product_list, reasons_list)
                instance = TrainingInstance(word_list, tag_list)
                if 'B-pro' in tag_list or 'B-rea' in tag_list:
                    pos.append(instance)
                else:
                    neg.append(instance)
                print('len(sentence)>=10: ', len(sentence))
        else:
            sentence = sentence[:max_len]
            tag_list, word_list = fix_label_ner(sentence, product_list, reasons_list)
            instance = TrainingInstance(word_list, tag_list)
            if 'B-pro' in tag_list or 'B-rea' in tag_list:
                pos.append(instance)
            else:
                neg.append(instance)
    random.shuffle(neg)
    neg = neg[:min(5, 10*len(pos))]  # keep at most 5 negatives, none when there are no positives
    instances = pos + neg
    random.shuffle(instances)
    return instances


def fix_label_ner(sentence, product_list, reasons_list):
    # character-level tagging of product and reason spans; a span is labeled
    # only if its first and last characters are still untagged ('S')
    tag_list = ['S'] * len(sentence)
    word_list = list(sentence)
    for product in product_list:
        b = sentence.find(product)
        while b != -1:
            e = b + len(product)
            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
                tag_list[b] = 'B-pro'
                tag_list[e - 1] = 'E-pro'
                for i in range(b + 1, e - 1):
                    tag_list[i] = 'I-pro'
            b = sentence.find(product, e)
    for reason in reasons_list:
        b = sentence.find(reason)
        while b != -1:
            e = b + len(reason)
            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
                tag_list[b] = 'B-rea'
                tag_list[e - 1] = 'E-rea'
                for i in range(b + 1, e - 1):
                    tag_list[i] = 'I-rea'
            b = sentence.find(reason, e)
    return tag_list, word_list
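
# Worked example (illustrative input, not from the training data):
#   fix_label_ner('采购电脑一批', ['电脑'], [])
#   -> (['S', 'S', 'B-pro', 'E-pro', 'S', 'S'], ['采', '购', '电', '脑', '一', '批'])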
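

# Instance-building strategy: documents up to max_len become one example;
# longer documents with a labeled reason are windowed around the first reason
# occurrence; everything else is split on '。' (after promoting ',一、'-style
# section markers to sentence breaks) and windowed to max_len.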
def create_instances_from_document(docid, document_text, product_list, reasons_list):
    # match longer names first so they are not shadowed by their substrings
    product_list = sorted(product_list, key=lambda x: len(x), reverse=True)
    reasons_list = sorted(reasons_list, key=lambda x: len(x), reverse=True)
    kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?原因', document_text)
    if reasons_list == [] and kw_re:
        # no labeled reason but a reason heading exists: truncate before it
        kw = kw_re.group(0)
        idx = document_text.find(kw)
        if idx != -1:
            document_text = document_text[:idx]
    # instances = []
    pos = []
    neg = []
    if len(document_text) <= max_len:
        document_text = document_text[:max_len]
        tag_list, word_list = fix_label_ner(document_text, product_list, reasons_list)
        if len(reasons_list) > 0 and 'B-rea' not in tag_list:
            print("labeled failure reason not found in article under %d chars: %s" % (max_len, docid))
        instance = TrainingInstance(word_list, tag_list)
        if 'B-pro' in tag_list or 'E-rea' in tag_list:
            pos.append(instance)
        else:
            neg.append(instance)
    elif len(reasons_list) > 0:
        # window the document around the first labeled reason
        b = document_text.find(reasons_list[0])
        if b != -1:
            document_text = document_text[max(0, b-8):][:max_len]
        else:
            document_text = document_text[:max_len]
            print("labeled failure reason not found in article over %d chars: %s," % (max_len, docid))
        tag_list, word_list = fix_label_ner(document_text, product_list, reasons_list)
        if 'E-rea' not in tag_list:
            print("failure reason label not found in article: %s, start position: %d" % (docid, b))
        instance = TrainingInstance(word_list, tag_list)
        if 'B-pro' in tag_list or 'B-rea' in tag_list:
            pos.append(instance)
        else:
            neg.append(instance)
    else:
        # promote ',一、' ... ',十五、' section markers to sentence breaks
        for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
            document_text = document_text.replace(',%s、' % it, '。%s、' % it)
        for sentence in document_text.split('。'):
            if len(sentence) < 2:
                # print("sentence shorter than 2 characters")
                # print(sentence)
                continue
            if len(sentence) >= 2*max_len:  # over-long sentences: re-split on commas and window to max_len
                sentences = sentence.split(',')
                i = 0
                sentence = ""
                while i < len(sentences):
                    if len(sentences[i]) > max_len:
                        sentence = sentences[i][:max_len]
                        # print('len(sentences[i]) > max_len:', len(sentence))
                        tag_list, word_list = fix_label_ner(sentence, product_list, reasons_list)
                        instance = TrainingInstance(word_list, tag_list)
                        if 'B-pro' in tag_list or 'B-rea' in tag_list:
                            pos.append(instance)
                        else:
                            neg.append(instance)
                        sentence = ""
                    elif len(sentence) < max_len*0.8:
                        sentence += sentences[i] + ','
                    else:
                        sentence = sentence[:max_len]
                        # print('else: ', len(sentence))
                        tag_list, word_list = fix_label_ner(sentence, product_list, reasons_list)
                        instance = TrainingInstance(word_list, tag_list)
                        if 'B-pro' in tag_list or 'B-rea' in tag_list:
                            pos.append(instance)
                        else:
                            neg.append(instance)
                        sentence = sentences[i] + ','
                    i += 1
                if len(sentence) >= 10:  # flush the remaining buffer
                    sentence = sentence[:max_len]
                    tag_list, word_list = fix_label_ner(sentence, product_list, reasons_list)
                    instance = TrainingInstance(word_list, tag_list)
                    if 'B-pro' in tag_list or 'B-rea' in tag_list:
                        pos.append(instance)
                    else:
                        neg.append(instance)
                    # print('len(sentence)>=10: ', len(sentence))
            else:
                sentence = sentence[:max_len]
                tag_list, word_list = fix_label_ner(sentence, product_list, reasons_list)
                instance = TrainingInstance(word_list, tag_list)
                if 'B-pro' in tag_list or 'B-rea' in tag_list:
                    pos.append(instance)
                else:
                    neg.append(instance)
    random.shuffle(neg)
    neg = neg[:min(5, 10*len(pos))]  # keep at most 5 negatives, none when there are no positives
    instances = pos + neg
    random.shuffle(instances)
    return instances


def create_training_instances(df):
    instances = []
    # df = pd.read_excel(xlsx)
    df.fillna('', inplace=True)
    for i in df.index:
        try:
            docid = df.loc[i, 'docid']
            document_text = df.loc[i, 'text']
            product_list = json.loads(df.loc[i, 'lbset'])
            reasons_list = json.loads(df.loc[i, 'reasons_list'])
            # if reasons_list == []:
            #     continue
            instances.extend(
                create_instances_from_document(
                    docid, document_text, product_list, reasons_list
                ))
        except Exception as e:
            print('JSON parse error', i, df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
    return instances


def write_instance_to_example_files(instances, word2index, tag2index, output_dir):
    # writers = []
    # instances = sorted(instances, key=lambda x: len(x.word_list))
    i = 0
    # for max_len in [200, 500, 1000]:
    writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode' % max_len)
    # print('before sorting:', [len(x.word_list) for x in instances[:5]])
    # instances.sort(key=lambda x: len(x.word_list), reverse=True)
    # print('after sorting:', [len(x.word_list) for x in instances[:5]])
    while i < len(instances):
        instance = instances[i]
        if len(instance.word_list) > max_len:
            writer.close()
            break
        i += 1
        # word_ids = [word2index.get(word, max_id) for word in instance.word_list]
        word_ids = [word2index.get(word, word2index.get('<unk>')) for word in instance.word_list]
        tag_ids = [tag2index.get(tag, 0) for tag in instance.tag_list]
        # pad both sequences to max_len with id 0
        while len(word_ids) < max_len:
            word_ids.append(0)
            tag_ids.append(0)
        features = collections.OrderedDict()
        features["word_ids"] = create_int_feature(word_ids)
        features["tag_ids"] = create_int_feature(tag_ids)
        features['text_len'] = create_int_feature([len(instance.word_list)])
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
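

# A minimal read-back sketch (not part of the original pipeline; assumes the
# same TensorFlow 1.x API as the tf.python_io writer above). The feature keys
# and fixed lengths mirror what write_instance_to_example_files serializes;
# the name read_examples is illustrative.
def read_examples(tfrecord_path):
    features = {
        "word_ids": tf.FixedLenFeature([max_len], tf.int64),
        "tag_ids": tf.FixedLenFeature([max_len], tf.int64),
        "text_len": tf.FixedLenFeature([1], tf.int64),
    }
    dataset = tf.data.TFRecordDataset(tfrecord_path)
    # parse each serialized tf.train.Example back into fixed-length int64 tensors
    return dataset.map(lambda record: tf.parse_single_example(record, features))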


if __name__ == "__main__":
    df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
    df['pos'] = df.apply(lambda x: 1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list'] == '[]' else 0, axis=1)
    df = df[df.loc[:, 'pos'] == 0]  # drop articles that look like failure notices but carry no labeled reason
    df.reset_index(drop=True, inplace=True)
    print('total articles:', len(df))
    df.fillna('', inplace=True)
    print('loading finished')
    df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
    # mark articles containing a product seen in more than two articles; the dev set is sampled from these
    lbset = [it for l in df['lbs'] for it in l]
    c = collections.Counter(lbset)
    m = c.most_common()
    m3 = [it[0] for it in m if it[1] > 2]
    df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
    df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
    print('len_df_dev:', len(df_dev))
    df_reason = df[df.loc[:, 'reasons_list'] != '[]'].sample(frac=0.1, random_state=8)
    print('len(df_reason)', len(df_reason))
    df_dev = df_dev.append(df_reason)  # append returns a new frame; the result must be kept
    df_dev.drop_duplicates(subset=['docid'], inplace=True)
    print('len_df_dev:', len(df_dev))
    df_train = df[~df.index.isin(df_dev.index)]
    print(len(df), len(df_dev), len(df_train))
    df_train = df_train.sample(frac=1)
    df_dev = df_dev.sample(frac=1)

    # file = 'data/traindata.xlsx'
    instances = create_training_instances(df_train)
    # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
    tag2index = {'S': 0, 'B-pro': 1, 'I-pro': 2, 'E-pro': 3, 'B-rea': 4, 'I-rea': 5, 'E-rea': 6}
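    # tag id 0 ('S') doubles as the padding tag; word padding also uses id 0,
    # which is assumed to be the padding/blank slot in the imported word2id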
    output_dir = 'data/train_data'
    write_instance_to_example_files(instances, word2id, tag2index, output_dir)

    instances = create_training_instances(df_dev)
    # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
    tag2index = {'S': 0, 'B-pro': 1, 'I-pro': 2, 'E-pro': 3, 'B-rea': 4, 'I-rea': 5, 'E-rea': 6}
    output_dir = 'data/test_data'
    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
    print('all records written successfully!')