@@ -10,6 +10,21 @@ import os
import re
import collections
from BiddingKG.dl.product.data_util import word2id, max_id
+import psycopg2
+import json
+import pickle
+
+conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='bid_validate')
+cursor = conn.cursor()
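+# Module-level connection and cursor shared by get_title(); closed at the end of the script.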
+def get_title(docid):
+    # Parameterized query; formatting docid into the SQL string would be open to injection.
+    cursor.execute("select doctitle from qiao_ke_bao_raw where docid=%s", (docid,))
+    row = cursor.fetchone()
+    return row[0] if row else ''
+
+product_notin = []
+
max_len = 500

def create_int_feature(values):

@@ -61,8 +76,11 @@ def fix_label_ner_句号分开(sentence, product_list, reasons_list):


def create_instances_from_document_句号分开(docid, document_text, product_list, reasons_list):
-    for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
-        document_text = document_text.replace(',%s、'%it, '。%s、'%it)
+    # for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
+    #     document_text = document_text.replace(',%s、'%it, '。%s、'%it)
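+    # Heading-splitting heuristic: a numbered heading such as ',三、' becomes
+    # '。三、', so each numbered section ends the sentence before it.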
+    for it in re.finditer(r'[^\w\d。][一二三四五六七八九十]{1,3}、', document_text):
+        t = it.group(0)
+        document_text = document_text.replace(t, '。' + t[1:])

    if docid in ['docid']:
        pass

@@ -137,6 +155,10 @@ def fix_label_ner(sentence, product_list, reasons_list):
    tag_list = ['S'] * len(sentence)
    word_list = list(sentence)
    for product in product_list:
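+        # Skip labels that contain no word characters; an empty pattern
+        # cannot be located in the sentence and would corrupt the tag spans.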
+        if len(re.sub(r'[^\w]', '', product)) < 1:
+            print('错误产品: ', product)
+            continue
+
        b = sentence.find(product)
        while b != -1:
            e = b + len(product)
@@ -158,10 +180,97 @@ def fix_label_ner(sentence, product_list, reasons_list):
            b = sentence.find(reason, e)
    return tag_list, word_list

+def fix_label_ner_remove_punctuation(sentence, product_list, reasons_list):
+    tag_list = ['S'] * len(sentence)
+    word_list = list(sentence)
+    if len(product_list) > 0:
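+        # product_list arrives pre-sorted longest-first and stripped to word
+        # characters by the caller, so the joined alternation is free of regex
+        # metacharacters and prefers the longest label at each position.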
+        for it in re.finditer('|'.join(product_list), sentence):
+            b, e = it.span()
+            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
+                tag_list[b] = 'B-pro'
+                tag_list[e - 1] = 'E-pro'
+                for i in range(b + 1, e - 1):
+                    tag_list[i] = 'I-pro'
+
+    for reason in reasons_list:
+        b = sentence.find(reason)
+        while b != -1:
+            e = b + len(reason)
+            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
+                tag_list[b] = 'B-rea'
+                tag_list[e - 1] = 'E-rea'
+                for i in range(b + 1, e - 1):
+                    tag_list[i] = 'I-rea'
+            b = sentence.find(reason, e)
+    return tag_list, word_list
+
+def create_instances_from_document_remove_punctuation(docid, document_text, product_list, reasons_list):
+    product_list = set([re.sub(r'[^\w]', '', it) for it in product_list if len(re.sub(r'[^\w]', '', it)) > 1])  # strip punctuation from product labels
+    reasons_list = set([re.sub(r'[^\w]', '', it) for it in reasons_list if len(re.sub(r'[^\w]', '', it)) > 1])
+    document_text = re.sub(r'[^\w]', '', document_text)
+
+    product_list = sorted(product_list, key=lambda x: len(x), reverse=True)
+    reasons_list = sorted(reasons_list, key=lambda x: len(x), reverse=True)
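+    # If no failure reason was labelled but the text still mentions one, cut
+    # the keyword and the 30 characters after it so unlabelled reason text is
+    # not left inside the training window.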
+    kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?(原因|理由)', document_text)
+    if reasons_list == [] and kw_re:
+        document_text = re.sub('(流标|废标|终止|中止|失败|异常)的?(原因|理由).{,30}', '', document_text)
+
+    pos = []
+    neg = []
+    if len(document_text) <= max_len:
+        document_text = document_text[:max_len]
+        tag_list, word_list = fix_label_ner_remove_punctuation(document_text, product_list, reasons_list)
+        if len(reasons_list) > 0 and 'B-rea' not in tag_list:
+            print("少于%d字的文章废标原因标注未找到:%s" % (max_len, docid))
+        instance = TrainingInstance(word_list, tag_list)
+        if 'B-pro' in tag_list or 'E-rea' in tag_list:
+            pos.append(instance)
+        else:
+            neg.append(instance)
+    elif len(reasons_list) > 0:
+        b = document_text.find(reasons_list[0])
+        if b != -1:
+            document_text = document_text[max(0, b - 8):][:max_len]
+        else:
+            document_text = document_text[:max_len]
+            print("多于%d字的文章废标原因标注未找到:%s," % (max_len, docid))
+        tag_list, word_list = fix_label_ner_remove_punctuation(document_text, product_list, reasons_list)
+        if 'E-rea' not in tag_list:
+            print("文章废标原因标注未找到:%s, 开始位置:%d" % (docid, b))
+        instance = TrainingInstance(word_list, tag_list)
+        if 'B-pro' in tag_list or 'B-rea' in tag_list:
+            pos.append(instance)
+        else:
+            neg.append(instance)
+    else:
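+        # Long documents without labelled reasons are sliced into max_len
+        # windows; a remainder longer than 50 characters gets one extra window.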
+        epoch = len(document_text) // max_len
+        if len(document_text) % max_len > 50:
+            epoch += 1
+        for i in range(epoch):
+            sentence = document_text[i * max_len: (i + 1) * max_len]
+            if len(sentence) < 5:
+                # print("句子长度小于5")
+                # print(sentence)
+                continue
+            sentence = sentence[:max_len]
+            tag_list, word_list = fix_label_ner_remove_punctuation(sentence, product_list, reasons_list)
+            instance = TrainingInstance(word_list, tag_list)
+            if 'B-pro' in tag_list or 'B-rea' in tag_list:
+                pos.append(instance)
+            else:
+                neg.append(instance)
+    random.shuffle(neg)
+    # neg = neg[:min(5, 10*len(pos))]
+    neg = neg[:min(5, 2*len(pos))]
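+    # Keep at most five negative windows, and never more than twice the positives.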
+    instances = pos + neg
+    random.shuffle(instances)
+    return instances
+
def create_instances_from_document(docid, document_text, product_list, reasons_list):
    product_list = sorted(product_list, key=lambda x:len(x), reverse=True)
    reasons_list = sorted(reasons_list, key=lambda x:len(x), reverse=True)
    kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?原因', document_text)
+
    if reasons_list == [] and kw_re:
        kw = kw_re.group(0)
        idx = document_text.find(kw)
@@ -196,10 +305,13 @@ def create_instances_from_document(docid, document_text, product_list, reasons_l
        else:
            neg.append(instance)
    else:
-        for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
-            document_text = document_text.replace(',%s、' % it, '。%s、' % it)
+        # for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
+        #     document_text = document_text.replace(',%s、' % it, '。%s、' % it)
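+        # Same heading-splitting heuristic as in the 句号分开 variant; here a
+        # match already preceded by '。' is replaced with itself, which is harmless.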
+        for it in re.finditer(r'[^\w\d][一二三四五六七八九十]{1,3}、', document_text):
+            t = it.group(0)
+            document_text = document_text.replace(t, '。' + t[1:])
        for sentence in document_text.split('。'):
-            if len(sentence)<2:
+            if len(sentence)<5:
                # print("句子长度小于5")
                # print(sentence)
                continue
@@ -249,7 +361,8 @@ def create_instances_from_document(docid, document_text, product_list, reasons_l
            else:
                neg.append(instance)
    random.shuffle(neg)
-    neg = neg[:min(5, 10*len(pos))]
+    # neg = neg[:min(5, 10*len(pos))]
+    neg = neg[:min(5, 2*len(pos))]
    instances = pos+neg
    random.shuffle(instances)
    return instances
@@ -259,37 +372,92 @@ def create_training_instances(df):
    # df = pd.read_excel(xlsx)
    df.fillna('', inplace=True)
    for i in df.index:
-        try:
-            docid = df.loc[i, 'docid']
-            document_text = df.loc[i, 'text']
-            product_list = json.loads(df.loc[i, 'lbset'])
-            reasons_list = json.loads(df.loc[i, 'reasons_list'])
-            # if reasons_list == []:
-            #     continue
-            instances.extend(
-                create_instances_from_document(
-                    docid, document_text, product_list, reasons_list
-                ))
-        except Exception as e:
-            print('json出错',i, df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
+        if i % 5000 == 0:
+            print('create_instance', i)
+        # try:
+        docid = df.loc[i, 'docid']
+        document_text = df.loc[i, 'text']
+        product_list = json.loads(df.loc[i, 'lbset'])
+        reasons_list = json.loads(df.loc[i, 'reasons_list'])
+
+        notin_num = 0
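+        # Fuzzy re-matching for labels absent from the text: first allow up to
+        # two punctuation characters between each character of the label (e.g.
+        # '办公电脑' also matches '办公、电脑'), then try '项目'/'采购项目'
+        # substitutions and case folding; as a last resort, prepend the
+        # doctitle fetched from the database and repeat the same attempts.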
+        for j in range(len(product_list)):  # if a label is not found in the text, retry allowing punctuation between its characters
+            p = product_list[j]
+            if re.search(r'[^\w]', p) == None and re.search(p, document_text) == None:
+                ser = re.search(r'[^\w]{,2}'.join(p), document_text)
+                if ser:
+                    product_list[j] = ser.group(0)
+                elif '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
+                    product_list[j] = p.replace('项目', '采购项目')
+                elif '项目' in p and re.search(p.replace('项目', ''), document_text):
+                    product_list[j] = p.replace('项目', '')
+                elif re.search('[a-zA-Z]', p) and re.search(p.lower(), document_text):
+                    product_list[j] = p.lower()
+                elif re.search('[a-zA-Z]', p) and re.search(p.upper(), document_text.upper()):
+                    product_list[j] = p.upper()
+                    document_text = document_text.upper()
+                else:
+                    title = get_title(docid)
+                    if title not in document_text:
+                        document_text = title + "。" + document_text
+                    ser = re.search(r'[^\w]{,2}'.join(p), document_text)
+                    if ser:
+                        product_list[j] = ser.group(0)
+                    elif '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
+                        product_list[j] = p.replace('项目', '采购项目')
+                    elif '项目' in p and re.search(p.replace('项目', ''), document_text):
+                        product_list[j] = p.replace('项目', '')
+                    elif re.search('[a-zA-Z]', p) and re.search(p.lower(), document_text):
+                        product_list[j] = p.lower()
+                    elif re.search('[a-zA-Z]', p) and re.search(p.upper(), document_text.upper()):
+                        product_list[j] = p.upper()
+                        document_text = document_text.upper()
+                    else:
+                        # print('docid:%s,not in text product: %s' % (docid, p))
+                        notin_num += 1
+                        if re.search('业绩', document_text) == None:
+                            product_notin.append((docid, p))
+            elif p not in document_text:
+                # Labels containing punctuation cannot use the character-join
+                # regex above; count them missing only when a plain substring
+                # check also fails.
+                # print('docid:%s,not in text product: %s'%(docid, p))
+                notin_num += 1
+                if re.search('业绩', document_text) == None:
+                    product_notin.append((docid, p))
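+        # Drop the document when more than half of its labels are still missing.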
+        if notin_num > len(product_list) / 2:
+            print('找到的产品少于一半,过滤掉', docid, product_list)
+            continue
+
+        # if reasons_list == []:
+        #     continue
+        instances.extend(
+            create_instances_from_document(
+                docid, document_text, product_list, reasons_list
+            ))
+        # instances.extend(
+        #     create_instances_from_document_remove_punctuation(
+        #         docid, document_text, product_list, reasons_list
+        #     ))
+        # except Exception as e:
+        #     print('json出错', i, df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
    return instances

-def write_instance_to_example_files(instances, word2index, tag2index, output_dir):
+def write_instance_to_example_files(instances, word2index, tag2index, output_dir, tfrecode_name):
    # writers = []
    # instances = sorted(instances, key=lambda x: len(x.word_list))
    i = 0
    # for max_len in [200, 500, 1000]:
-    writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode'%max_len)
+    # writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode'%max_len)
+    writer = tf.python_io.TFRecordWriter(output_dir + '/%s' % tfrecode_name)
    # print('排序前:', [len(x.word_list) for x in instances[:5]])
    # instances.sort(key=lambda x:len(x.word_list), reverse=True)
    # print('排序后:', [len(x.word_list) for x in instances[:5]])
    while i < len(instances):
+        if i % 5000 == 0:
+            print('开始写入', i)
        instance = instances[i]
        if len(instance.word_list)>max_len:
            writer.close()
            break
        i += 1
-        # word_ids = [word2index.get(word, max_id) for word in instance.word_list]
        word_ids = [word2index.get(word, word2index.get('<unk>')) for word in instance.word_list]
        tag_ids = [tag2index.get(tag, 0) for tag in instance.tag_list]
        while len(word_ids)<max_len:
@@ -303,42 +471,117 @@ def write_instance_to_example_files(instances, word2index, tag2index, output_dir
        writer.write(tf_example.SerializeToString())
    writer.close()

+def 去除标注不在公告里面的公告(df):
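+    # "Drop announcements whose labels are not in the announcement": keep only
+    # rows where every label, once punctuation is stripped, occurs in the
+    # equally stripped text.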
+    df['notin'] = df.apply(
+        lambda x: json.dumps([it for it in json.loads(x['lbset']) if re.sub(r'[^\w]', '', it) not in re.sub(r'[^\w]', '', x['text'])],
+                             ensure_ascii=False), axis=1)
+    df = df[df['notin'] == '[]']
+    return df
+
+
if __name__ == "__main__":
-    df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
-    df['pos'] = df.apply(lambda x:1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list']=='[]' else 0, axis=1)
-    df = df[df.loc[:, 'pos']==0]  # 过滤掉未标注废标原因文章
-    df.reset_index(drop=True, inplace=True)
-    print('总文章数:',len(df))
-    df.fillna('', inplace=True)
-    print('读取完毕')
-    df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
-    lbset = [it for l in df['lbs'] for it in l]
-    c = collections.Counter(lbset)
-    m = c.most_common()
-    m3 = [it[0] for it in m if it[1] > 2]
-    df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
-    df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
-    print('len_df_dev:', len(df_dev))
-    df_reason = df[df.loc[:, 'reasons_list'] != '[]'].sample(frac=0.1, random_state=8)
-    print('len(df_reason)', len(df_reason))
-    df_dev.append(df_reason)
-    df_dev.drop_duplicates(subset=['docid'], inplace=True)
-    print('len_df_dev:', len(df_dev))
-    df_train = df[~df.index.isin(df_dev.index)]
-    print(len(df), len(df_dev), len(df_train))
-    df_train = df_train.sample(frac=1)
-    df_dev = df_dev.sample(frac=1)
-
-    # file = 'data/traindata.xlsx'
+    # df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+
+    # df = pd.read_excel('E:/产品及失败原因标注数据/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+    # # tfrecode_name = '20211125_ProductAndReason.tfrecode'
+    # df = df[['docid', 'text', 'lbset', 'reasons_list']]
+    #
+    # df1 = pd.read_excel('E:/产品及失败原因标注数据/桥客宝产品数据1.xlsx')
+    #
+    # # tfrecode_name = 'qiaokebao1_product.tfrecode'
+    # df2 = pd.read_csv('E:/产品及失败原因标注数据/桥客宝产品数据2.csv')
+    #
+    # # tfrecode_name = 'qiaokebao2_product.tfrecode'
+    # df3 = pd.read_csv('E:/产品及失败原因标注数据/桥客宝产品数据3.csv')
+    # df = df.append([df1, df2, df3], ignore_index=True)
+    #
+    # tfrecode_name = 'all_product.tfrecode'
+    #
+    # df = df[['docid', 'text', 'lbset', 'reasons_list']]
+    # df.fillna('', inplace=True)
+    # df['pos'] = df.apply(lambda x:1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list']=='[]' else 0, axis=1)
+    # df = df[df.loc[:, 'pos']==0]  # drop articles whose failure reason was never labelled
+    # df.reset_index(drop=True, inplace=True)
+    # print('总文章数:', len(df))
+    # df.fillna('', inplace=True)
+    # print('读取完毕')
+    # df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
+    # lbset = [it for l in df['lbs'] for it in l]
+    # c = collections.Counter(lbset)
+    # m = c.most_common()
+    # m3 = [it[0] for it in m if it[1] > 2]
+    # print('m3: ', m3[:20])
+    # df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
+    # print('sum(pos): ', sum(df['pos']))
+    # df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
+    # print('len_df_dev:', len(df_dev))
+    #
+    # df_reason = df[df.loc[:, 'reasons_list'] != '[]']
+    # if len(df_reason)>10:
+    #     df_reason = df_reason.sample(frac=0.1, random_state=8)
+    # print('len(df_reason)', len(df_reason))
+    # df_dev.append(df_reason)
+    # df_dev.drop_duplicates(subset=['docid'], inplace=True)
+    # print('len_df_dev:', len(df_dev))
+    #
+    # df_train = df[~df.index.isin(df_dev.index)]
+    # print(len(df), len(df_dev), len(df_train))
+    # df_train = df_train.sample(frac=1)
+    # df_dev = df_dev.sample(frac=1)
+
+
+    df_train = pd.read_csv('E:/产品及失败原因标注数据/df_train.csv')
+    print('读取完毕', len(df_train))
+    sp = len(df_train) // 2
+    df_train = df_train[:sp]
+    tfrecode_name = 'ProductAndReason_2023-02-24_train1.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_train1.tfrecode'
    instances = create_training_instances(df_train)
+    del df_train
    # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
    tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
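+    # Tag scheme: 'S' marks non-entity characters; B/I/E mark the beginning,
+    # inside and end of product (pro) and failure-reason (rea) spans.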
    output_dir = 'data/train_data'
-    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
+    print('准备写入')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('完成1')
+    with open('E:/产品及失败原因标注数据/product_notin1.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+    df_train = pd.read_csv('E:/产品及失败原因标注数据/df_train.csv')
+    print('读取完毕', len(df_train))
+    sp = len(df_train) // 2
+    df_train = df_train[sp:]
+    tfrecode_name = 'ProductAndReason_2023-02-24_train2.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_train2.tfrecode'  # strip punctuation from text and products
+    instances = create_training_instances(df_train)
+    del df_train
+    # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
+    tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
+    output_dir = 'data/train_data'
+    print('准备写入')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('完成2')
+    with open('E:/产品及失败原因标注数据/product_notin2.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+    df_dev = pd.read_csv('E:/产品及失败原因标注数据/df_dev.csv')
+
+    print('去除前', len(df_dev))
+    # df_dev = 去除标注不在公告里面的公告(df_dev)
+    # print('去除后', len(df_dev))
+    #
+    tfrecode_name = 'ProductAndReason_2023-02-24_dev.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_dev.tfrecode'
    instances = create_training_instances(df_dev)
+    del df_dev
    # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
    tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
    output_dir = 'data/test_data'
-    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
-    print('全部写入成功!')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('全部写入成功!')
+    with open('E:/产品及失败原因标注数据/product_notin3.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+    # Close the shared DB resources inside the main block, so merely importing
+    # this module does not sever the connection that get_title() relies on.
+    cursor.close()
+    conn.close()