"""Classify attachment text into document categories.

The script loads a word2vec embedding file and a TensorFlow SavedModel that
sit next to this file, converts jieba-tokenized text into a fixed-size
embedding matrix and runs the model on it.
"""
import os
import re
import sys

import gensim
import jieba
import numpy as np
import tensorflow as tf

# Number of tokens fed to the model per document (longer input is truncated,
# shorter input is zero-padded).
maxlen = 512
# Width of one embedding vector; rows of the input matrix have this size.
words_size = 128

w2v_filepath = os.path.dirname(os.path.abspath(__file__)) + "/wiki_128_word_embedding_new.vector"
# NOTE: loaded eagerly at import time; importing this module requires the file.
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_filepath, binary=True)

# Feed values of these types are passed to every batch unchanged by limit_run.
# np.integer / np.floating cover all NumPy scalar widths and, unlike the
# removed ``np.float_`` alias, exist in both NumPy 1.x and 2.x.
_SCALAR_TYPES = (float, int, np.integer, np.floating)


def get_words_matrix(words):
    """Return the embedding vector for ``words``.

    Falls back to the vector stored under the key ``'unk'`` when ``words`` is
    not in the embedding vocabulary.
    """
    if words in model_w2v.key_to_index:
        return model_w2v[words]
    return model_w2v['unk']


class ModelRelationExtraction:
    """Wrapper around the SavedModel in ``models/model_attachment_classify``."""

    def __init__(self):
        """Create a private graph/session and load the SavedModel into it."""
        self.model_file = os.path.dirname(os.path.abspath(__file__)) + "/models/model_attachment_classify"
        self.sess = tf.compat.v1.Session(graph=tf.Graph())
        # Maps the predicted class index to its (Chinese) label.
        self.classes_dict = {
            0: '其他',  # other
            1: '招标文件',  # tender document
            2: '限价(控制价)',  # price ceiling (control price)
            3: '工程量清单',  # bill of quantities
            4: '采购清单',  # procurement list
            5: '评标办法',  # bid evaluation method
        }
        self.get_model()

    def get_model(self):
        """Load the SavedModel and cache its input/output tensors.

        Returns:
            ``[input_tensor, output_tensor]``, also stored as ``self.model``.
        """
        with self.sess.as_default() as sess:
            with sess.graph.as_default():
                meta_graph_def = tf.compat.v1.saved_model.loader.load(
                    sess, tags=["serve"], export_dir=self.model_file
                )
                signature_key = tf.compat.v1.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                signature = meta_graph_def.signature_def[signature_key]
                input0 = sess.graph.get_tensor_by_name(signature.inputs["input0"].name)
                print(input0.shape)
                output = sess.graph.get_tensor_by_name(signature.outputs["outputs"].name)
                self.model = [input0, output]
                return self.model

    def text_process(self, attachmentcon):
        """Normalize ``attachmentcon`` and split it into jieba tokens.

        Newline runs become commas, whitespace and question marks are removed,
        runs of two or more ``.``/``·``/``_`` become a comma, remaining
        underscores are dropped, and the text is cut to 2500 characters before
        tokenization.
        """
        text = attachmentcon
        text = re.sub(r"\n+", ',', text)
        # The previous pattern "\s+|?+" was not a valid regex (a bare "?" has
        # nothing to repeat) and raised re.error on every call; the "?" is now
        # escaped so literal question marks are stripped.
        # NOTE(review): if "?" stood for a mis-encoded character in the
        # original source, adjust this pattern accordingly.
        text = re.sub(r"\s+|\?+", '', text)
        text = re.sub(r"[\.·_]{2,}", ',', text)
        text = re.sub(r"_", '', text)
        text = text[:2500]
        return list(jieba.cut(text))

    def evaluate(self, attachmentcon):
        """Predict the class of ``attachmentcon``.

        Args:
            attachmentcon: Attachment content; converted with ``str()``.

        Returns:
            ``(pred_label, cn_label)`` where ``pred_label`` is the argmax index
            of the model output and ``cn_label`` is its entry in
            ``self.classes_dict``.
        """
        tokens = self.text_process(str(attachmentcon))[:maxlen]
        # Rows beyond len(tokens) stay zero and act as padding.
        words_matrix = np.zeros((maxlen, words_size))
        for row, token in enumerate(tokens):
            words_matrix[row] = get_words_matrix(token)
        batch = np.array([words_matrix])  # add the leading batch dimension
        pred = limit_run(self.sess, [self.model[1]], feed_dict={self.model[0]: batch})[0]
        pred_label = np.argmax(pred[0])
        cn_label = self.classes_dict[pred_label]
        return pred_label, cn_label


def limit_run(sess, list_output, feed_dict, max_batch=1024):
    """Run ``list_output`` in ``sess``, splitting large feeds into batches.

    Args:
        sess: Object exposing ``run(fetches, feed_dict=...)``.
        list_output: Fetches passed to ``sess.run``.
        feed_dict: Mapping of feed keys to values. Scalar values (Python or
            NumPy numbers) are passed to every batch unchanged; all other
            values are sliced along their first axis.
        max_batch: Maximum number of samples per ``sess.run`` call.

    Returns:
        The result of a single ``sess.run`` when the sample count does not
        exceed ``max_batch``; otherwise a list with one Python list per fetch,
        holding the per-sample outputs of all batches in order.

    Raises:
        ValueError: If batching is required and ``max_batch`` is less than 1.
    """
    # The sample count comes from the first non-scalar feed value, so a scalar
    # that happens to be first in the dict no longer breaks ``len()``.
    len_sample = 0
    for value in feed_dict.values():
        if not isinstance(value, _SCALAR_TYPES):
            len_sample = len(value)
            break

    if len_sample <= max_batch:
        return sess.run(list_output, feed_dict=feed_dict)

    if max_batch < 1:
        # The previous while-loop never advanced for max_batch == 0.
        raise ValueError("max_batch must be a positive integer")

    list_result = [[] for _ in list_output]
    for begin in range(0, len_sample, max_batch):
        batch_feed = {
            key: value if isinstance(value, _SCALAR_TYPES) else value[begin:begin + max_batch]
            for key, value in feed_dict.items()
        }
        batch_output = sess.run(list_output, feed_dict=batch_feed)
        for collected, output in zip(list_result, batch_output):
            collected.extend(output)
    return list_result


if __name__ == '__main__':
    # Sample tender-document text used as a smoke test.
    text = (
        '招标文件项目编号:SDGP370302202102000110项目名称:淄川经济开发区中心小学校园智能化采购项目采购人:山东淄川经 '
        '济开发区管理委员会采购代理机构:淄博正益招标有限公司发出日期:2021年8月目录第一章投标邀请7一、项目基本情况7二、申请人的资格要 '
        '求8三、获取招标文件8四、提交投标文件截止时间、开标时间和地点8五、公告期限9六、其他补充事宜9第二章投标人须知11一、总则161.采 '
        '购人、采购代理机构及投标人162.资金来源183.投标费用184.适用法律18二、招标文件185.招标文件构成186.招标文件的澄清与修改207.投 '
        '标截止时间的顺延20三、投标文件的编制208.编制要求209.投标范围及投标文件中标准和计量单位的使用2110.投标文件构成2211.投标报价241 '
        '2.电子版投标文件2513.投标保证金2614.投标有效期2615.投标文件的签署及规定26四、投标文件的递交2616.投标文件的递交2617.递交 '
        '投标文件的截止时间2718.投标文件的接收、修改与撤回27五、开标及评标2719.开标2720.资格审查2821.组建评标委员会2922.投标文件符 '
        '合性审查与澄清3023.投标偏离3224.投标无效3225.比较和评价3326.废标3527.保密要求36六、确定中标3628.中标候选人的确定原则及标 '
        '准3629.确定中标候选人和中标人3630.采购任务取消3631.中标通知书3632.签订合同3633.履约保证金3734.政府采购融资担保3735.预付 '
        '款3736.廉洁自律规定3737.人员回避3738.质疑与接收3739.项目其他相关费用3940.合同公示3941.验收4042.履约验收公示4043.招标文 '
        '件解释权40第三章货物需求41一、项目概述41 '
    )
    test_text = re.sub('\n', '', text)
    model = ModelRelationExtraction()
    print(model.evaluate(test_text))