- '''
- Created on 2018-12-26
- @author: User
- '''
- import os
- import sys
- sys.path.append(os.path.abspath("../.."))
- # from keras.engine import topology
- # from keras import models
- # from keras import layers
- # from keras_contrib.layers.crf import CRF
- # from keras.preprocessing.sequence import pad_sequences
- # from keras import optimizers,losses,metrics
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.interface.modelFactory import *
- import tensorflow as tf
- from BiddingKG.dl.interface.Entitys import Entity
- from threading import RLock
- dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
- "prem":{"predictor":None,"Lock":RLock()},
- "epc":{"predictor":None,"Lock":RLock()},
- "roleRule":{"predictor":None,"Lock":RLock()},
- "form":{"predictor":None,"Lock":RLock()}}
- def getPredictor(_type):
- if _type in dict_predictor:
- with dict_predictor[_type]["Lock"]:
- if dict_predictor[_type]["predictor"] is None:
- if _type=="codeName":
- dict_predictor[_type]["predictor"] = CodeNamePredict()
- if _type=="prem":
- dict_predictor[_type]["predictor"] = PREMPredict()
- if _type=="epc":
- dict_predictor[_type]["predictor"] = EPCPredict()
- if _type=="roleRule":
- dict_predictor[_type]["predictor"] = RoleRulePredictor()
- if _type=="form":
- dict_predictor[_type]["predictor"] = FormPredictor()
- return dict_predictor[_type]["predictor"]
- raise NameError("no such type of predictor")
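- 
- # Minimal usage sketch (illustrative only, not executed at import time):
- # getPredictor builds each predictor lazily under its own RLock, so concurrent
- # callers share a single instance per type.
- #
- #   predictor = getPredictor("codeName")
- #   results = predictor.predict(list_sentences, list_entitys)
- 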
- #project code & name extraction model
- class CodeNamePredict():
-
- def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
-
- self.model = None
- self.MAX_LEN = None
- self.model_code = None
- if EMBED_DIM is None:
- self.EMBED_DIM = 60
- else:
- self.EMBED_DIM = EMBED_DIM
- if BiRNN_UNITS is None:
- self.BiRNN_UNITS = 200
- else:
- self.BiRNN_UNITS = BiRNN_UNITS
- self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
- #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
- self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"
- vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
- classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk"
- self.vocab = load(vocabpath)
- self.class_labels = load(classlabelspath)
-
- #build the regexes for extracting the project code and name from predicted label sequences
- id_PC_B = self.class_labels.index("PC_B")
- id_PC_M = self.class_labels.index("PC_M")
- id_PC_E = self.class_labels.index("PC_E")
- id_PN_B = self.class_labels.index("PN_B")
- id_PN_M = self.class_labels.index("PN_M")
- id_PN_E = self.class_labels.index("PN_E")
- self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"+"+str(id_PC_E)+"?")
- self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"+"+str(id_PN_E)+"?")
- print("pc",self.PC_pattern)
- print("pn",self.PN_pattern)
- self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))
-
- self.inputs = None
- self.outputs = None
- self.sess_codename = tf.Session(graph=tf.Graph())
- self.sess_codesplit = tf.Session(graph=tf.Graph())
- self.inputs_code = None
- self.outputs_code = None
- if not lazyLoad:
- self.getModel()
- self.getModel_code()
-
-
-
- def getModel(self):
- '''
- @summary: load the code-and-name model
- '''
- if self.inputs is None:
- log("get model of codename")
- with self.sess_codename.as_default():
- with self.sess_codename.graph.as_default():
- meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel")
- signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
- signature_def = meta_graph_def.signature_def
- self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
- self.outputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
- return self.inputs,self.outputs
- else:
- return self.inputs,self.outputs
- '''
- if self.model is None:
- self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
- self.model.load_weights(self.filepath)
- return self.model
- '''
-
- def getModel_code(self):
- if self.inputs_code is None:
- log("get model of code")
- with self.sess_codesplit.as_default():
- with self.sess_codesplit.graph.as_default():
- meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
- signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
- signature_def = meta_graph_def.signature_def
- self.inputs_code = []
- self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
- self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
- self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name))
- self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
- self.sess_codesplit.graph.finalize()
- return self.inputs_code,self.outputs_code
- else:
- return self.inputs_code,self.outputs_code
- '''
- if self.model_code is None:
- log("get model of model_code")
- with self.sess_codesplit.as_default():
- with self.sess_codesplit.graph.as_default():
- self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
- return self.model_code
- '''
-
- def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
- '''
- model = models.Sequential()
- model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding
- model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
- crf = CRF(len(chunk_tags), sparse_target=True)
- model.add(crf)
- model.summary()
- model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
- return model
- '''
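- # NOTE: this builder depends on the keras/keras_contrib imports commented out
- # at the top of the file (models, layers, CRF); it appears to be retained for
- # offline training/export rather than the TF SavedModel serving path above.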
- input = layers.Input(shape=(None,))
- if weights is not None:
- embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
- else:
- embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
- bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
- bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
- crf = CRF(len(chunk_tags),sparse_target=True)
- crf_out = crf(bilstm_dense)
- model = models.Model(inputs=[input],outputs=[crf_out])
- model.summary()
- model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
- return model
-
- #rule-based completion of the bracket pairs around a code or name
- def fitDataByRule(self,data):
- symbol_dict = {"(":")",
- "(":")",
- "[":"]",
- "【":"】",
- ")":"(",
- ")":"(",
- "]":"[",
- "】":"【"}
- leftSymbol_pattern = re.compile("[\((\[【]")
- rightSymbol_pattern = re.compile("[\))\]】]")
- leftfinds = re.findall(leftSymbol_pattern,data)
- rightfinds = re.findall(rightSymbol_pattern,data)
- result = data
- if len(leftfinds)+len(rightfinds)==0:
- return data
- elif len(leftfinds)==len(rightfinds):
- return data
- elif abs(len(leftfinds)-len(rightfinds))==1:
- if len(leftfinds)>len(rightfinds):
- if symbol_dict.get(data[0]) is not None:
- result = data[1:]
- else:
- #print(symbol_dict.get(leftfinds[0]))
- result = data+symbol_dict.get(leftfinds[0])
- else:
- if symbol_dict.get(data[-1]) is not None:
- result = data[:-1]
- else:
- result = symbol_dict.get(rightfinds[0])+data
- return result
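- 
- # Worked examples of the rule above, traced against its branches (illustrative):
- #   fitDataByRule("(ZB-001")  -> "ZB-001"     (strip the leading unmatched bracket)
- #   fitDataByRule("ZB(001")   -> "ZB(001)"    (append the matching right bracket)
- #   fitDataByRule("ZB-001)")  -> "ZB-001"     (strip the trailing unmatched bracket)
- # Balanced strings and strings whose bracket counts differ by more than one
- # are returned unchanged.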
-
-
- def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
- #@summary: extract the code(s) and name of each document
-
- pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
- result = []
- index_unk = self.word2index.get("<unk>")
- index_pad = self.word2index.get("<pad>")
- if list_entitys is None:
- list_entitys = [[] for _ in range(len(list_sentences))]
- for list_sentence,list_entity in zip(list_sentences,list_entitys):
- if len(list_sentence)==0:
- #guard against an empty sentence list: indexing list_sentence[0] here would raise IndexError, and no doc_id is available
- result.append([None,{"code":[],"name":""}])
- continue
- doc_id = list_sentence[0].doc_id
- # sentences = []
- # for sentence in list_sentence:
- # if len(sentence.sentence_text)>MAX_AREA:
- # for _sentence_comma in re.split("[;;,\n]",sentence):
- # _comma_index = 0
- # while(_comma_index<len(_sentence_comma)):
- # sentences.append(_sentence_comma[_comma_index:_comma_index+MAX_AREA])
- # _comma_index += MAX_AREA
- # else:
- # sentences.append(sentence+"。")
- list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
- _begin_index = 0
-
- item = [doc_id,{"code":[],"name":""}]
- code_set = set()
- dict_name_freq_score = dict()
- while(True):
- MAX_LEN = len(list_sentence[_begin_index].sentence_text)
- if MAX_LEN>MAX_AREA:
- MAX_LEN = MAX_AREA
- _LEN = MAX_AREA//MAX_LEN
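- #the batch size _LEN shrinks as sentences get longer: sentences are sorted by
- #length (descending), so each batch holds at most MAX_AREA characters in total
- #(MAX_LEN characters per row times MAX_AREA//MAX_LEN rows)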
- #predict in batches
- x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
- x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
- if USE_PAI_EAS:
-
- request = tf_predict_pb2.PredictRequest()
- request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32
- request.inputs["inputs"].array_shape.dim.extend(np.shape(x))
- request.inputs["inputs"].int_val.extend(np.array(x,dtype=np.int32).reshape(-1))
- request_data = request.SerializeToString()
- list_outputs = ["outputs"]
- _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs)
- if _result is not None:
- predict_y = _result["outputs"]
- else:
- with self.sess_codename.as_default():
- t_input,t_output = self.getModel()
- predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
- else:
- with self.sess_codename.as_default():
- t_input,t_output = self.getModel()
- predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
- '''
- for item11 in np.argmax(predict_y,-1):
- print(item11)
- print(predict_y)
- '''
- # print(predict_y)
- for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.argmax(predict_y,-1)):
- pad_sentence = sentence.sentence_text[:MAX_LEN]
- join_predict = "".join([str(s) for s in predict])
- # print(pad_sentence)
- # print(join_predict)
- code_x = []
- code_text = []
- temp_entitys = []
- for iter in re.finditer(self.PC_pattern,join_predict):
- get_len = 40
- if iter.span()[0]<get_len:
- begin = 0
- else:
- begin = iter.span()[0]-get_len
- end = iter.span()[1]+get_len
- code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
- code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
- _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]],entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1])
- temp_entitys.append(_entity)
- #print("code",code_text)
- if len(code_x)>0:
- code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
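- #code_x starts as (n_candidates, 3, 40, 60); the transpose makes it
- #(3, n_candidates, 40, 60) so each of the three context windows
- #(left / code / right) can be fed as a separate model input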
- if USE_PAI_EAS:
- request = tf_predict_pb2.PredictRequest()
- request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
- request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
- request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
- request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
- request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
- request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
- request_data = request.SerializeToString()
- list_outputs = ["outputs"]
- _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
- if _result is not None:
- predict_code = _result["outputs"]
- else:
- with self.sess_codesplit.as_default():
- with self.sess_codesplit.graph.as_default():
- predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
- else:
- with self.sess_codesplit.as_default():
- with self.sess_codesplit.graph.as_default():
- inputs_code,outputs_code = self.getModel_code()
- predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]},MAX_BATCH=2)[0]
- #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
- #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
- for h in range(len(predict_code)):
- if predict_code[h][0]>0.5:
- the_code = self.fitDataByRule(code_text[h])
- #add code to entitys
- list_entity.append(temp_entitys[h])
- if the_code not in code_set:
- code_set.add(the_code)
- item[1]['code'] = list(code_set)
- for iter in re.finditer(self.PN_pattern,join_predict):
- _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- #add name to entitys
- _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1])
- list_entity.append(_entity)
- w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]]) is not None else 0.5
- if _name not in dict_name_freq_score:
- # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
- dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
- else:
- dict_name_freq_score[_name][0] += 1
- '''
- for iter in re.finditer(self.PN_pattern,join_predict):
- print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
- if item[1]['name']=="":
- for iter in re.finditer(self.PN_pattern,join_predict):
- #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- break
- '''
- if _begin_index+_LEN>=len(list_sentence):
- break
- _begin_index += _LEN
-
- list_name_freq_score = []
- # 2020/11/23 rule adjustment for large sites
- name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
- for sentence in list_sentence:
- # pad_sentence = sentence.sentence_text
- othername = re.search(name_re1, sentence.sentence_text)
- if othername is not None:
- project_name = othername.group(3)
- beg = find_index([project_name], sentence.sentence_text)[0]
- end = beg + len(project_name)
- _name = self.fitDataByRule(sentence.sentence_text[beg:end])
- # add name to entitys
- _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
- sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
- entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
- end_index=0, wordOffset_begin=beg, wordOffset_end=end)
- list_entity.append(_entity)
- w = 1
- if _name not in dict_name_freq_score:
- # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
- dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
- else:
- dict_name_freq_score[_name][0] += 1
- # othername = re.search(name_re1, sentence.sentence_text)
- # if othername != None:
- # _name = othername.group(3)
- # if _name not in dict_name_freq_score:
- # dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
- # else:
- # dict_name_freq_score[_name][0] += 1
- for _name in dict_name_freq_score.keys():
- list_name_freq_score.append([_name,dict_name_freq_score[_name]])
- # print(list_name_freq_score)
- if len(list_name_freq_score)>0:
- list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
- item[1]['name'] = list_name_freq_score[0][0]
- # if list_name_freq_score[0][1][0]>1:
- # item[1]['name'] = list_name_freq_score[0][0]
- # else:
- # list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
- # item[1]["name"] = list_name_freq_score[0][0]
-
- #fall back to regexes to pick up project codes the model fails to recognize
- if item[1]['code'] == []:
- for sentence in list_sentence:
- # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
- # if othercode != None:
- # item[1]['code'].append(othercode.group(2))
- # 2020/11/23 rule adjustment for large sites
- othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
- if othercode is not None:
- item[1]['code'].append(othercode.group(3))
- result.append(item)
- list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
- return result
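- 
- # Shape of the return value (illustrative, with hypothetical values):
- #   [[doc_id, {"code": ["ZB-2018-001", ...], "name": "..."}], ...]
- # one item per document, with de-duplicated codes and the name that maximizes
- # frequency * score.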
-
-
- '''
- #errors out when the data volume is too large
- def predict(self,articles,MAX_LEN = None):
- sentences = []
- for article in articles:
- for sentence in article.content.split("。"):
- sentences.append([sentence,article.id])
- if MAX_LEN is None:
- sent_len = [len(sentence[0]) for sentence in sentences]
- MAX_LEN = max(sent_len)
- #print(MAX_LEN)
-
- #if empty, return an empty result directly
- result = []
- if MAX_LEN==0:
- for article in articles:
- result.append([article.id,{"code":[],"name":""}])
- return result
-
- index_unk = self.word2index.get("<unk>")
- index_pad = self.word2index.get("<pad>")
-
- x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
- x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
-
- predict_y = self.getModel().predict(x)
-
-
- last_doc_id = ""
- item = []
- for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
- pad_sentence = sentence[0][:MAX_LEN]
- doc_id = sentence[1]
- join_predict = "".join([str(s) for s in predict])
- if doc_id!=last_doc_id:
- if last_doc_id!="":
- result.append(item)
- item = [doc_id,{"code":[],"name":""}]
- code_set = set()
- code_x = []
- code_text = []
- for iter in re.finditer(self.PC_pattern,join_predict):
- get_len = 40
- if iter.span()[0]<get_len:
- begin = 0
- else:
- begin = iter.span()[0]-get_len
- end = iter.span()[1]+get_len
- code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
- code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
- if len(code_x)>0:
- code_x = np.transpose(np.array(code_x),(1,0,2,3))
- predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
- for h in range(len(predict_code)):
- if predict_code[h][0]>0.5:
- the_code = self.fitDataByRule(code_text[h])
- if the_code not in code_set:
- code_set.add(the_code)
- item[1]['code'] = list(code_set)
- if item[1]['name']=="":
- for iter in re.finditer(self.PN_pattern,join_predict):
- #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- break
-
- last_doc_id = doc_id
- result.append(item)
- return result
- '''
-
- #role & money models
- class PREMPredict():
-
- def __init__(self):
- #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
- self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
- self.model_role = Model_role_classify_word()
- self.model_money = Model_money_classify()
-
- return
-
- def search_role_data(self,list_sentences,list_entitys):
- '''
- @summary: build the role-model input data from the sentence list and entity list
- @param:
- list_sentences: sentences of each document
- list_entitys: entities of each document
- @return: input data for the role model
- '''
- data_x = []
- points_entitys = []
- for list_entity,list_sentence in zip(list_entitys,list_sentences):
-
- p_entitys = 0
- p_sentences = 0
- while(p_entitys<len(list_entity)):
- entity = list_entity[p_entitys]
- if entity.entity_type in ['org','company']:
- while(p_sentences<len(list_sentence)):
- sentence = list_sentence[p_sentences]
- if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
- #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
- item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
- data_x.append(item_x)
- points_entitys.append(entity)
- break
- p_sentences += 1
-
- p_entitys += 1
-
- if len(points_entitys)==0:
- return None
-
- return [data_x,points_entitys]
-
-
- def search_money_data(self,list_sentences,list_entitys):
- '''
- @summary: build the money-model input data from the sentence list and entity list
- @param:
- list_sentences: sentences of each document
- list_entitys: entities of each document
- @return: input data for the money model
- '''
- data_x = []
- points_entitys = []
- for list_entity,list_sentence in zip(list_entitys,list_sentences):
-
- p_entitys = 0
-
- while(p_entitys<len(list_entity)):
- entity = list_entity[p_entitys]
- if entity.entity_type=="money":
- p_sentences = 0
- while(p_sentences<len(list_sentence)):
- sentence = list_sentence[p_sentences]
- if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
- #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
- #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
- item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
- data_x.append(item_x)
- points_entitys.append(entity)
- break
- p_sentences += 1
- p_entitys += 1
-
- if len(points_entitys)==0:
- return None
-
- return [data_x,points_entitys]
-
- def predict_role(self,list_sentences, list_entitys):
- datas = self.search_role_data(list_sentences, list_entitys)
- if datas is None:
- return
- points_entitys = datas[1]
-
- if USE_PAI_EAS:
- _data = datas[0]
- _data = np.transpose(np.array(_data),(1,0,2))
- request = tf_predict_pb2.PredictRequest()
- request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
- request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
- request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
- request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
- request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
- request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
- request_data = request.SerializeToString()
- list_outputs = ["outputs"]
- _result = vpc_requests(role_url, role_authorization, request_data, list_outputs)
- if _result is not None:
- predict_y = _result["outputs"]
- else:
- predict_y = self.model_role.predict(datas[0])
- else:
- predict_y = self.model_role.predict(np.array(datas[0],dtype=np.float64))
- for i in range(len(predict_y)):
- entity = points_entitys[i]
- label = np.argmax(predict_y[i])
- values = []
- for item in predict_y[i]:
- values.append(item)
- entity.set_Role(label,values)
-
- def predict_money(self,list_sentences,list_entitys):
- datas = self.search_money_data(list_sentences, list_entitys)
- if datas is None:
- return
- points_entitys = datas[1]
- _data = datas[0]
- if USE_PAI_EAS:
- _data = np.transpose(np.array(_data),(1,0,2,3))
- request = tf_predict_pb2.PredictRequest()
- request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
- request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
- request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
- request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
- request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
- request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
- request_data = request.SerializeToString()
- list_outputs = ["outputs"]
- _result = vpc_requests(money_url, money_authorization, request_data, list_outputs)
- if _result is not None:
- predict_y = _result["outputs"]
- else:
- predict_y = self.model_money.predict(_data)
- else:
- predict_y = self.model_money.predict(_data)
- for i in range(len(predict_y)):
- entity = points_entitys[i]
- label = np.argmax(predict_y[i])
- values = []
- for item in predict_y[i]:
- values.append(item)
- entity.set_Money(label,values)
-
- def predict(self,list_sentences,list_entitys):
- self.predict_role(list_sentences,list_entitys)
- self.predict_money(list_sentences,list_entitys)
-
-
- #contact-person model
- class EPCPredict():
-
- def __init__(self):
- self.model_person = Model_person_classify()
-
- def search_person_data(self,list_sentences,list_entitys):
- '''
- @summary: build the contact-person-model input data from the sentence list and entity list
- @param:
- list_sentences: sentences of each document
- list_entitys: entities of each document
- @return: input data for the contact-person model
- '''
- def phoneFromList(phones):
- for phone in phones:
- if len(phone)==11:
- return re.sub('电话[:|:]|联系方式[:|:]','',phone)
- return phones[0]
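- # phoneFromList (above) prefers the first 11-character match, typically a bare
- # mobile number, stripping any 电话:/联系方式: prefix; otherwise it falls back
- # to the first match unchanged.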
- data_x = []
- dianhua = []
- points_entitys = []
- for list_entity,list_sentence in zip(list_entitys,list_sentences):
-
- p_entitys = 0
- p_sentences = 0
- key_word = re.compile('电话[:|:]\d{7,12}|联系方式[:|:]\d{7,12}')
- # phone = re.compile('1[3|4|5|7|8][0-9][-—-]?\d{4}[-—-]?\d{4}|\d{3,4}[-—-]\d{7,8}/\d{3,8}|\d{3,4}[-—-]\d{7,8}转\d{1,4}|\d{3,4}[-—-]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}') # contact phone number
- # 2020/11/25 added newly observed mobile number prefixes
- phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-]?\d{4}[-—-]?\d{4}|\d{3,4}[-—-]\d{7,8}/\d{3,8}|\d{3,4}[-—-]\d{7,8}转\d{1,4}|\d{3,4}[-—-]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}') # contact phone number
- dict_index_sentence = {}
- for _sentence in list_sentence:
- dict_index_sentence[_sentence.sentence_index] = _sentence
- dict_context_itemx = {}
- while(p_entitys<len(list_entity)):
- entity = list_entity[p_entitys]
- if entity.entity_type=="person":
- sentence = dict_index_sentence[entity.sentence_index]
- #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_PERSON_INPUT_SHAPE[1]),shape=settings.MODEL_PERSON_INPUT_SHAPE)
- s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20)
- _key = "".join(["".join(x) for x in s])
- if _key in dict_context_itemx:
- item_x = dict_context_itemx[_key][0]
- _dianhua = dict_context_itemx[_key][1]
- else:
- item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
- s1 = ''.join(s[1])
- #s1 = re.sub(',)', '-', s1)
- s1 = re.sub('\s','',s1)
- have_key = re.findall(key_word, s1)
- have_phone = re.findall(phone, s1)
- s0 = ''.join(s[0])
- #s0 = re.sub(',)', '-', s0)
- s0 = re.sub('\s','',s0)
- have_key2 = re.findall(key_word, s0)
- have_phone2 = re.findall(phone, s0)
- s3 = ''.join(s[1])
- #s0 = re.sub(',)', '-', s0)
- s3 = re.sub(',|,|\s','',s3)
- have_key3 = re.findall(key_word, s3)
- have_phone3 = re.findall(phone, s3)
- s4 = ''.join(s[0])
- #s0 = re.sub(',)', '-', s0)
- s4 = re.sub(',|,|\s','',s4)
- have_key4 = re.findall(key_word, s4)
- have_phone4 = re.findall(phone, s4)
- _dianhua = ""
- if have_phone:
- _dianhua = phoneFromList(have_phone)
- elif have_key:
- _dianhua = phoneFromList(have_key)
- elif have_phone2:
- _dianhua = phoneFromList(have_phone2)
- elif have_key2:
- _dianhua =phoneFromList(have_key2)
- elif have_phone3:
- _dianhua = phoneFromList(have_phone3)
- elif have_key3:
- _dianhua = phoneFromList(have_key3)
- elif have_phone4:
- _dianhua = phoneFromList(have_phone4)
- elif have_key4:
- _dianhua = phoneFromList(have_key4)
- else:
- _dianhua = ""
- dict_context_itemx[_key] = [item_x,_dianhua]
- data_x.append(item_x)
- points_entitys.append(entity)
- dianhua.append(_dianhua)
- p_entitys += 1
- if len(points_entitys)==0:
- return None
-
- return [data_x,points_entitys,dianhua]
-
- def predict_person(self,list_sentences, list_entitys):
- datas = self.search_person_data(list_sentences, list_entitys)
- if datas is None:
- return
- points_entitys = datas[1]
- phone = datas[2]
- if USE_PAI_EAS:
- _data = datas[0]
- _data = np.transpose(np.array(_data),(1,0,2,3))
- request = tf_predict_pb2.PredictRequest()
- request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
- request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
- request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
- request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
- request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
- request_data = request.SerializeToString()
- list_outputs = ["outputs"]
- _result = vpc_requests(person_url, person_authorization, request_data, list_outputs)
- if _result is not None:
- predict_y = _result["outputs"]
- else:
- predict_y = self.model_person.predict(datas[0])
- else:
- predict_y = self.model_person.predict(datas[0])
- assert len(predict_y)==len(points_entitys)==len(phone)
- for i in range(len(predict_y)):
- entity = points_entitys[i]
- label = np.argmax(predict_y[i])
- values = []
- for item in predict_y[i]:
- values.append(item)
- phone_number = phone[i]
- entity.set_Person(label,values,phone_number)
-
- def predict(self,list_sentences,list_entitys):
- self.predict_person(list_sentences,list_entitys)
-
- #form (table) prediction
- class FormPredictor():
-
- def __init__(self,lazyLoad=getLazyLoad()):
- self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
- self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
- self.model_form_item = Model_form_item()
- self.model_form_context = Model_form_context()
- self.model_dict = {"line":[None,self.model_file_line]}
-
-
- def getModel(self,type):
- if type=="item":
- return self.model_form_item
- elif type=="context":
- return self.model_form_context
- else:
- raise NameError("no such type of form model") #the original recursive call here would never terminate for unknown types
- def encode(self,data,**kwargs):
- return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0]
- # return encodeInput_form(data) #unreachable legacy path
-
- def predict(self,form_datas,type):
- if type=="item":
- return self.model_form_item.predict(form_datas)
- elif type=="context":
- return self.model_form_context.predict(form_datas)
- else:
- return self.getModel(type).predict(form_datas)
-
-
- #role rules
- #use regexes to assign a role to every entity without one, at the lowest probability equal to the threshold
- class RoleRulePredictor():
-
- def __init__(self):
- self.pattern_tenderee_left = "(?P<tenderee_left>((采购|招标|项目|竞价|议价|需求|最终|建设|转让|招租|甲|议标|合同主体)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|业主名称|需方)(是|为|信息|:|:|\s*$))"
- self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
- self.pattern_tenderee_right = "(?P<tenderee_right>(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?)|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)"
-
- self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标机构)(是|为|:|:|[,,]?\s*$)|(受.{,20}委托))"
- self.pattern_agency_right = "(?P<agency_right>(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.*委托)"
- # 2020/11/24 large-site rules: added winning-bid keywords 选定单位|指定的中介服务机构
- self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商|选定单位|指定的中介服务机构)).{,4}[::是为].{,2}|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)|((中标|成交)(结果|信息))|(单一来源采购(供应商|供货商|服务商))|((分包|标包).*供应商|供应商名称|服务机构|供方[::]))"
- self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商).{,4}[::是为])"
- self.pattern_winTenderer_right = "(?P<winTenderer_right>[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))"
- self.pattern_winTenderer_whole = "(?P<winTenderer_whole>贵公司.*以.*中标|最终由.*竞买成功|经.*[以由].*中标|成交供应商,成交供应商名称:|谈判结果:由.{5,20}供货)" # 2020/11/24 large-site rules: added winning-bid keyword 谈判结果:由.{5,20}供货
- self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商).{,4}[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
- self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(是|为|:|:|\s*$))|((评审结果|名次|排名)[::]第?[二2]名?))"
- self.pattern_secondTenderer_right = "(?P<secondTenderer_right>[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
-
- self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))|((评审结果|名次|排名)[::]第?[三3]名?))"
- self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
-
- self.dict_list_pattern = {"0":[["L",self.pattern_tenderee_left],
- ["C",self.pattern_tenderee_center],
- ["R",self.pattern_tenderee_right]],
- "1":[["L",self.pattern_agency_left],
- ["R",self.pattern_agency_right]],
- "2":[["L",self.pattern_winTenderer_left],
- ["C",self.pattern_winTenderer_center],
- ["R",self.pattern_winTenderer_right],
- ["W",self.pattern_winTenderer_whole]],
- "3":[["L",self.pattern_secondTenderer_left],
- ["R",self.pattern_secondTenderer_right]],
- "4":[["L",self.pattern_thirdTenderer_left],
- ["R",self.pattern_thirdTenderer_right]]}
- list_pattern = []
- for _k,_v in self.dict_list_pattern.items():
- for _d,_p in _v:
- list_pattern.append(_p)
- self.pattern_whole = "|".join(list_pattern)
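- #pattern_whole is a single alternation of all role patterns; each alternative
- #carries a named group such as "winTenderer_left", so one finditer pass over a
- #context span recovers both the role and the cue direction via groupdict()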
-
- self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关"])
-
- self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
- self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况")
- self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
- self.pattern_money_other = re.compile("代理费|服务费")
- self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
-
- def _check_input(self,text, ignore=False):
- if not text:
- return []
-
- if not isinstance(text, list):
- text = [text]
-
- null_index = [i for i, t in enumerate(text) if not t]
- if null_index and not ignore:
- raise Exception("null text in input ")
-
- return text
-
- def predict(self,list_articles,list_sentences,list_entitys,list_codenames,on_value = 0.5):
-
- for article,list_entity,list_sentence,list_codename in zip(list_articles,list_entitys,list_sentences,list_codenames):
- list_name = list_codename[1]["name"]
- list_name = self._check_input(list_name)+[article.title]
- for p_entity in list_entity:
- if p_entity.entity_type in ["org","company"]:
- #set the probability of entities whose context contains the title to 0.6, since an entity in the title is not necessarily the tenderee
- if str(p_entity.label)=="0":
- find_flag = False
- for _sentence in list_sentence:
- if _sentence.sentence_index==p_entity.sentence_index:
- _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
- for _name in list_name:
- if _name!="" and str(_span[1]+_span[2][:len(str(_name))]).find(_name)>=0:
- find_flag = True
- if p_entity.values[0]>on_value:
- p_entity.values[0] = 0.6+(p_entity.values[0]-0.6)/10
- if find_flag:
- continue
-
-
- #only process entities whose role is none or whose probability is below the threshold
- if p_entity.label is None:
- continue
- role_prob = float(p_entity.values[int(p_entity.label)])
- if role_prob<on_value or str(p_entity.label)=="5":
-
- #mark entities that appear in the title as the tenderee
- _list_name = self._check_input(list_name,ignore=True)
- find_flag = False
- for _name in _list_name:
- if str(_name).find(p_entity.entity_text)>=0:
- find_flag = True
- _label = 0
- p_entity.label = _label
- p_entity.values[int(_label)] = on_value
- break
- #if the entity appears in the title, default it to tenderee and skip the rule matching below
- if find_flag:
- continue
-
- for s_index in range(len(list_sentence)):
- if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
- tokens = list_sentence[s_index].tokens
- begin_index = p_entity.begin_index
- end_index = p_entity.end_index
- size = 15
- spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
- #distance to the nearest cue, per role
- list_distance = [100,100,100,100,100]
- _flag = False
- #resolve conflicts with regexes + distance
- list_spans = [spans[0][-30:],spans[1],spans[2]]
- for _i_span in range(len(list_spans)):
- # print(list_spans[_i_span],p_entity.entity_text)
- for _iter in re.finditer(self.pattern_whole,list_spans[_i_span]):
- for _group,_v_group in _iter.groupdict().items():
- if _v_group is not None and _v_group!="":
- # print(_group,_v_group)
- _role = _group.split("_")[0]
- _direct = _group.split("_")[1]
- _label = {"tenderee":0,"agency":1,"winTenderer":2,"secondTenderer":3,"thirdTenderer":4}.get(_role)
- if _i_span==0 and _direct=="left":
- _flag = True
- _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
- list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
- if _i_span==1 and _direct=="center":
- _flag = True
- _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
- list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
- if _i_span==2 and _direct=="right":
- _flag = True
- _distance = _iter.span()[0]
- list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
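- #after scanning all three spans, list_distance holds, per role, the distance
- #from the entity to its nearest matching cue; np.argmin below then picks the
- #role whose cue sits closest to the entity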
- # for _key in self.dict_list_pattern.keys():
- #
- # for pattern in self.dict_list_pattern[_key]:
- # if pattern[0]=="L":
- # for _iter in re.finditer(pattern[1], spans[0][-30:]):
- # _flag = True
- # if len(spans[0])-_iter.span()[1]<list_distance[int(_key)]:
- # list_distance[int(_key)] = len(spans[0])-_iter.span()[1]-(_iter.span()[1]-_iter.span()[0])
- #
- # if pattern[0]=="C":
- # if re.search(pattern[1],spans[0]) is None and re.search(pattern[1],spans[2]) is None and re.search(pattern[1],spans[0]+spans[1]+spans[2]) is not None:
- # _flag = True
- # list_distance[int(_key)] = 0
- #
- # if pattern[0]=="R":
- # for _iter in re.finditer(pattern[1], spans[2][:30]):
- # _flag = True
- # if _iter.span()[0]<list_distance[int(_key)]:
- # list_distance[int(_key)] = _iter.span()[0]
- # if pattern[0]=="W":
- # spans = spanWindow(tokens, begin_index, end_index, size=20, center_include=True, word_flag=True, use_text=False)
- # for _iter in re.finditer(pattern[1], "".join(spans)):
- # _flag = True
- # if _iter.span()[0]<list_distance[int(_key)]:
- # list_distance[int(_key)] = _iter.span()[0]
- # print("==",list_distance)
- #pick the result
- _label = np.argmin(list_distance)
- if _flag:
- # if _label==2 and min(list_distance[3:])<100:
- # _label += np.argmin(list_distance[3:])+1
- if _label in [2,3,4]:
- if p_entity.entity_type in ["company","org"]:
- p_entity.label = _label
- p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
- else:
- p_entity.label = _label
- p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
- # if p_entity.entity_type=="location":
- # for _sentence in list_sentence:
- # if _sentence.sentence_index==p_entity.sentence_index:
- # _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=5,center_include=True,word_flag=True,text=p_entity.entity_text)
- # if re.search(self.pattern_winTenderer_location,_span[0][-10:]) is not None and re.search("地址|地点",_span[0]) is None:
- # p_entity.entity_type="company"
- # _label = "2"
- # p_entity.label = _label
- # p_entity.values = [0]*6
- # p_entity.values[int(_label)] = on_value
- #high-confidence special-case adjustments
- for s_index in range(len(list_sentence)):
- if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
- tokens = list_sentence[s_index].tokens
- begin_index = p_entity.begin_index
- end_index = p_entity.end_index
- size = 15
- spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
- #distance to the nearest cue, per role
- list_distance = [100,100,100,100,100]
- _flag = False
- for _key in self.dict_list_pattern.keys():
- for pattern in self.dict_list_pattern[_key]:
- if pattern[0]=="W":
- spans = spanWindow(tokens, begin_index, end_index, size=30, center_include=True, word_flag=True, use_text=False)
- for _iter in re.finditer(pattern[1], spans[0][-10:]+spans[1]+spans[2]):
- _flag = True
- if _iter.span()[0]<list_distance[int(_key)]:
- list_distance[int(_key)] = _iter.span()[0]
- #pick the result
- _label = np.argmin(list_distance)
- if _flag:
- if _label==2 and min(list_distance[3:])<100:
- _label += np.argmin(list_distance[3:])+1
- if _label in [2,3,4]:
- if p_entity.entity_type in ["company","org"]:
- p_entity.label = _label
- p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
- else:
- p_entity.label = _label
- p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
- if p_entity.entity_type in ["money"]:
- if str(p_entity.label)=="2":
- for _sentence in list_sentence:
- if _sentence.sentence_index==p_entity.sentence_index:
- _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
- if re.search(self.pattern_money_tenderee,_span[0]) is not None and re.search(self.pattern_money_other,_span[0]) is None:
- p_entity.values[0] = 0.8+p_entity.values[0]/10
- p_entity.label = 0
- if re.search(self.pattern_money_tenderer,_span[0]) is not None:
- if re.search(self.pattern_money_other,_span[0]) is not None:
- if re.search(self.pattern_money_tenderer,_span[0]).span()[1]>re.search(self.pattern_money_other,_span[0]).span()[1]:
- p_entity.values[1] = 0.8+p_entity.values[1]/10
- p_entity.label = 1
- else:
- p_entity.values[1] = 0.8+p_entity.values[1]/10
- p_entity.label = 1
- if re.search(self.pattern_money_tenderer_whole,"".join(_span)) is not None and re.search(self.pattern_money_other,_span[0]) is None:
- p_entity.values[1] = 0.8+p_entity.values[1]/10
- p_entity.label = 1
-
- #tender-money extension: a tender money followed by consecutive unrecognized moneys, all matching lot/package info, marks those unrecognized moneys as tender money too
- list_p = []
- state = 0
- for p_entity in list_entity:
- for _sentence in list_sentence:
- if _sentence.sentence_index==p_entity.sentence_index:
- _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
-
- if state==2:
- for _p in list_p[1:]:
-
- _p.values[0] = 0.8+_p.values[0]/10
- _p.label = 0
- state = 0
- list_p = []
-
- if state==0:
- if p_entity.entity_type in ["money"]:
- if str(p_entity.label)=="0" and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None:
- state = 1
- list_p.append(p_entity)
- elif state==1:
- if p_entity.entity_type in ["money"]:
- if str(p_entity.label) in ["0","2"] and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None and re.search(self.pattern_money_other,_span[0]+"-"+_span[2]) is None and p_entity.sentence_index==list_p[0].sentence_index:
- list_p.append(p_entity)
- else:
- state = 2
-
- if len(list_p)>1:
- for _p in list_p[1:]:
- #print("==",_p.entity_text,_p.sentence_index,_p.label)
- _p.values[0] = 0.8+_p.values[0]/10
- _p.label = 0
- state = 0
- list_p = []
-
-
- for p_entity in list_entity:
- #entities in this set can never be a winning bidder; clear their label
- if p_entity.entity_text in self.SET_NOT_TENDERER:
- p_entity.label=5
- # time category
- class TimePredictor():
- def __init__(self):
- self.sess = tf.Session(graph=tf.Graph())
- self.inputs_code = None
- self.outputs_code = None
- self.input_shape = (2,30,60)
- self.load_model()
- def load_model(self):
- model_path = os.path.dirname(__file__)+'/timesplit_model'
- if self.inputs_code is None:
- log("get model of time")
- with self.sess.as_default():
- with self.sess.graph.as_default():
- meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
- signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
- signature_def = meta_graph_def.signature_def
- self.inputs_code = []
- self.inputs_code.append(
- self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
- self.inputs_code.append(
- self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
- self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
- return self.inputs_code, self.outputs_code
- else:
- return self.inputs_code, self.outputs_code
- def search_time_data(self,list_sentences,list_entitys):
- data_x = []
- points_entitys = []
- for list_sentence, list_entity in zip(list_sentences, list_entitys):
- p_entitys = 0
- p_sentences = 0
- while(p_entitys<len(list_entity)):
- entity = list_entity[p_entitys]
- if entity.entity_type in ['time']:
- while(p_sentences<len(list_sentence)):
- sentence = list_sentence[p_sentences]
- if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
- left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
- right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
- context = [left, right]
- x = embedding_word(context, shape=self.input_shape)
- data_x.append(x)
- points_entitys.append(entity)
- break
- p_sentences += 1
- p_entitys += 1
- if len(points_entitys)==0:
- return None
- data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
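- #data_x starts as (n_entities, 2, 30, 60); the transpose makes it
- #(2, n_entities, 30, 60) so the left and right context windows feed the
- #model's two inputs separately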
- return [data_x, points_entitys]
- def predict(self, list_sentences,list_entitys):
- datas = self.search_time_data(list_sentences, list_entitys)
- if datas is None:
- return
- points_entitys = datas[1]
- with self.sess.as_default():
- predict_y = self.sess.run(self.outputs_code, feed_dict={self.inputs_code[0]:datas[0][0]
- ,self.inputs_code[1]:datas[0][1]})
- for i in range(len(predict_y)):
- entity = points_entitys[i]
- label = np.argmax(predict_y[i])
- values = []
- for item in predict_y[i]:
- values.append(item)
- entity.set_Role(label, values)
- def getSavedModel():
- #predictor = FormPredictor()
- graph = tf.Graph()
- with graph.as_default():
- model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
-
- #print(tf.graph_util.remove_training_nodes(model))
- tf.saved_model.simple_save(
- tf.keras.backend.get_session(),
- "./h5_savedmodel/",
- inputs={"image": model.input},
- outputs={"scores": model.output}
- )
-
- def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
- '''
- model = models.Sequential()
- model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding
- model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
- crf = CRF(len(chunk_tags), sparse_target=True)
- model.add(crf)
- model.summary()
- model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
- return model
- '''
- input = layers.Input(shape=(None,),dtype="int32")
- if weights is not None:
- embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
- else:
- embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
- bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
- bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
- crf = CRF(len(chunk_tags),sparse_target=True)
- crf_out = crf(bilstm_dense)
- model = models.Model(inputs=[input],outputs=[crf_out])
- model.summary()
- model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
- return model
- import h5py
- def h5_to_graph(sess,graph,h5file):
-
- f = h5py.File(h5file,'r') #open the h5 file
- def getValue(v):
- _value = f["model_weights"]
- list_names = str(v.name).split("/")
- for _index in range(len(list_names)):
- print(v.name)
- if _index==1:
- _value = _value[list_names[0]]
- _value = _value[list_names[_index]]
- return _value.value
-
- def _load_attributes_from_hdf5_group(group, name):
- """Loads attributes of the specified name from the HDF5 group.
-
- This method deals with an inherent problem
- of HDF5 file which is not able to store
- data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
-
- # Arguments
- group: A pointer to a HDF5 group.
- name: A name of the attributes to load.
-
- # Returns
- data: Attributes data.
- """
- if name in group.attrs:
- data = [n.decode('utf8') for n in group.attrs[name]]
- else:
- data = []
- chunk_id = 0
- while ('%s%d' % (name, chunk_id)) in group.attrs:
- data.extend([n.decode('utf8')
- for n in group.attrs['%s%d' % (name, chunk_id)]])
- chunk_id += 1
- return data
-
- def readGroup(gr,parent_name,data):
- for subkey in gr:
- print(subkey)
- if parent_name!=subkey:
- if parent_name=="":
- _name = subkey
- else:
- _name = parent_name+"/"+subkey
- else:
- _name = parent_name
- if str(type(gr[subkey]))=="<class 'h5py._hl.group.Group'>":
- readGroup(gr[subkey],_name,data)
- else:
- data.append([_name,gr[subkey].value])
- print(_name,gr[subkey].shape)
-
-
- layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names')
- list_name_value = []
- readGroup(f["model_weights"], "", list_name_value)
- '''
- for k, name in enumerate(layer_names):
- g = f["model_weights"][name]
- weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
- #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
- for weight_name in weight_names:
- list_name_value.append([weight_name,np.asarray(g[weight_name])])
- '''
- for name_value in list_name_value:
- name = name_value[0]
- '''
- if re.search("dense",name) is not None:
- name = name[:7]+"_1"+name[7:]
- '''
- value = name_value[1]
- print(name,graph.get_tensor_by_name(name),np.shape(value))
- sess.run(tf.assign(graph.get_tensor_by_name(name),value))
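- # Note: this relies on the Keras h5 layout model_weights/<layer>/<layer>/<weight>;
- # readGroup collapses the duplicated layer level so the resulting names line up
- # with graph tensor names (e.g. "dense_1/kernel:0"), and tf.assign copies each
- # weight array into the live graph.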
- def initialize_uninitialized(sess):
- global_vars = tf.global_variables()
- is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars])
- not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f]
-
- adam_vars = []
- for _vars in not_initialized_vars:
- if re.search("Adam",_vars.name) is not None:
- adam_vars.append(_vars)
-
- print([str(i.name) for i in adam_vars]) # only for testing
- if len(adam_vars):
- sess.run(tf.variables_initializer(adam_vars))
-
-
- def save_codename_model():
- filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
- vocabpath = "../projectCode/models/vocab.pk"
- classlabelspath = "../projectCode/models/classlabels.pk"
- vocab = load(vocabpath)
- class_labels = load(classlabelspath)
- graph = tf.get_default_graph()
- with graph.as_default() as g:
- ''''''
- model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
- #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function})
-
- #sess = tf.Session(graph=g)
- sess = tf.keras.backend.get_session()
-
- #with sess.as_default():
- sess.run(tf.global_variables_initializer())
- print(sess.run("time_distributed_1/kernel:0"))
- model.load_weights(filepath)
-
-
-
- print("#",sess.run("time_distributed_1/kernel:0"))
-
- x = load("codename_x.pk")
- #y = model.predict(x)
- y = sess.run(model.output,feed_dict={model.input:x})
-
- for item in np.argmax(y,-1):
- print(item)
- tf.saved_model.simple_save(
- sess,
- "./codename_savedmodel/",
- inputs={"inputs": model.input},
- outputs={"outputs": model.output}
- )
-
-
- def save_role_model():
- '''
- @summary: save the model as a SavedModel for deployment and serving on the PAI platform
- '''
- model_role = PREMPredict().model_role
- with model_role.graph.as_default():
- model = model_role.getModel()
- sess = tf.Session(graph=model_role.graph)
- print(type(model.input))
-
- sess.run(tf.global_variables_initializer())
- h5_to_graph(sess, model_role.graph, model_role.model_role_file)
- model = model_role.getModel()
-
- tf.saved_model.simple_save(sess,
- "./role_savedmodel/",
- inputs={"input0":model.input[0],
- "input1":model.input[1],
- "input2":model.input[2]},
- outputs={"outputs":model.output}
- )
-
- def save_money_model():
- model_money = PREMPredict().model_money
- with model_money.graph.as_default():
- model = model_money.getModel()
- sess = tf.Session(graph=model_money.graph)
- model.summary()
- sess.run(tf.global_variables_initializer())
- h5_to_graph(sess, model_money.graph, model_money.model_money_file)
- tf.saved_model.simple_save(sess,
- "./money_savedmodel/",
- inputs = {"input0":model.input[0],
- "input1":model.input[1],
- "input2":model.input[2]},
- outputs = {"outputs":model.output}
- )
-
- def save_person_model():
- model_person = EPCPredict().model_person
- with model_person.graph.as_default():
-
- x = load("person_x.pk")
- _data = np.transpose(np.array(x),(1,0,2,3))
- model = model_person.getModel()
-
- sess = tf.Session(graph=model_person.graph)
- with sess.as_default():
-
- sess.run(tf.global_variables_initializer())
- model_person.load_weights()
-
-
- #h5_to_graph(sess, model_person.graph, model_person.model_person_file)
-
- predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]})
- #predict_y = model.predict([_data[0],_data[1]])
- print(np.argmax(predict_y,-1))
-
- tf.saved_model.simple_save(sess,
- "./person_savedmodel/",
- inputs={"input0":model.input[0],
- "input1":model.input[1]},
- outputs = {"outputs":model.output})
-
- def save_form_model():
- model_form = FormPredictor()
- with model_form.graph.as_default():
- model = model_form.getModel("item")
- sess = tf.Session(graph=model_form.graph)
- sess.run(tf.global_variables_initializer())
- h5_to_graph(sess, model_form.graph, model_form.model_file_item)
- tf.saved_model.simple_save(sess,
- "./form_savedmodel/",
- inputs={"inputs":model.input},
- outputs = {"outputs":model.output})
-
- def save_codesplit_model():
- filepath_code = "../projectCode/models/model_code.hdf5"
-
-
- graph = tf.Graph()
- with graph.as_default():
- model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
- sess = tf.Session()
- sess.run(tf.global_variables_initializer())
- h5_to_graph(sess, graph, filepath_code)
- tf.saved_model.simple_save(sess,
- "./codesplit_savedmodel/",
- inputs={"input0":model_code.input[0],
- "input1":model_code.input[1],
- "input2":model_code.input[2]},
- outputs={"outputs":model_code.output})
- def save_timesplit_model():
- filepath = '../time/model_label_time_classify.model.hdf5'
- with tf.Graph().as_default() as graph:
- time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
- with tf.Session() as sess:
- sess.run(tf.global_variables_initializer())
- h5_to_graph(sess, graph, filepath)
- tf.saved_model.simple_save(sess,
- "./timesplit_model/",
- inputs={"input0":time_model.input[0],
- "input1":time_model.input[1]},
- outputs={"outputs":time_model.output})
- if __name__=="__main__":
- #save_role_model()
- #save_codename_model()
- #save_money_model()
- #save_person_model()
- #save_form_model()
- #save_codesplit_model()
- save_timesplit_model()
- '''
- with tf.Session(graph=tf.Graph()) as sess:
- from tensorflow.python.saved_model import tag_constants
- meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
- graph = tf.get_default_graph()
- signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
- signature = meta_graph_def.signature_def
- input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
- input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
- outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
- x = load("person_x.pk")
- _data = np.transpose(x,[1,0,2,3])
- y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
- print(np.argmax(y,-1))
- '''