''' Created on 2018年12月26日 @author: User ''' import os import sys from BiddingKG.dl.common.nerUtils import * sys.path.append(os.path.abspath("../..")) # from keras.engine import topology # from keras import models # from keras import layers # from keras_contrib.layers.crf import CRF # from keras.preprocessing.sequence import pad_sequences # from keras import optimizers,losses,metrics from BiddingKG.dl.common.Utils import * from BiddingKG.dl.interface.modelFactory import * import tensorflow as tf from BiddingKG.dl.product.data_util import decode, process_data from BiddingKG.dl.interface.Entitys import Entity from BiddingKG.dl.complaint.punish_predictor import Punish_Extract from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money from bs4 import BeautifulSoup import copy import calendar import datetime from threading import RLock dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()}, "prem":{"predictor":None,"Lock":RLock()}, "epc":{"predictor":None,"Lock":RLock()}, "roleRule":{"predictor":None,"Lock":RLock()}, "roleRuleFinal":{"predictor":None,"Lock":RLock()}, "tendereeRuleRecall":{"predictor":None,"Lock":RLock()}, "form":{"predictor":None,"Lock":RLock()}, "time":{"predictor":None,"Lock":RLock()}, "punish":{"predictor":None,"Lock":RLock()}, "product":{"predictor":None,"Lock":RLock()}, "product_attrs":{"predictor":None,"Lock":RLock()}, "channel": {"predictor": None, "Lock": RLock()}, "deposit_payment_way": {"predictor": None, "Lock": RLock()}, "total_unit_money": {"predictor": None, "Lock": RLock()} } def getPredictor(_type): if _type in dict_predictor: with dict_predictor[_type]["Lock"]: if dict_predictor[_type]["predictor"] is None: if _type == "codeName": dict_predictor[_type]["predictor"] = CodeNamePredict() if _type == "prem": dict_predictor[_type]["predictor"] = PREMPredict() if _type == "epc": dict_predictor[_type]["predictor"] = EPCPredict() if _type == "roleRule": dict_predictor[_type]["predictor"] = RoleRulePredictor() if _type == "roleRuleFinal": dict_predictor[_type]["predictor"] = RoleRuleFinalAdd() if _type == "tendereeRuleRecall": dict_predictor[_type]["predictor"] = TendereeRuleRecall() if _type == "form": dict_predictor[_type]["predictor"] = FormPredictor() if _type == "time": dict_predictor[_type]["predictor"] = TimePredictor() if _type == "punish": dict_predictor[_type]["predictor"] = Punish_Extract() if _type == "product": dict_predictor[_type]["predictor"] = ProductPredictor() if _type == "product_attrs": dict_predictor[_type]["predictor"] = ProductAttributesPredictor() if _type == "channel": dict_predictor[_type]["predictor"] = DocChannel() if _type == 'deposit_payment_way': dict_predictor[_type]["predictor"] = DepositPaymentWay() if _type == 'total_unit_money': dict_predictor[_type]["predictor"] = TotalUnitMoney() return dict_predictor[_type]["predictor"] raise NameError("no this type of predictor") # 编号名称模型 class CodeNamePredict(): def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()): self.model = None self.MAX_LEN = None self.model_code = None if EMBED_DIM is None: self.EMBED_DIM = 60 else: self.EMBED_DIM = EMBED_DIM if BiRNN_UNITS is None: self.BiRNN_UNITS = 200 else: self.BiRNN_UNITS = BiRNN_UNITS self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5" #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5" self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5" vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk" classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk" self.vocab = load(vocabpath) self.class_labels = load(classlabelspath) #生成提取编号和名称的正则 id_PC_B = self.class_labels.index("PC_B") id_PC_M = self.class_labels.index("PC_M") id_PC_E = self.class_labels.index("PC_E") id_PN_B = self.class_labels.index("PN_B") id_PN_M = self.class_labels.index("PN_M") id_PN_E = self.class_labels.index("PN_E") self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E)) self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E)) # print("pc",self.PC_pattern) # print("pn",self.PN_pattern) self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab))) self.inputs = None self.outputs = None self.sess_codename = tf.Session(graph=tf.Graph()) self.sess_codesplit = tf.Session(graph=tf.Graph()) self.inputs_code = None self.outputs_code = None if not lazyLoad: self.getModel() self.getModel_code() def getModel(self): ''' @summary: 取得编号和名称模型 ''' if self.inputs is None: log("get model of codename") with self.sess_codename.as_default(): with self.sess_codename.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf") signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name) self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name) self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name) self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name) self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name) return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans else: return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans ''' if self.model is None: self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None) self.model.load_weights(self.filepath) return self.model ''' def getModel_code(self): if self.inputs_code is None: log("get model of code") with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel") signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs_code = [] self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)) self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)) self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)) self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name) self.sess_codesplit.graph.finalize() return self.inputs_code,self.outputs_code else: return self.inputs_code,self.outputs_code ''' if self.model_code is None: log("get model of model_code") with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) return self.model_code ''' def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights): ''' model = models.Sequential() model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True))) crf = CRF(len(chunk_tags), sparse_target=True) model.add(crf) model.summary() model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy]) return model ''' input = layers.Input(shape=(None,)) if weights is not None: embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input) else: embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input) bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding) bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm) crf = CRF(len(chunk_tags),sparse_target=True) crf_out = crf(bilstm_dense) model = models.Model(input=[input],output = [crf_out]) model.summary() model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy]) return model #根据规则补全编号或名称两边的符号 def fitDataByRule(self,data): symbol_dict = {"(":")", "(":")", "[":"]", "【":"】", ")":"(", ")":"(", "]":"[", "】":"【"} leftSymbol_pattern = re.compile("[\((\[【]") rightSymbol_pattern = re.compile("[\))\]】]") leftfinds = re.findall(leftSymbol_pattern,data) rightfinds = re.findall(rightSymbol_pattern,data) result = data if len(leftfinds)+len(rightfinds)==0: return data elif len(leftfinds)==len(rightfinds): return data elif abs(len(leftfinds)-len(rightfinds))==1: if len(leftfinds)>len(rightfinds): if symbol_dict.get(data[0]) is not None: result = data[1:] else: #print(symbol_dict.get(leftfinds[0])) result = data+symbol_dict.get(leftfinds[0]) else: if symbol_dict.get(data[-1]) is not None: result = data[:-1] else: result = symbol_dict.get(rightfinds[0])+data return result def decode(self,logits, trans, sequence_lengths, tag_num): viterbi_sequences = [] for logit, length in zip(logits, sequence_lengths): score = logit[:length] viterbi_seq, viterbi_score = viterbi_decode(score, trans) viterbi_sequences.append(viterbi_seq) return viterbi_sequences def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000): #@summary: 获取每篇文章的code和name pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店") result = [] index_unk = self.word2index.get("") # index_pad = self.word2index.get("") if list_entitys is None: list_entitys = [[] for _ in range(len(list_sentences))] for list_sentence,list_entity in zip(list_sentences,list_entitys): if len(list_sentence)==0: result.append([{"code":[],"name":""}]) continue doc_id = list_sentence[0].doc_id # sentences = [] # for sentence in list_sentence: # if len(sentence.sentence_text)>MAX_AREA: # for _sentence_comma in re.split("[;;,\n]",sentence): # _comma_index = 0 # while(_comma_indexMAX_AREA: MAX_LEN = MAX_AREA _LEN = MAX_AREA//MAX_LEN #预测 x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]] # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]] x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x] x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post") if USE_API: requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},verify=True) predict_y = json.loads(requests_result.text)['result'] # print("cost_time:", json.loads(requests_result.text)['cost_time']) # print(MAX_LEN,_LEN,_begin_index) else: with self.sess_codename.as_default(): t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel() _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x, t_input_length:x_len, t_keepprob:1.0}) predict_y = self.decode(_logits,_trans,x_len,7) # print('==========',_logits) ''' for item11 in np.argmax(predict_y,-1): print(item11) print(predict_y) ''' # print(predict_y) for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)): pad_sentence = sentence.sentence_text[:MAX_LEN] join_predict = "".join([str(s) for s in predict]) # print(pad_sentence) # print(join_predict) code_x = [] code_text = [] temp_entitys = [] for iter in re.finditer(self.PC_pattern,join_predict): get_len = 40 if iter.span()[0]0: code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3)) if USE_PAI_EAS: request = tf_predict_pb2.PredictRequest() request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0])) request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1)) request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1])) request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1)) request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2])) request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1)) request_data = request.SerializeToString() list_outputs = ["outputs"] _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs) if _result is not None: predict_code = _result["outputs"] else: with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) else: with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): inputs_code,outputs_code = self.getModel_code() predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]},MAX_BATCH=2)[0] #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]}) #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) for h in range(len(predict_code)): if predict_code[h][0]>0.5: the_code = self.fitDataByRule(code_text[h]) #add code to entitys list_entity.append(temp_entitys[h]) if the_code not in code_set: code_set.add(the_code) item['code'] = list(code_set) for iter in re.finditer(self.PN_pattern,join_predict): _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) #add name to entitys _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment) list_entity.append(_entity) w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5 if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1] dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w] else: dict_name_freq_score[_name][0] += 1 ''' for iter in re.finditer(self.PN_pattern,join_predict): print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])) if item[1]['name']=="": for iter in re.finditer(self.PN_pattern,join_predict): #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) break ''' if _begin_index+_LEN>=len(list_sentence): break _begin_index += _LEN list_name_freq_score = [] # 2020/11/23 大网站规则调整 if len(dict_name_freq_score) == 0: name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]' for sentence in list_sentence: # pad_sentence = sentence.sentence_text othername = re.search(name_re1, sentence.sentence_text) if othername != None: project_name = othername.group(3) beg = find_index([project_name], sentence.sentence_text)[0] end = beg + len(project_name) _name = self.fitDataByRule(sentence.sentence_text[beg:end]) # add name to entitys _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % ( sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name, entity_type="name", sentence_index=sentence.sentence_index, begin_index=0, end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment) list_entity.append(_entity) w = 1 if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1] dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w] else: dict_name_freq_score[_name][0] += 1 # othername = re.search(name_re1, sentence.sentence_text) # if othername != None: # _name = othername.group(3) # if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1] # else: # dict_name_freq_score[_name][0] += 1 for _name in dict_name_freq_score.keys(): list_name_freq_score.append([_name,dict_name_freq_score[_name]]) # print(list_name_freq_score) if len(list_name_freq_score)>0: list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True) item['name'] = list_name_freq_score[0][0] # if list_name_freq_score[0][1][0]>1: # item[1]['name'] = list_name_freq_score[0][0] # else: # list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True) # item[1]["name"] = list_name_freq_score[0][0] #下面代码加上去用正则添加某些识别不到的项目编号 if item['code'] == []: for sentence in list_sentence: # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text) # if othercode != None: # item[1]['code'].append(othercode.group(2)) # 2020/11/23 大网站规则调整 othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text) if othercode != None: item['code'].append(othercode.group(3)) item['code'].sort(key=lambda x:len(x),reverse=True) result.append(item) list_sentence.sort(key=lambda x: x.sentence_index,reverse=False) return result ''' #当数据量过大时会报错 def predict(self,articles,MAX_LEN = None): sentences = [] for article in articles: for sentence in article.content.split("。"): sentences.append([sentence,article.id]) if MAX_LEN is None: sent_len = [len(sentence[0]) for sentence in sentences] MAX_LEN = max(sent_len) #print(MAX_LEN) #若为空,则直接返回空 result = [] if MAX_LEN==0: for article in articles: result.append([article.id,{"code":[],"name":""}]) return result index_unk = self.word2index.get("") index_pad = self.word2index.get("") x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences] x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post") predict_y = self.getModel().predict(x) last_doc_id = "" item = [] for sentence,predict in zip(sentences,np.argmax(predict_y,-1)): pad_sentence = sentence[0][:MAX_LEN] doc_id = sentence[1] join_predict = "".join([str(s) for s in predict]) if doc_id!=last_doc_id: if last_doc_id!="": result.append(item) item = [doc_id,{"code":[],"name":""}] code_set = set() code_x = [] code_text = [] for iter in re.finditer(self.PC_pattern,join_predict): get_len = 40 if iter.span()[0]0: code_x = np.transpose(np.array(code_x),(1,0,2,3)) predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) for h in range(len(predict_code)): if predict_code[h][0]>0.5: the_code = self.fitDataByRule(code_text[h]) if the_code not in code_set: code_set.add(the_code) item[1]['code'] = list(code_set) if item[1]['name']=="": for iter in re.finditer(self.PN_pattern,join_predict): #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) break last_doc_id = doc_id result.append(item) return result ''' #角色金额模型 class PREMPredict(): def __init__(self): #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5") self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5" self.model_role = Model_role_classify_word() self.model_money = Model_money_classify() return def search_role_data(self,list_sentences,list_entitys): ''' @summary:根据句子list和实体list查询角色模型的输入数据 @param: list_sentences:文章的sentences list_entitys:文章的entitys @return:角色模型的输入数据 ''' text_list = [] data_x = [] points_entitys = [] for list_entity,list_sentence in zip(list_entitys,list_sentences): list_entity.sort(key=lambda x:x.sentence_index) list_sentence.sort(key=lambda x:x.sentence_index) p_entitys = 0 p_sentences = 0 while(p_entitys 1: # _dianhua = phoneFromList(have_phone[1:]) # else: # _dianhua = phoneFromList(have_phone) # elif have_key: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find( # last_person_phone) != -1: # if len(have_key) > 1: # _dianhua = phoneFromList(have_key[1:]) # else: # _dianhua = phoneFromList(have_key) # elif have_phone2: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find( # last_person_phone) != -1: # if len(have_phone2) > 1: # _dianhua = phoneFromList(have_phone2[1:]) # else: # _dianhua = phoneFromList(have_phone2) # elif have_key2: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find( # last_person_phone) != -1: # if len(have_key2) > 1: # _dianhua = phoneFromList(have_key2[1:]) # else: # _dianhua = phoneFromList(have_key2) # elif have_phone3: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find( # last_person_phone) != -1: # if len(have_phone3) > 1: # _dianhua = phoneFromList(have_phone3[1:]) # else: # _dianhua = phoneFromList(have_phone3) # elif have_key3: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find( # last_person_phone) != -1: # if len(have_key3) > 1: # _dianhua = phoneFromList(have_key3[1:]) # else: # _dianhua = phoneFromList(have_key3) # elif have_phone4: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find( # last_person_phone) != -1: # if len(have_phone4) > 1: # _dianhua = phoneFromList(have_phone4) # else: # _dianhua = phoneFromList(have_phone4) # elif have_key4: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find( # last_person_phone) != -1: # if len(have_key4) > 1: # _dianhua = phoneFromList(have_key4) # else: # _dianhua = phoneFromList(have_key4) # else: # _dianhua = "" # # dict_context_itemx[_key] = [item_x, _dianhua] # dict_context_itemx[_key] = [_dianhua] # # points_entitys.append(entity) # # dianhua.append(_dianhua) # last_person = entity.entity_text # if _dianhua: # # 更新联系人entity联系方式(person_phone) # entity.person_phone = _dianhua # last_person_phone = _dianhua # else: # last_person_phone = "####****++++$^" # p_entitys += 1 from scipy.optimize import linear_sum_assignment from BiddingKG.dl.interface.Entitys import Match def dispatch(match_list): main_roles = list(set([match.main_role for match in match_list])) attributes = list(set([match.attribute for match in match_list])) label = np.zeros(shape=(len(main_roles), len(attributes))) for match in match_list: main_role = match.main_role attribute = match.attribute value = match.value label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000 # print(label) gragh = -label # km算法 row, col = linear_sum_assignment(gragh) max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value] return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch] # km算法 key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})') phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' '\+86.?1[3|4|5|6|7|8|9]\d{9}|' '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|' '0\d{2,3}[-—-―]?[1-9]\d{6,7}|' '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|' '[1-9]\d{6,7}') phone_entitys = [] for _sentence in list_sentence: sentence_text = _sentence.sentence_text res_set = set() for i in re.finditer(phone,sentence_text): res_set.add((i.group(),i.start(),i.end())) for i in re.finditer(key_word,sentence_text): res_set.add((i.group(2),i.start()+len(i.group(1)),i.end())) for item in list(res_set): phone_left = sentence_text[max(0,item[1]-10):item[1]] phone_right = sentence_text[item[2]:item[2]+8] # 排除传真号 和 其它错误项 if re.search("传,?真|信,?箱|邮,?箱",phone_left): if not re.search("电,?话",phone_left): continue if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]",phone_left): continue if re.search("[.,]\d{2,}",phone_right): continue _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2],in_attachment=_sentence.in_attachment) phone_entitys.append(_entity) person_entitys = [] for entity in list_entity: if entity.entity_type == "person": entity.person_phone = "" person_entitys.append(entity) _list_entity = phone_entitys + person_entitys _list_entity = sorted(_list_entity,key=lambda x:(x.sentence_index,x.wordOffset_begin)) words_num_dict = dict() last_words_num = 0 list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index) for sentence in list_sentence: _index = sentence.sentence_index if _index == 0: words_num_dict[_index] = 0 else: words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num last_words_num = len(sentence.sentence_text) match_list = [] for index in range(len(_list_entity)): entity = _list_entity[index] if entity.entity_type=="person" and entity.label in [1,2,3]: match_nums = 0 for after_index in range(index + 1, min(len(_list_entity), index + 5)): after_entity = _list_entity[after_index] if after_entity.entity_type=="phone": sentence_distance = after_entity.sentence_index - entity.sentence_index distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - ( words_num_dict[entity.sentence_index] + entity.wordOffset_end) if sentence_distance < 2 and distance < 50: value = (-1 / 2 * (distance ** 2)) / 10000 match_list.append(Match(entity, after_entity, value)) match_nums += 1 else: break if after_entity.entity_type=="person": if after_entity.label not in [1,2,3]: break if not match_nums: for previous_index in range(index-1, max(0,index-5), -1): previous_entity = _list_entity[previous_index] if previous_entity.entity_type == "phone": sentence_distance = entity.sentence_index - previous_entity.sentence_index distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - ( words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end) if sentence_distance < 1 and distance<30: # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) match_list.append(Match(entity, previous_entity, value)) else: break result = dispatch(match_list) for match in result: entity = match.main_role # 更新 list_entity entity_index = list_entity.index(entity) list_entity[entity_index].person_phone = match.attribute.entity_text def predict(self,list_sentences,list_entitys): self.predict_person(list_sentences,list_entitys) #表格预测 class FormPredictor(): def __init__(self,lazyLoad=getLazyLoad()): self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5" self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5" self.model_form_item = Model_form_item() self.model_form_context = Model_form_context() self.model_dict = {"line":[None,self.model_file_line]} def getModel(self,type): if type=="item": return self.model_form_item elif type=="context": return self.model_form_context else: return self.getModel(type) def encode(self,data,**kwargs): return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0] return encodeInput_form(data) def predict(self,form_datas,type): if type=="item": return self.model_form_item.predict(form_datas) elif type=="context": return self.model_form_context.predict(form_datas) else: return self.getModel(type).predict(form_datas) #角色规则 #依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率 class RoleRulePredictor(): def __init__(self): # (?P 正则组名 后面的 w1 为概率权重关键词 self.pattern_tenderee_left = "(?P((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \ "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\ "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)" self.pattern_tenderee_left_w1 = "(?P((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \ "(人|公司|单位|组织|用户|业主|主体|方|部门))" \ "(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)" self.pattern_tenderee_center = "(?P(受.{5,20}委托))" self.pattern_tenderee_right = "(?P^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价) self.pattern_tendereeORagency_right = "(?P(^拟对|^现?就|^现对))" self.pattern_agency_left = "(?P(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))" self.pattern_agency_right = "(?P^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号 # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构 self.pattern_winTenderer_left = "(?P(乙|承做|施工|供货|承包|承建|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \ "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \ "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)" self.pattern_winTenderer_left_w1 = "(?P(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)(名称)?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # self.pattern_winTenderer_center = "(?P第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])" # self.pattern_winTenderer_right = "(?P(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)" self.pattern_winTenderer_right = "(?P(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \ "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^成为[\w、()()]+项目的成交供应商))" self.pattern_winTenderer_whole = "(?P贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)|中标通知书.{,15}你方" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货 # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)" self.pattern_secondTenderer_left = "(?P((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))" self.pattern_secondTenderer_right = "(?P^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))" self.pattern_thirdTenderer_left = "(?P(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))" self.pattern_thirdTenderer_right = "(?P^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))" self.pattern_whole = [self.pattern_tenderee_left, self.pattern_tenderee_left_w1, self.pattern_tenderee_center, self.pattern_tenderee_right, self.pattern_tendereeORagency_right, self.pattern_agency_left, self.pattern_agency_right, self.pattern_winTenderer_left, self.pattern_winTenderer_left_w1, self.pattern_winTenderer_whole, self.pattern_winTenderer_right, self.pattern_secondTenderer_left, self.pattern_secondTenderer_right, self.pattern_thirdTenderer_left, self.pattern_thirdTenderer_right ] # 需按顺序排列, 第二、三中标要在中标正则后面 self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"]) self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额") self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况") self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标") self.pattern_money_other = re.compile("代理费|服务费") self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)" def _check_input(self,text, ignore=False): if not text: return [] if not isinstance(text, list): text = [text] null_index = [i for i, t in enumerate(text) if not t] if null_index and not ignore: raise Exception("null text in input ") return text def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5): for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences, list_codenames): list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 按句子顺序排序 # list_name = list_codename["name"] list_name = [] # 2022/1/5 改为实体列表内所有项目名称 for entity in list_entity: if entity.entity_type == 'name': list_name.append(entity.entity_text) list_name = self._check_input(list_name) + [article.title] for p_entity in list_entity: if p_entity.entity_type in ["org", "company"]: # 只解析角色为无的或者概率低于阈值的 if p_entity.label is None: continue # 将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人 if str(p_entity.label) == "0": find_flag = False for _sentence in list_sentence: if _sentence.sentence_index == p_entity.sentence_index: _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=20, center_include=True, word_flag=True, use_text=True, text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text))) for _name in list_name: if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0: find_flag = True if p_entity.values[0] > on_value: p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10 else: p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况 if find_flag: continue # 正则从概率低于阈值或其他类别中召回角色 role_prob = float(p_entity.values[int(p_entity.label)]) if role_prob < on_value or str(p_entity.label) == "5": # 将标题中的实体置为招标人 _list_name = self._check_input(list_name, ignore=True) find_flag = False for _name in _list_name: # 2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人 if str(_name).find(re.sub(")", ")", re.sub("(", "(", p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4: for _sentence in list_sentence: if _sentence.sentence_index == p_entity.sentence_index: _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=20, center_include=True, word_flag=True, use_text=True, text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text))) if str(_span[1] + _span[2][:len(str(_name))]).find( _name) >= 0: find_flag = True _label = 0 p_entity.label = _label p_entity.values[int(_label)] = on_value break if p_entity.sentence_index >= 4: break if find_flag: break # if str(_name).find(p_entity.entity_text)>=0: # find_flag = True # _label = 0 # p_entity.label = _label # p_entity.values[int(_label)] = on_value # break # 若是实体在标题中,默认为招标人,不进行以下的规则匹配 if find_flag: continue for s_index in range(len(list_sentence)): if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \ list_sentence[s_index].sentence_index: tokens = list_sentence[s_index].tokens begin_index = p_entity.begin_index end_index = p_entity.end_index size = 15 spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False) # _flag = False # 使用正则+距离解决冲突 # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1] list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:10], spans[2]] # 实体左、中、右 信息 for _i_span in range(len(list_spans)): _flag = False _prob_weight = 1 # print(list_spans[_i_span],p_entity.entity_text) for _pattern in self.pattern_whole: for _iter in re.finditer(_pattern, list_spans[_i_span]): for _group, _v_group in _iter.groupdict().items(): if _v_group is not None and _v_group != "": _role = _group.split("_")[0] if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑 # print('p_entity_sentenceindex:', p_entity.sentence_index) if p_entity.sentence_index>=1: # 只在第一句进行这种模糊匹配 continue if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\ or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None: _role = 'tenderee' else: _role = "agency" _direct = _group.split("_")[1] _weight = _group.split("_")[2] if len(_group.split("_"))==3 else "" # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2, # "secondTenderer": 3, "thirdTenderer": 4}.get(_role) if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务 list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937 _flag = True _label = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}.get(_role) _prob_weight = 1.2 if _weight=='w1' else 1 # print('_v_group:',_group, _v_group, p_entity.entity_text) if _i_span == 1 and _direct == "center": _flag = True _label = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}.get(_role) _prob_weight = 1.2 if _weight == 'w1' else 1 # print('_v_group:', _group, _v_group, p_entity.entity_text) if _i_span == 2 and _direct == "right": _flag = True _label = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}.get(_role) _prob_weight = 1.2 if _weight == 'w1' else 1 # print('_v_group:', _group, _v_group, p_entity.entity_text) # 得到结果 if _flag: p_entity.label = _label p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10 # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span])) break # 其他金额通过正则召回可能是招标或中投标的金额 if p_entity.entity_type in ["money"]: if str(p_entity.label) == "2": for _sentence in list_sentence: if _sentence.sentence_index == p_entity.sentence_index: _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=20, center_include=True, word_flag=True, text=p_entity.entity_text) if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search( self.pattern_money_other, _span[0]) is None: p_entity.values[0] = 0.8 + p_entity.values[0] / 10 p_entity.label = 0 if re.search(self.pattern_money_tenderer, _span[0]) is not None: if re.search(self.pattern_money_other, _span[0]) is not None: if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \ re.search(self.pattern_money_other, _span[0]).span()[1]: p_entity.values[1] = 0.8 + p_entity.values[1] / 10 p_entity.label = 1 else: p_entity.values[1] = 0.8 + p_entity.values[1] / 10 p_entity.label = 1 if re.search(self.pattern_money_tenderer_whole, "".join(_span)) is not None and re.search(self.pattern_money_other, _span[0]) is None: p_entity.values[1] = 0.8 + p_entity.values[1] / 10 p_entity.label = 1 # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额 list_p = [] state = 0 for p_entity in list_entity: for _sentence in list_sentence: if _sentence.sentence_index == p_entity.sentence_index: _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index, end_index=p_entity.end_index, size=20, center_include=True, word_flag=True, text=p_entity.entity_text) if state == 2: for _p in list_p[1:]: _p.values[0] = 0.8 + _p.values[0] / 10 _p.label = 0 state = 0 list_p = [] if state == 0: if p_entity.entity_type in ["money"]: if str(p_entity.label) == "0" and re.search(self.pattern_pack, _span[0] + "-" + _span[2]) is not None: state = 1 list_p.append(p_entity) elif state == 1: if p_entity.entity_type in ["money"]: if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack, _span[0] + "-" + _span[ 2]) is not None and re.search( self.pattern_money_other, _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[ 0].sentence_index: list_p.append(p_entity) else: state = 2 if len(list_p) > 1: for _p in list_p[1:]: # print("==",_p.entity_text,_p.sentence_index,_p.label) _p.values[0] = 0.8 + _p.values[0] / 10 _p.label = 0 state = 0 list_p = [] for p_entity in list_entity: # 将属于集合中的不可能是中标人的标签置为无 if p_entity.entity_text in self.SET_NOT_TENDERER: p_entity.label = 5 '''正则补充最后一句实体日期格式为招标或代理 2021/12/30''' class RoleRuleFinalAdd(): def predict(self, list_articles,list_sentences, list_entitys, list_codenames): # text_end = list_articles[0].content.split('##attachment##')[0][-40:] main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment] end_tokens = [] for sentence in main_sentences[-5:]: end_tokens.extend(sentence.tokens) text_end = "".join(end_tokens[-30:]) # print(text_end) # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end) sear_ent = re.search('[,。;]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end) sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000]) sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000]) sear_ent4 = re.search('(发布(?:人|单位|机构))[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000]) sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent] tenderee_notfound = True agency_notfound = True ents = [] for ent in list_entitys[0]: if ent.entity_type in ['org', 'company']: if ent.label == 0: tenderee_notfound = False elif ent.label == 1: agency_notfound = False elif ent.label == 5: ents.append(ent) if sear_ent or sear_ent2 or sear_ent3 or sear_ent4: for _sear_ent in [_sear for _sear in sear_list if _sear]: # if sear_ent4: # ent_re = sear_ent4.group(2) # elif sear_ent3: # ent_re = sear_ent3.group(2) # elif sear_ent2: # ent_re = sear_ent2.group(2) # else: # ent_re = sear_ent.group(1) if _sear_ent==sear_ent4: ent_re = _sear_ent.group(2) elif _sear_ent==sear_ent3: ent_re = _sear_ent.group(2) elif _sear_ent==sear_ent2: ent_re = _sear_ent.group(2) else: ent_re = _sear_ent.group(1) # print('ent_re', ent_re) ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")") if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re) or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None): n = 0 for i in range(len(ents) - 1, -1, -1): if not ents[i].in_attachment: n += 1 if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体 break if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6): ents[i].label = 0 ents[i].values[0] = 0.5 tenderee_notfound = False # log('正则最后补充实体: %s'%(ent_re)) break elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re): n = 0 for i in range(len(ents) - 1, -1, -1): if not ents[i].in_attachment: n += 1 if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体 break if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6): ents[i].label = 1 ents[i].values[1] = 0.5 agency_notfound = False # log('正则最后补充实体: %s'%(ent_re)) break if not tenderee_notfound: break elif list_codenames[0]['name'] != "": #把标题包含的公司实体作为招标人 # tenderee_notfound = True # ents = [] # for ent in list_entitys[0]: # if ent.entity_type in ['org', 'company']: # if ent.label == 0: # tenderee_notfound = False # elif ent.label == 1: # agency_notfound = False # elif ent.label == 5: # ents.append(ent) if tenderee_notfound == True: # print('list_codenames',list_codenames[0]['name']) for ent in ents: if ent.entity_text in list_codenames[0]['name']: ent.label = 0 ent.values[0] = 0.5 # log('正则召回标题中包含的实体:%s'%ent.entity_text) break # 招标人角色召回规则 class TendereeRuleRecall(): def __init__(self): self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|" "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::][^。;,]{,5}$") self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|" "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|" "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|" "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|" "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|" "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)") self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)") self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P[^。;,?!::]+)") # 公告主语判断规则 self.subject = re.compile("[我本][院校局]") # 未识别实体召回正则 self.unrecognized1 = re.compile("(?P((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \ "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \ "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P[^,。::;]+)[,。;::]") self.unrecognized2 = re.compile("(?P((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \ "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\ "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P[^,。::;]+)[,。;::]") # 未识别实体尾部判断 self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心)") self.unrecognized_end2 = re.compile(".{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)") def predict(self, list_articles,list_sentences, list_entitys, list_codenames): # tenderee_notfound = True # agency_notfound = True self.get_tenderee = False ents = [] list_name = [] for ent in list_entitys[0]: if ent.entity_type == 'name': list_name.append(ent.entity_text) if ent.entity_type in ['org', 'company']: if ent.label == 0: # tenderee_notfound = False self.get_tenderee = True # elif ent.label == 1: # agency_notfound = False elif ent.label == 5: ents.append(ent) if not self.get_tenderee: self.entity_context_rule(ents,list_name,list_sentences) if not self.get_tenderee: self.subject_rule(ents,list_articles,list_sentences) if not self.get_tenderee: self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55) if not self.get_tenderee: self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5) #entity上下文正则判断 def entity_context_rule(self,entitys,list_name,list_sentences): for ent in entitys: _sentence = list_sentences[0][ent.sentence_index] _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index, end_index=ent.end_index, size=40, center_include=True, word_flag=True, use_text=True, text=re.sub(")", ")", re.sub("(", "(", ent.entity_text))) if re.search(self.tenderee_left,_span[0]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True elif re.search(self.tenderee_right,_span[2]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True elif re.search(self.tenderee_right2, _span[2]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True elif list_name: pj_name = re.search(self.tenderee_right3, _span[2]) if pj_name: pj_name = pj_name.groupdict()["project"] for _name in list_name: if _name in pj_name: ent.label = 0 ent.values[0] = 0.5 self.get_tenderee = True break # 公告主语判断 def subject_rule(self, entitys,list_articles,list_sentences): content = list_articles[0].content.split('##attachment##')[0] if re.search(self.subject,content): _subject = re.search(self.subject,content).group() for ent in entitys: if re.search("院",_subject) and re.search("医院|学院",ent.entity_text): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True elif re.search("局", _subject) and re.search("局", ent.entity_text): _sentence = list_sentences[0][ent.sentence_index] _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index, end_index=ent.end_index, size=20, center_include=True, word_flag=True, use_text=True, text=re.sub(")", ")", re.sub("(", "(", ent.entity_text))) if not re.search("监督|投诉",_span[0][-10:]): ent.label = 0 ent.values[0] = 0.5 + ent.values[0] / 10 self.get_tenderee = True # 正则召回未识别实体 def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5): list_sentence = list_sentences[0] for in_attachment in [False,True]: for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]: sentence_text = sentence.sentence_text tokens = sentence.tokens doc_id = sentence.doc_id in_attachment = sentence.in_attachment list_tokenbegin = [] begin = 0 for i in range(0, len(tokens)): list_tokenbegin.append(begin) begin += len(str(tokens[i])) list_tokenbegin.append(begin + 1) for _match in re.finditer(pattern,sentence_text): _groupdict = _match.groupdict() _match_text = _match.group() _unrecognized_text = _groupdict["unrecognized"] # print(_unrecognized_text) # if _match_text[-1] in [':',':']: # _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text) # if not _unrecognized: # _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text) # if _unrecognized: # _unrecognized = _unrecognized.group() # else: # continue # else: # _unrecognized = _unrecognized_text _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text) if not _unrecognized: _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text) if _unrecognized: _unrecognized = _unrecognized.group() else: continue # print(_unrecognized) if re.search("某",_unrecognized): continue begin_index_temp = _match.start()+len(_groupdict['tenderee_left']) for j in range(len(list_tokenbegin)): if list_tokenbegin[j] == begin_index_temp: begin_index = j break elif list_tokenbegin[j] > begin_index_temp: begin_index = j - 1 break index = begin_index_temp + len(_unrecognized) end_index_temp = index for j in range(begin_index, len(list_tokenbegin)): if list_tokenbegin[j] >= index: end_index = j - 1 break entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index) entity_text = _unrecognized new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index, begin_index_temp, end_index_temp, in_attachment=in_attachment) new_entity.label = 0 new_entity.values = [on_value,0,0,0,0,0] list_entitys[0].append(new_entity) self.get_tenderee = True if self.get_tenderee: list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index)) break # 时间类别 class TimePredictor(): def __init__(self): self.sess = tf.Session(graph=tf.Graph()) self.inputs_code = None self.outputs_code = None self.input_shape = (2,40,128) self.load_model() def load_model(self): model_path = os.path.dirname(__file__)+'/timesplit_model' if self.inputs_code is None: log("get model of time") with self.sess.as_default(): with self.sess.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path) signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs_code = [] self.inputs_code.append( self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)) self.inputs_code.append( self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)) self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name) return self.inputs_code, self.outputs_code else: return self.inputs_code, self.outputs_code def search_time_data(self,list_sentences,list_entitys): data_x = [] points_entitys = [] for list_sentence, list_entity in zip(list_sentences, list_entitys): p_entitys = 0 p_sentences = 0 list_sentence.sort(key=lambda x: x.sentence_index) while(p_entitys= length: break if item_not_space in model_w2v.vocab: embed[out_index][index] = model_w2v[item_not_space] index += 1 else: embed[out_index][index] = model_w2v['unk'] index += 1 out_index += 1 return embed def predict(self, list_sentences,list_entitys): datas = self.search_time_data(list_sentences, list_entitys) if datas is None: return points_entitys = datas[1] with self.sess.as_default(): predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0] ,self.inputs_code[1]:datas[0][1]})[0] for i in range(len(predict_y)): entity = points_entitys[i] label = np.argmax(predict_y[i]) values = [] for item in predict_y[i]: values.append(item) if label != 0: if not timeFormat(entity.entity_text): label = 0 values[0] = 0.5 entity.set_Role(label, values) # 产品字段提取 class ProductPredictor(): def __init__(self): vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk" self.vocab = load(vocabpath) self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab))) self.sess = tf.Session(graph=tf.Graph()) self.load_model() def load_model(self): # model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb' model_path = os.path.dirname(__file__)+'/product_savedmodel/productAndfailreason.pb' with self.sess.as_default(): with self.sess.graph.as_default(): output_graph_def = tf.GraphDef() with open(model_path, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') self.sess.run(tf.global_variables_initializer()) self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0') self.length = self.sess.graph.get_tensor_by_name("Sum:0") self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0") self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0") self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0") def decode(self,logits, lengths, matrix): paths = [] small = -1000.0 # start = np.asarray([[small] * 4 + [0]]) start = np.asarray([[small]*7+[0]]) for score, length in zip(logits, lengths): score = score[:length] pad = small * np.ones([length, 1]) logits = np.concatenate([score, pad], axis=1) logits = np.concatenate([start, logits], axis=0) path, _ = viterbi_decode(logits, matrix) paths.append(path[1:]) return paths def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000): ''' 预测实体代码,每个句子最多取MAX_AREA个字,超过截断 :param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]] :param list_entitys: 多篇公告实体列表 :param MAX_AREA: 每个句子最多截取多少字 :return: 把预测出来的实体放进实体类 ''' with self.sess.as_default() as sess: with self.sess.graph.as_default(): result = [] if fail and list_articles!=[]: text_list = [list_articles[0].content[:MAX_AREA]] chars = [[self.word2index.get(it, self.word2index.get('')) for it in text] for text in text_list] if USE_API: requests_result = requests.post(API_URL + "/predict_product", json={"inputs": chars}, verify=True) batch_paths = json.loads(requests_result.text)['result'] lengths = json.loads(requests_result.text)['lengths'] else: lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran], feed_dict={ self.char_input: np.asarray(chars), self.dropout: 1.0 }) batch_paths = self.decode(scores, lengths, tran_) for text, path, length in zip(text_list, batch_paths, lengths): tags = ''.join([str(it) for it in path[:length]]) for it in re.finditer("12*3", tags): start = it.start() end = it.end() _entity = Entity(doc_id=list_articles[0].doc_id, entity_id="%s_%s_%s_%s" % ( list_articles[0].doc_id, 0, start, end), entity_text=text[start:end], entity_type="product", sentence_index=0, begin_index=0, end_index=0, wordOffset_begin=start, wordOffset_end=end) list_entitys[0].append(_entity) for it in re.finditer("45*6", tags): start = it.start() end = it.end() result.append(text[start:end].replace('?', '').strip()) reasons = [] for it in result: if "(√)" in it or "(√)" in it: reasons = [it] break if reasons != [] and (it not in reasons[-1] and it not in reasons): reasons.append(it) elif reasons == []: reasons.append(it) return {'fail_reason':';'.join(reasons)} if list_entitys is None: list_entitys = [[] for _ in range(len(list_sentences))] for list_sentence, list_entity in zip(list_sentences,list_entitys): if len(list_sentence)==0: result.append({"product":[]}) continue list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True) _begin_index = 0 item = {"product":[]} temp_list = [] while True: MAX_LEN = len(list_sentence[_begin_index].sentence_text) if MAX_LEN > MAX_AREA: MAX_LEN = MAX_AREA _LEN = MAX_AREA//MAX_LEN chars = [sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]] chars = [[self.word2index.get(it, self.word2index.get('')) for it in l] for l in chars] chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post") if USE_API: requests_result = requests.post(API_URL + "/predict_product", json={"inputs": chars.tolist()}, verify=True) batch_paths = json.loads(requests_result.text)['result'] lengths = json.loads(requests_result.text)['lengths'] else: lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran], feed_dict={ self.char_input: np.asarray(chars), self.dropout: 1.0 }) batch_paths = self.decode(scores, lengths, tran_) for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths): tags = ''.join([str(it) for it in path[:length]]) for it in re.finditer("12*3", tags): start = it.start() end = it.end() _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % ( sentence.doc_id, sentence.sentence_index, start, end), entity_text=sentence.sentence_text[start:end], entity_type="product", sentence_index=sentence.sentence_index, begin_index=0, end_index=0, wordOffset_begin=start, wordOffset_end=end,in_attachment=sentence.in_attachment) list_entity.append(_entity) temp_list.append(sentence.sentence_text[start:end]) # item["product"] = list(set(temp_list)) # result.append(item) if _begin_index+_LEN >= len(list_sentence): break _begin_index += _LEN item["product"] = list(set(temp_list)) result.append(item) # 修正bug return {'fail_reason': ""} # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取 class ProductAttributesPredictor(): def __init__(self,): self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)' self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称' with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f: self.header_set = pickle.load(f) def isTrueTable(self, table): '''真假表格规则: 1、包含或标签为真 2、包含大量链接、表单、图片或嵌套表格为假 3、表格尺寸太小为假 4、外层嵌套子
,一般子为真,外为假''' if table.find_all(['caption', 'th']) != []: return True elif len(table.find_all(['form', 'a', 'img'])) > 5: return False elif len(table.find_all(['tr'])) < 2: return False elif len(table.find_all(['table'])) >= 1: return False else: return True def getTrs(self, tbody): # 获取所有的tr trs = [] objs = tbody.find_all(recursive=False) for obj in objs: if obj.name == "tr": trs.append(obj) if obj.name == "tbody": for tr in obj.find_all("tr", recursive=False): trs.append(tr) return trs def getTable(self, tbody): trs = self.getTrs(tbody) inner_table = [] if len(trs) < 2: return inner_table for tr in trs: tr_line = [] tds = tr.findChildren(['td', 'th'], recursive=False) if len(tds) < 2: continue for td in tds: td_text = re.sub('\s', '', td.get_text()) tr_line.append(td_text) inner_table.append(tr_line) return inner_table def fixSpan(self, tbody): # 处理colspan, rowspan信息补全问题 trs = self.getTrs(tbody) ths_len = 0 ths = list() trs_set = set() # 修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱 # 遍历每一个tr for indtr, tr in enumerate(trs): ths_tmp = tr.findChildren('th', recursive=False) # 不补全含有表格的tr if len(tr.findChildren('table')) > 0: continue if len(ths_tmp) > 0: ths_len = ths_len + len(ths_tmp) for th in ths_tmp: ths.append(th) trs_set.add(tr) # 遍历每行中的element tds = tr.findChildren(recursive=False) if len(tds) < 3: continue # 列数太少的不补全 for indtd, td in enumerate(tds): # 若有colspan 则补全同一行下一个位置 if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "": col = int(re.sub("[^0-9]", "", str(td['colspan']))) if col < 10 and len(td.get_text()) < 500: td['colspan'] = 1 for i in range(1, col, 1): td.insert_after(copy.copy(td)) for indtr, tr in enumerate(trs): ths_tmp = tr.findChildren('th', recursive=False) # 不补全含有表格的tr if len(tr.findChildren('table')) > 0: continue if len(ths_tmp) > 0: ths_len = ths_len + len(ths_tmp) for th in ths_tmp: ths.append(th) trs_set.add(tr) # 遍历每行中的element tds = tr.findChildren(recursive=False) same_span = 0 if len(tds) > 1 and 'rowspan' in tds[0].attrs: span0 = tds[0].attrs['rowspan'] for td in tds: if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0: same_span += 1 if same_span == len(tds): continue for indtd, td in enumerate(tds): # 若有rowspan 则补全下一行同样位置 if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "": row = int(re.sub("[^0-9]", "", str(td['rowspan']))) td['rowspan'] = 1 for i in range(1, row, 1): # 获取下一行的所有td, 在对应的位置插入 if indtr + i < len(trs): tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False) if len(tds1) >= (indtd) and len(tds1) > 0: if indtd > 0: tds1[indtd - 1].insert_after(copy.copy(td)) else: tds1[0].insert_before(copy.copy(td)) elif len(tds1) > 0 and len(tds1) == indtd - 1: tds1[indtd - 2].insert_after(copy.copy(td)) def get_monthlen(self, year, month): '''输入年份、月份 int类型 得到该月份天数''' try: weekday, num = calendar.monthrange(int(year), int(month)) except: num = 30 return str(num) def fix_time(self, text, html, page_time): '''输入日期字段返回格式化日期''' for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'), ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]: if it[0] in text: text = text.replace(it[0], it[1]) if re.search('^\d{1,2}月$', text): m = re.search('^(\d{1,2})月$', text).group(1) if len(m) < 2: m = '0' + m year = re.search('(\d{4})年(.{,12}采购意向)?', html) if year: y = year.group(1) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) elif page_time != "": year = re.search('\d{4}', page_time) if year: y = year.group(0) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) else: y = str(datetime.datetime.now().year) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) else: y = str(datetime.datetime.now().year) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) return order_begin, order_end t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text) if t1: year = t1.group(1) month = t1.group(3) num = self.get_monthlen(year, month) if len(month)<2: month = '0'+month if len(num) < 2: num = '0'+num order_begin = "%s-%s-01" % (year, month) order_end = "%s-%s-%s" % (year, month, num) return order_begin, order_end t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text) if t2: y = t2.group(1) m = t2.group(3) d = t2.group(5) m = '0'+ m if len(m)<2 else m d = '0'+d if len(d)<2 else d order_begin = order_end = "%s-%s-%s"%(y,m,d) return order_begin, order_end # 时间样式:"202105" t3 = re.search("^(20\d{2})(\d{1,2})$",text) if t3: year = t3.group(1) month = t3.group(2) if int(month)>0 and int(month)<=12: num = self.get_monthlen(year, month) if len(month) < 2: month = '0' + month if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (year, month) order_end = "%s-%s-%s" % (year, month, num) return order_begin, order_end # 时间样式:"20210510" t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text) if t4: year = t4.group(1) month = t4.group(2) day = t4.group(3) if int(month) > 0 and int(month) <= 12 and int(day)>0 and int(day)<=31: order_begin = order_end = "%s-%s-%s"%(year,month,day) return order_begin, order_end all_match = re.finditer('^(?P\d{4})(年|/|\.)(?P\d{1,2})(?:(月|/|\.)(?:(?P\d{1,2})日)?)?' '(到|至|-)(?:(?P\d{4})(年|/|\.))?(?P\d{1,2})(?:(月|/|\.)' '(?:(?P\d{1,2})日)?)?$', text) y1 = m1 = d1 = y2 = m2 = d2 = "" found_math = False for _match in all_match: if len(_match.group()) > 0: found_math = True for k, v in _match.groupdict().items(): if v!="" and v is not None: if k == 'y1': y1 = v elif k == 'm1': m1 = v elif k == 'd1': d1 = v elif k == 'y2': y2 = v elif k == 'm2': m2 = v elif k == 'd2': d2 = v if not found_math: return "", "" y2 = y1 if y2 == "" else y2 d1 = '1' if d1 == "" else d1 d2 = self.get_monthlen(y2, m2) if d2 == "" else d2 m1 = '0' + m1 if len(m1) < 2 else m1 m2 = '0' + m2 if len(m2) < 2 else m2 d1 = '0' + d1 if len(d1) < 2 else d1 d2 = '0' + d2 if len(d2) < 2 else d2 order_begin = "%s-%s-%s"%(y1,m1,d1) order_end = "%s-%s-%s"%(y2,m2,d2) return order_begin, order_end def find_header(self, items, p1, p2): ''' inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容 :param items: 列表,内容为每个td 文本内容 :param p1: 优先表头正则 :param p2: 第二表头正则 :return: 表头所在列序号,是否表头,表头内容 ''' flag = False header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''} product = "" # 产品 quantity = "" # 数量 unitPrice = "" # 单价 brand = "" # 品牌 specs = "" # 规格 demand = "" # 采购需求 budget = "" # 预算金额 order_time = "" # 采购时间 for i in range(min(4, len(items))): it = items[i] if len(it) < 15 and re.search(p1, it) != None: flag = True product = it header_dic['名称'] = i break if not flag: for i in range(min(4, len(items))): it = items[i] if len(it) < 15 and re.search(p2, it) and re.search( '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None: flag = True product = it header_dic['名称'] = i break if flag: for j in range(i + 1, len(items)): if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10: continue if header_dic['数量']=="" and re.search('数量', items[j]): header_dic['数量'] = j quantity = items[j] elif re.search('单价', items[j]): header_dic['单价'] = j unitPrice = items[j] elif re.search('品牌', items[j]): header_dic['品牌'] = j brand = items[j] elif re.search('规格', items[j]): header_dic['规格'] = j specs = items[j] elif re.search('需求', items[j]): header_dic['需求'] = j demand = items[j] elif re.search('预算', items[j]): header_dic['预算'] = j budget = items[j] elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]): header_dic['时间'] = j order_time = items[j] if header_dic.get('名称', "") != "" : num = 0 for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time): if it != "": num += 1 if num >=2: return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time) flag = False return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time) def predict(self, docid='', html='', page_time=""): ''' 正则寻找table表格内 产品相关信息 :param html:公告HTML原文 :return:公告表格内 产品、数量、单价、品牌、规格 ,表头,表头列等信息 ''' soup = BeautifulSoup(html, 'lxml') flag_yx = True if re.search('采购意向', html) else False tables = soup.find_all(['table']) headers = [] headers_demand = [] header_col = [] product_link = [] demand_link = [] total_product_money = 0 for i in range(len(tables)-1, -1, -1): table = tables[i] if table.parent.name == 'td' and len(table.find_all('td')) <= 3: table.string = table.get_text() table.name = 'turntable' continue if not self.isTrueTable(table): continue self.fixSpan(table) inner_table = self.getTable(table) i = 0 found_header = False header_colnum = 0 if flag_yx: col0_l = [] col1_l = [] for tds in inner_table: if len(tds) == 2: col0_l.append(re.sub(':', '', tds[0])) col1_l.append(tds[1]) if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2: header_list2 = [] product = demand = budget = order_begin = order_end = "" for i in range(len(col0_l)): if re.search('项目名称', col0_l[i]): header_list2.append(col0_l[i]) product = col1_l[i] elif re.search('采购需求|需求概况', col0_l[i]): header_list2.append(col0_l[i]) demand = col1_l[i] elif re.search('采购预算|预算金额', col0_l[i]): header_list2.append(col0_l[i]) budget = col1_l[i] re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget) if re_price: budget = re_price[0] if '万元' in col0_l[i] and '万' not in budget: budget += '万元' budget = str(getUnifyMoney(budget)) else: budget = "" elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]): header_list2.append(col0_l[i]) order_time = col1_l[i].strip() order_begin, order_end = self.fix_time(order_time, html, page_time) if order_begin != "" and order_end!="": order_begin_year = int(order_begin.split("-")[0]) order_end_year = int(order_end.split("-")[0]) # 限制附件错误识别时间 if order_begin_year>=2050 or order_end_year>=2050: order_begin = order_end = "" if product!= "" and demand != "" and budget!="" and order_begin != "": link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget, 'order_begin': order_begin, 'order_end': order_end} if link not in demand_link: demand_link.append(link) headers_demand.append('_'.join(header_list2)) continue while i < (len(inner_table)): tds = inner_table[i] not_empty = [it for it in tds if it != ""] if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2: i += 1 continue product = "" # 产品 quantity = "" # 数量 unitPrice = "" # 单价 brand = "" # 品牌 specs = "" # 规格 demand = "" # 采购需求 budget = "" # 预算金额 order_time = "" # 采购时间 order_begin = "" order_end = "" if len(set(tds) & self.header_set) > len(tds) * 0.2: header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2) if found_header: headers.append('_'.join(header_list)) headers_demand.append('_'.join(header_list2)) header_colnum = len(tds) header_col.append('_'.join(tds)) i += 1 continue elif found_header: if len(tds) != header_colnum: # 表头、属性列数不一致跳过 i += 1 continue id1 = header_dic.get('名称', "") id2 = header_dic.get('数量', "") id3 = header_dic.get('单价', "") id4 = header_dic.get('品牌', "") id5 = header_dic.get('规格', "") id6 = header_dic.get('需求', "") id7 = header_dic.get('预算', "") id8 = header_dic.get('时间', "") if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \ re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None: product = tds[id1] if id2 != "": if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]): quantity = tds[id2] else: quantity = "" if id3 != "": if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]): unitPrice = tds[id3] re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?",unitPrice) if re_price: unitPrice = re_price[0] if '万元' in header_list[2] and '万' not in unitPrice: unitPrice += '万元' # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice) unitPrice = str(getUnifyMoney(unitPrice)) else: unitPrice = "" else: unitPrice = "" if id4 != "": if re.search('\w', tds[id4]): brand = tds[id4] else: brand = "" if id5 != "": if re.search('\w', tds[id5]): specs = tds[id5] else: specs = "" if id6 != "": if re.search('\w', tds[id6]): demand = tds[id6] else: demand = "" if id7 != "": if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]): budget = tds[id7] re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget) if re_price: budget = re_price[0] if '万元' in header_list[2] and '万' not in budget: budget += '万元' budget = str(getUnifyMoney(budget)) else: budget = "" else: budget = "" if id8 != "": if re.search('\w', tds[id8]): order_time = tds[id8].strip() order_begin, order_end = self.fix_time(order_time, html, page_time) if quantity != "" or unitPrice != "" or brand != "" or specs != "": link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice, 'brand': brand[:50], 'specs':specs} if link not in product_link: product_link.append(link) mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity']) if link['unitPrice'] != "" and mat: try: total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) except: log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity'])) if order_begin != "" and order_end != "": order_begin_year = int(order_begin.split("-")[0]) order_end_year = int(order_end.split("-")[0]) # 限制附件错误识别时间 if order_begin_year >= 2050 or order_end_year >= 2050: order_begin = order_end = "" if budget != "" and order_time != "": link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end} if link not in demand_link: demand_link.append(link) i += 1 else: i += 1 if len(product_link)>0: attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}} else: attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}} if len(demand_link)>0: demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}} else: demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}} return [attr_dic, demand_dic], total_product_money def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""): if len(prem[0]['prem'])==1: list_sentence = list_sentences[0] list_entity = list_entitys[0] _data = product_attrs[1]['demand_info']['data'] re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$") order_times = [] for entity in list_entity: if entity.entity_type=='time': sentence = list_sentence[entity.sentence_index] s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index,size=20) entity_left = "".join(s[0]) if re.search(re_bidding_time,entity_left): time_text = entity.entity_text.strip() standard_time = re.compile("((?P\d{4}|\d{2})\s*[-\/年\.]\s*(?P\d{1,2})\s*[-\/月\.]\s*(?P\d{1,2})日?)") time_match = re.search(standard_time,time_text) if time_match: time_text = time_match.group() order_times.append(time_text) # print(order_times) order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times] order_times = [order_time for order_time in order_times if order_time[0]!=""] if len(set(order_times))==1: order_begin,order_end = order_times[0] project_name = codeName[0]['name'] pack_info = [pack for pack in prem[0]['prem'].values()] budget = pack_info[0].get('tendereeMoney',0) product = prem[0]['product'] link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget, 'order_begin': order_begin, 'order_end': order_end} _data.append(link) product_attrs[1]['demand_info']['data'] = _data return product_attrs # docchannel类型提取 class DocChannel(): def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'): self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\ self.mask, self.mask_title = self.load_life(life_model) self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\ self.type_mask, self.type_mask_title = self.load_type(type_model) self.sequen_len = 200 # 150 200 self.title_len = 30 self.sentence_num = 10 self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预' lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯'] lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告'] self.id2type = {k: v for k, v in enumerate(lb_type)} self.id2life = {k: v for k, v in enumerate(lb_life)} self.load_pattern() def load_pattern(self): self.type_dic = { '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地', '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会', '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)', '采招数据': '(采购|招标|代理)(人|机构|单位)|(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;' #|变更|答疑|澄清|中标|成交|合同|废标|流标 } self.title_type_dic = { '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地', '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍', '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让', '采招数据': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)', # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标 '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)' } self.life_dic = { '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示', '招标预告': '预计(采购|招标)(时间|日期)', '招标公告': '(采购|招标|竞选|报名)条件;报名时间;报名流程;报名方法;报名需提供的材料;参加竞价采购交易资格;(申请人|投标人|供应商|报价人|参选人)的?资格要求;获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件;(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)', '资审结果': '招标资审公告|评审入围公示|资审及业绩公示|资格后审情况报告|资格后审结果公告|资格后审结果公示|资格预审结果公告|资格预审结果公示|预审公示|预审结果公示', '招标答疑': '现澄清为|答疑澄清公告|异议的回复|(最高(投标)?限价|控制价|拦标价)公示', '公告变更': '原公告(主要)?(信息|内容)|变更[前后]内容|现在?(变更|更正|修改|更改)为|(变更|更正)内容为|更正理由|更正人名称|[、\s](更正信息|更正内容):', '候选人公示': '候选人公示|评标结果公示', '中标信息': '供地结果信息|采用单源直接采购的?情况说明|现将\w{,4}(成交|中标|中选|选定结果|选取结果)\w{2,8}(进行公示|公示如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|(中标(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]', '中标信息2': '(成交|中标)(日期|时间)[::\s]|成交金额:', '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', '合同公告': '合同(公告|公示)信息;合同(公告|公示)日期;合同(公告|公示)内容;合同编号;合同名称;合同签订日期;合同主体;供应商乙方', '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|(按|做|作)(流标|废标)处理)', } self.title_life_dic = { '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示', '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|供应计划$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)', '公告变更': '(变更|更正(事项)?|更改|延期|暂停)的?(公告|公示|通知)|变更$|更正$', '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)公示', '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销|取消成交)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)', '合同公告': '(合同(成交)?|履约验收|履约|验收结果)(公告|公示|信息|公式)|合同备案|合同书', # 合同$| '候选人公示': '候选人公示|评标(结果)?公示|中标前?公示|中标预公示', '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)结果|开标(记录|信息|情况)|中标通知书|中标$', # '资审结果': '(资质|资格)(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)(审查|预审)结果(公示)?|资审结果公示|未?入围(公示|公告)|资审及业绩公示', '资审结果': '((资格|资质)(审查|预审|后审|审核|入围项?目?)|资审|入围)结果(公告|公示)?|(资质|资格)(预审|后审|入围)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|未?入围(公示|公告)|资审及业绩公示', '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)', } self.wrong_win = '按项目控制价下浮\d%即为成交价|不得确定为(中标|成交)|招标人按下列原则选择中标人|确定成交供应商:|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)|竞拍起止时间:' def load_life(self,life_model): with tf.Graph().as_default() as graph: output_graph_def = graph.as_graph_def() with open(os.path.dirname(__file__)+life_model, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') # print("%d ops in the final graph" % len(output_graph_def.node)) del output_graph_def sess = tf.Session(graph=graph) sess.run(tf.global_variables_initializer()) inputs = sess.graph.get_tensor_by_name('inputs/inputs:0') prob = sess.graph.get_tensor_by_name('inputs/dropout:0') title = sess.graph.get_tensor_by_name('inputs/title:0') mask = sess.graph.get_tensor_by_name('inputs/mask:0') mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0') # logit = sess.graph.get_tensor_by_name('output/logit:0') softmax = sess.graph.get_tensor_by_name('output/softmax:0') return sess, title, inputs, prob, softmax, mask, mask_title def load_type(self,type_model): with tf.Graph().as_default() as graph: output_graph_def = graph.as_graph_def() with open(os.path.dirname(__file__)+type_model, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') # print("%d ops in the final graph" % len(output_graph_def.node)) del output_graph_def sess = tf.Session(graph=graph) sess.run(tf.global_variables_initializer()) inputs = sess.graph.get_tensor_by_name('inputs/inputs:0') prob = sess.graph.get_tensor_by_name('inputs/dropout:0') title = sess.graph.get_tensor_by_name('inputs/title:0') mask = sess.graph.get_tensor_by_name('inputs/mask:0') mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0') # logit = sess.graph.get_tensor_by_name('output/logit:0') softmax = sess.graph.get_tensor_by_name('output/softmax:0') return sess, title, inputs, prob, softmax, mask, mask_title def predict_process(self, docid='', doctitle='', dochtmlcon=''): # print('准备预处理') def get_kw_senten(s, span=10): doc_sens = [] tmp = 0 num = 0 end_idx = 0 for it in re.finditer(self.kws, s): # '|'.join(keywordset) left = s[end_idx:it.end()].split() right = s[it.end():].split() tmp_seg = s[tmp:it.start()].split() if len(tmp_seg) > span or tmp == 0: doc_sens.append(' '.join(left[-span:] + right[:span])) end_idx = it.end() + 1 + len(' '.join(right[:span])) tmp = it.end() num += 1 if num >= self.sentence_num: break if doc_sens == []: doc_sens.append(s) return doc_sens def word2id(wordlist, max_len=self.sequen_len): ids = [getIndexOfWords(w) for w in wordlist] ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids)) assert len(ids) == max_len return ids cost_time = dict() datas = [] datas_title = [] try: segword_title = ' '.join(selffool.cut(doctitle)[0]) segword_content = dochtmlcon except: segword_content = '' segword_title = '' if isinstance(segword_content, float): segword_content = '' if isinstance(segword_title, float): segword_title = '' segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \ replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \ replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止') segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title) segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content) doc_word_list = segword_content.split() if len(doc_word_list) > self.sequen_len / 2: doc_sens = get_kw_senten(' '.join(doc_word_list[100:500])) doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens) else: doc_sens = ' '.join(doc_word_list[:self.sequen_len]) # print('标题:',segword_title) # print('正文:',segword_content) datas.append(doc_sens.split()) datas_title.append(segword_title.split()) # print('完成预处理') return datas, datas_title def is_houxuan(self, title, content): ''' 通过标题和中文内容判断是否属于候选人公示类别 :param title: 公告标题 :param content: 公告正文文本内容 :return: 1 是候选人公示 ;0 不是 ''' if re.search('候选人的?公示|评标结果|评审结果|中标公示', title): # (中标|成交|中选|入围) if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title): return 0 return 1 if re.search('候选人的?公示', content[:100]): if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]): return 0 return 1 else: return 0 def predict(self, title='', list_sentence='', web_source_no='', original_docchannel=''): not_extract_dic = { 104: '招标文件', 106: '法律法规', 107: '新闻资讯', 108: '拟建项目', 109: '展会推广', 110: '企业名录', 111: '企业资质', 112: '全国工程人员', 113: '业主采购' } if original_docchannel in not_extract_dic: return {'docchannel': {'docchannel':'', 'doctype':not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}} if web_source_no in ['02104-7']: return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}} if isinstance(list_sentence, list): token_l = [it.tokens for it in list_sentence] tokens = [it for l in token_l for it in l] content = ' '.join(tokens[:500]) title = re.sub('[^\u4e00-\u9fa5]', '', title) if len(title)>50: title = title[:20]+title[-30:] data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字 text_len = len(data_content[0]) if len(data_content[0])]*richTextFetch', html) if ser: html = html[:ser.start()]+'##richTextFetch##' text = re.sub('<[^<]*?>', '', html).replace(' ', ' ') text = re.sub('\s+', ' ', text) text = re.sub('[/|[()()]', '', text) text = cut_single_cn_space(text) return text[:20000] def count_diffser(pattern, text): num = 0 kw = [] for p in pattern.split(';'): if re.search(p, text): num += 1 kw.append(re.search(p, text).group(0)) return num, ';'.join(kw) def is_contain_winner(extract_json): if re.search('win_tenderer', extract_json): return True else: return False def is_single_source(bidway, title): if re.search('单一来源|单一性采购', title): return True elif bidway == '单一来源': return True else: return False def get_type(title, text): if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text): # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None if re.search(self.title_type_dic['采招数据'], title + text[:50]): return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0) return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0) elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)): if re.search(self.title_type_dic['采招数据'], title + text[:50]): return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0) return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0) elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text): if re.search(self.title_type_dic['采招数据'], title + text[:50]): return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0) return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0) elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text): return '采招数据', ( re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group( 0) elif re.search(self.title_type_dic['新闻资讯'], title): if re.search(self.title_type_dic['采招数据'], title + text[:150]): return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:150]).group(0) return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0) else: return '', '没有公告类型关键词,返回空' def get_life(title, text, extract_json="", bidway="", original_docchannel=''): if re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100]): if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text): return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group( 0) elif re.search(self.title_life_dic['候选人公示'], title): return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0) elif re.search(self.title_life_dic['中标信息'], title): return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0) elif re.search('终止|废标|流标', title): return '废标公告', re.search('终止|废标|流标', title).group(0) elif is_single_source(bidway, title): return '中标信息', 'bidway单一来源' return '采购意向', ( re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100])).group(0) elif re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text): if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text): return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group( 0) elif re.search(self.title_life_dic['候选人公示'], title): return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0) elif re.search(self.title_life_dic['中标信息'], title): return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0) elif re.search('终止|废标|流标', title): return '废标公告', re.search('终止|废标|流标', title).group(0) elif is_single_source(extract_json, title): return '中标信息', 'bidway单一来源' return '招标预告', (re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text)).group(0) elif re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text): if re.search(self.title_life_dic['废标公告'], title): return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0) # elif re.search('(中标|成交)结果', title[-8:]): # return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0) return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(0) elif re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or len( re.findall('(答:|回复:)', text)) >= 2: # or re.search(self.title_life_dic['招标答疑'], text[:150]) if re.search(self.title_life_dic['废标公告'], title): return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0) elif re.search('(中标|成交)结果', title[-8:]): return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0) return '招标答疑', ( re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or re.search( '(答:|回复:)', text)).group(0) elif re.search(self.title_life_dic['废标公告'], title+ text[:150]) or re.search(self.life_dic['废标公告'], text[:150]): return '废标公告', ( re.search(self.title_life_dic['废标公告'], title+ text[:150]) or re.search(self.life_dic['废标公告'], text[:150])).group(0) elif re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150]): if re.search('候选人|公示期?(已?满|已经?结束)|中标(结果|公告)', text) == None: return '中标信息', '候选人公示排除,修改为中标信息' return '候选人公示', ( re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150])).group( 0) elif re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'], text[ :150]): return '合同公告', (re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'], text[:150]) or re.search( self.life_dic['合同公告'], text)).group(0) elif re.search(self.life_dic['合同公告'].replace(';', '|'), text): # or re.search(self.life_dic['合同公告'], text[:300]): num, kw = count_diffser(self.life_dic['合同公告'], text) if num >= 3: return '合同公告', kw elif re.search(self.title_life_dic['招标公告'], title[-8:]): return '招标公告', re.search(self.title_life_dic['招标公告'], title[-8:]).group(0) elif not is_contain_winner(extract_json): return '', '有合同关键词无中标角色返回空' return '合同公告', re.search(self.life_dic['合同公告'].replace(';', '|'), text).group(0) elif is_single_source(extract_json, title): return '中标信息', '单一来源采购' elif re.search(self.title_life_dic['中标信息'], title): if re.search(self.title_life_dic['资审结果'], title+text[:150]): return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0) return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0) elif re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:]): if re.search(self.title_life_dic['资审结果'], title+text[:150]): return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0) # if re.search(self.wrong_win, text): # return '招标公告', re.search(self.wrong_win, text).group(0) return '中标信息', ( re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:])).group( 0) elif re.search(self.life_dic['中标信息2'], text[:]): if re.search(self.wrong_win, text): return '招标公告', re.search(self.wrong_win, text).group(0) return '中标信息', re.search(self.life_dic['中标信息2'], text[:]).group(0) elif re.search(self.life_dic['中标信息3'], text[:]) and is_contain_winner(extract_json): if re.search(self.wrong_win, text): return '招标公告', re.search(self.wrong_win, text).group(0) return '中标信息', re.search(self.life_dic['中标信息3'], text[:]).group(0) elif re.search('公开选取.{,20}机构的公告', title): if re.search('(中标|成交|中选)(中介|服务)?机构(名称)?[::\s]', text): return '中标信息', '机构选取有中选机构' else: return '招标公告', '公开选取机构' elif is_contain_winner(extract_json): num, kw = count_diffser(self.life_dic['招标公告'], text) if re.search(self.wrong_win, text): return '招标公告', re.search(self.wrong_win, text).group(0) elif num >= 2: return '招标公告', kw elif re.search('##richTextFetch##', text): return '', '提取到中标人但包含附件返回空' return '中标信息', '提取到中标人' elif re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:]): return '资审结果', (re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:])).group(0) elif re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'), text[:]): if re.search('意向|预告|变更|更正|中标|中选|成交|答疑|废标|流标|终止', title): return '', '招标正则召回标题有其他类别关键词,返回空' return '招标公告', (re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'), text[:])).group(0) else: return '', '未预测到关键词, 返回空' not_extract_dic = { 104: '招标文件', 106: '法律法规', 107: '新闻资讯', 108: '拟建项目', 109: '展会推广', 110: '企业名录', 111: '企业资质', 112: '全国工程人员', 113: '业主采购' } if original_docchannel in not_extract_dic: return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel]}} if web_source_no in ['02104-7', '04733']: # 这些数据源无法识别 return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}} title = re.sub('[^\u4e00-\u9fa5]', '', title) if len(title) > 50: title = title[:20] + title[-30:] text = html2text(html) prem_json = json.dumps(prem, ensure_ascii=False) result = {'docchannel': {'docchannel': '', 'doctype': ''}} doc_type, type_kw = get_type(title, text) doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel) if doc_type in self.title_type_dic: result['docchannel']['doctype'] = doc_type if doc_life in self.title_life_dic: result['docchannel']['docchannel'] = doc_life if doc_type=="" or doc_life=="": list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index) token_l = [it.tokens for it in list_sentence] tokens = [it for l in token_l for it in l] content = ' '.join(tokens[:500]) data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字 text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128)) array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128)) if doc_type == "": pred = self.type_sess.run(self.type_softmax, feed_dict={ self.type_title: array_title, self.type_content: array_content, self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)], self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)], self.type_prob: 1} ) id = np.argmax(pred, axis=1)[0] prob = pred[0][id] result['docchannel']['doctype'] = self.id2type[id] # print('公告类别:', self.id2type[id], '概率:',prob) # if id == 0: if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']: if len(text)>150 and re.search(self.kws, content): pred = self.lift_sess.run(self.lift_softmax, feed_dict={ self.lift_title: array_title, self.lift_content: array_content, self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)], self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)], self.lift_prob: 1} ) id = np.argmax(pred, axis=1)[0] prob = pred[0][id] if self.id2life[id] == '中标信息' and original_docchannel in [52, '52', '招标公告'] and not is_contain_winner(prem_json): result['docchannel']['docchannel'] = '招标公告' else: result['docchannel']['docchannel'] = self.id2life[id] # print('生命周期:',self.id2life[id], '概率:',prob) # if id == 6: if result['docchannel']['docchannel'] == '中标信息': if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])): result['docchannel']['docchannel'] = '候选人公示' # return '候选人公示', prob # return [{'docchannel': '候选人公示'}] # print('公告类型:%s, 生命周期:%s, 关键词:%s '%(doc_type, doc_life, life_kw)) # print('result: ', result) return result # 保证金支付方式提取 class DepositPaymentWay(): def __init__(self,): self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})' self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式' kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)', '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码', '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出', '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函'] self.kws = sorted(kws, key=lambda x: len(x), reverse=True) def predict(self,content): pay_way = {'deposit_patment_way':''} result = [] pay = re.search(self.pt, content) if pay: # print(pay.group(0)) pay = pay.group(3) for it in re.finditer('|'.join(self.kws), pay): result.append(it.group(0)) pay_way['deposit_patment_way'] = ';'.join(result) return pay_way pay = re.search(self.pt2, content) if pay: # print(pay.group(0)) pay = pay.group(2) for it in re.finditer('|'.join(self.kws), pay): result.append(it.group(0)) pay_way['deposit_patment_way'] = ';'.join(result) return pay_way else: return pay_way # 总价单价提取 class TotalUnitMoney: def __init__(self): pass def predict(self, list_sentences, list_entitys): for i in range(len(list_entitys)): list_entity = list_entitys[i] # 总价单价 for _entity in list_entity: if _entity.entity_type == 'money': word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text # 总价在中投标金额中 if _entity.label == 1: result = extract_total_money(word_of_sentence, _entity.entity_text, [_entity.wordOffset_begin, _entity.wordOffset_end]) if result: _entity.is_total_money = 1 # 单价在普通金额中 else: result = extract_unit_money(word_of_sentence, _entity.entity_text, [_entity.wordOffset_begin, _entity.wordOffset_end]) if result: _entity.is_unit_money = 1 # print("total_unit_money", _entity.entity_text, # _entity.is_total_money, _entity.is_unit_money) def getSavedModel(): #predictor = FormPredictor() graph = tf.Graph() with graph.as_default(): model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score}) #print(tf.graph_util.remove_training_nodes(model)) tf.saved_model.simple_save( tf.keras.backend.get_session(), "./h5_savedmodel/", inputs={"image": model.input}, outputs={"scores": model.output} ) def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights): ''' model = models.Sequential() model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True))) crf = CRF(len(chunk_tags), sparse_target=True) model.add(crf) model.summary() model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy]) return model ''' input = layers.Input(shape=(None,),dtype="int32") if weights is not None: embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input) else: embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input) bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding) bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm) crf = CRF(len(chunk_tags),sparse_target=True) crf_out = crf(bilstm_dense) model = models.Model(input=[input],output = [crf_out]) model.summary() model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy]) return model import h5py def h5_to_graph(sess,graph,h5file): f = h5py.File(h5file,'r') #打开h5文件 def getValue(v): _value = f["model_weights"] list_names = str(v.name).split("/") for _index in range(len(list_names)): print(v.name) if _index==1: _value = _value[list_names[0]] _value = _value[list_names[_index]] return _value.value def _load_attributes_from_hdf5_group(group, name): """Loads attributes of the specified name from the HDF5 group. This method deals with an inherent problem of HDF5 file which is not able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes. # Arguments group: A pointer to a HDF5 group. name: A name of the attributes to load. # Returns data: Attributes data. """ if name in group.attrs: data = [n.decode('utf8') for n in group.attrs[name]] else: data = [] chunk_id = 0 while ('%s%d' % (name, chunk_id)) in group.attrs: data.extend([n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]]) chunk_id += 1 return data def readGroup(gr,parent_name,data): for subkey in gr: print(subkey) if parent_name!=subkey: if parent_name=="": _name = subkey else: _name = parent_name+"/"+subkey else: _name = parent_name if str(type(gr[subkey]))=="": readGroup(gr[subkey],_name,data) else: data.append([_name,gr[subkey].value]) print(_name,gr[subkey].shape) layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names') list_name_value = [] readGroup(f["model_weights"], "", list_name_value) ''' for k, name in enumerate(layer_names): g = f["model_weights"][name] weight_names = _load_attributes_from_hdf5_group(g, 'weight_names') #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names] for weight_name in weight_names: list_name_value.append([weight_name,np.asarray(g[weight_name])]) ''' for name_value in list_name_value: name = name_value[0] ''' if re.search("dense",name) is not None: name = name[:7]+"_1"+name[7:] ''' value = name_value[1] print(name,graph.get_tensor_by_name(name),np.shape(value)) sess.run(tf.assign(graph.get_tensor_by_name(name),value)) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f] adam_vars = [] for _vars in not_initialized_vars: if re.search("Adam",_vars.name) is not None: adam_vars.append(_vars) print([str(i.name) for i in adam_vars]) # only for testing if len(adam_vars): sess.run(tf.variables_initializer(adam_vars)) def save_codename_model(): # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5" filepath = "../projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt" vocabpath = "../projectCode/models/vocab.pk" classlabelspath = "../projectCode/models/classlabels.pk" # vocab = load(vocabpath) # class_labels = load(classlabelspath) w2v_matrix = load('codename_w2v_matrix.pk') graph = tf.get_default_graph() with graph.as_default() as g: '''''' # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None) #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function}) sess = tf.Session(graph=g) # sess = tf.keras.backend.get_session() char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix) #with sess.as_default(): sess.run(tf.global_variables_initializer()) # print(sess.run("time_distributed_1/kernel:0")) # model.load_weights(filepath) saver = tf.train.Saver() saver.restore(sess, filepath) # print("logits",sess.run(logits)) # print("#",sess.run("time_distributed_1/kernel:0")) # x = load("codename_x.pk") #y = model.predict(x) # y = sess.run(model.output,feed_dict={model.input:x}) # for item in np.argmax(y,-1): # print(item) tf.saved_model.simple_save( sess, "./codename_savedmodel_tf/", inputs={"inputs": char_input, "inputs_length":length, 'keepprob':keepprob}, outputs={"logits": logits, "trans":trans} ) def save_role_model(): ''' @summary: 保存model为savedModel,部署到PAI平台上调用 ''' model_role = PREMPredict().model_role with model_role.graph.as_default(): model = model_role.getModel() sess = tf.Session(graph=model_role.graph) print(type(model.input)) sess.run(tf.global_variables_initializer()) h5_to_graph(sess, model_role.graph, model_role.model_role_file) model = model_role.getModel() tf.saved_model.simple_save(sess, "./role_savedmodel/", inputs={"input0":model.input[0], "input1":model.input[1], "input2":model.input[2]}, outputs={"outputs":model.output} ) def save_money_model(): model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5" graph = tf.Graph() with graph.as_default(): sess = tf.Session(graph=graph) with sess.as_default(): # model = model_money.getModel() # model.summary() # sess.run(tf.global_variables_initializer()) # h5_to_graph(sess, model_money.graph, model_money.model_money_file) model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) model.summary() print(model.weights) tf.saved_model.simple_save(sess, "./money_savedmodel2/", inputs = {"input0":model.input[0], "input1":model.input[1], "input2":model.input[2]}, outputs = {"outputs":model.output} ) def save_person_model(): model_person = EPCPredict().model_person with model_person.graph.as_default(): x = load("person_x.pk") _data = np.transpose(np.array(x),(1,0,2,3)) model = model_person.getModel() sess = tf.Session(graph=model_person.graph) with sess.as_default(): sess.run(tf.global_variables_initializer()) model_person.load_weights() #h5_to_graph(sess, model_person.graph, model_person.model_person_file) predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]}) #predict_y = model.predict([_data[0],_data[1]]) print(np.argmax(predict_y,-1)) tf.saved_model.simple_save(sess, "./person_savedmodel/", inputs={"input0":model.input[0], "input1":model.input[1]}, outputs = {"outputs":model.output}) def save_form_model(): model_form = FormPredictor() with model_form.graph.as_default(): model = model_form.getModel("item") sess = tf.Session(graph=model_form.graph) sess.run(tf.global_variables_initializer()) h5_to_graph(sess, model_form.graph, model_form.model_file_item) tf.saved_model.simple_save(sess, "./form_savedmodel/", inputs={"inputs":model.input}, outputs = {"outputs":model.output}) def save_codesplit_model(): filepath_code = "../projectCode/models/model_code.hdf5" graph = tf.Graph() with graph.as_default(): model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) sess = tf.Session() sess.run(tf.global_variables_initializer()) h5_to_graph(sess, graph, filepath_code) tf.saved_model.simple_save(sess, "./codesplit_savedmodel/", inputs={"input0":model_code.input[0], "input1":model_code.input[1], "input2":model_code.input[2]}, outputs={"outputs":model_code.output}) def save_timesplit_model(): filepath = '../time/model_label_time_classify.model.hdf5' with tf.Graph().as_default() as graph: time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score}) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) h5_to_graph(sess, graph, filepath) tf.saved_model.simple_save(sess, "./timesplit_model/", inputs={"input0":time_model.input[0], "input1":time_model.input[1]}, outputs={"outputs":time_model.output}) if __name__=="__main__": #save_role_model() # save_codename_model() # save_money_model() #save_person_model() #save_form_model() #save_codesplit_model() # save_timesplit_model() ''' # with tf.Session(graph=tf.Graph()) as sess: # from tensorflow.python.saved_model import tag_constants # meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel") # graph = tf.get_default_graph() # signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY # signature = meta_graph_def.signature_def # input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name) # input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name) # outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name) # x = load("person_x.pk") # _data = np.transpose(x,[1,0,2,3]) # y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]}) # print(np.argmax(y,-1)) ''' MAX_LEN = 1000 vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk" vocab = load(vocabpath) word2index = dict((w, i) for i, w in enumerate(np.array(vocab))) index_unk = word2index.get("") sentence = "招标人:广州市重点公共建设项目管理中心,联系人:李工,联系方式:020-22905689,招标代理:广东重工建设监理有限公司," \ "代理联系人:薛家伟,代理联系方式:13535014481,招标监督机构:广州市重点公共建设项目管理中心,监督电话:020-22905690," \ "备注:以上为招标公告简要描述,招标公告详细信息请查看“招标公告”附件," sentence = sentence*5 list_sentence = [sentence]*200 # print(list_sentence) x = [[word2index.get(word, index_unk) for word in sentence] for sentence in list_sentence] x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x] # print(x_len) x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post") requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len}, verify=True) # predict_y = json.loads(requests_result.text)['result'] print("cost_time:", json.loads(requests_result.text)['cost_time']) print(MAX_LEN, len(sentence), len(list_sentence)) requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len}, verify=True) # predict_y = json.loads(requests_result.text)['result'] print("cost_time:", json.loads(requests_result.text)['cost_time']) print(MAX_LEN, len(sentence), len(list_sentence))