'''
Created on 2019-04-22
@author: User
'''
from keras import models,layers,losses,optimizers
from keras.callbacks import ModelCheckpoint
import numpy as np
from BiddingKG.dl.common.Utils import *
import keras.backend as K
import tensorflow as tf
import math
import six
def getTextCNNModel(input_shape,vocab,embedding_weights,classes):
    def resize(x):
        # split the concatenated sequence into its left/right halves
        _shape = shape_list(x)
        x1 = tf.reshape(x,[_shape[0],2,_shape[1]//2,_shape[2]])
        x2 = tf.transpose(x1,[1,0,2,3])
        x_l,x_r = tf.split(x2,[1,1],axis=0)
        return [tf.squeeze(x_l,axis=0),tf.squeeze(x_r,axis=0)]
    def resize_input(x):
        # split the concatenated id sequence into left/center/right thirds
        _shape = shape_list(x)
        x1 = tf.reshape(x,[_shape[0],3,_shape[1]//3])
        x2 = tf.transpose(x1,[1,0,2])
        x_l,x_c,x_r = tf.split(x2,[1,1,1],axis=0)
        return [tf.squeeze(x_l,axis=0),tf.squeeze(x_c,axis=0),tf.squeeze(x_r,axis=0)]
    # assert len(input_shape)==3
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
    list_embedding = []
    # if len(list_input)==1:
    #     list_resizeinput = layers.Lambda(resize_input)(list_input[0])
    # else:
    concat_input = layers.Lambda(lambda x:tf.concat(x,axis=-1))(list_input)
    embedding_input = [concat_input]
    # embedding_input = list_input
    embedding = layers.Embedding(len(vocab),input_shape[2],weights=[embedding_weights] if embedding_weights is not None else None,trainable=True,name="char_embeding")
    for i in range(len(embedding_input)):
        list_embedding.append(embedding(embedding_input[i]))
    # remember which variables already exist, so the weights created inside
    # transformer_model can be identified and attached to the Lambda layer below
    set_variables = set()
    for v in tf.trainable_variables():
        set_variables.add(v.name)
    list_bert = []
    for i in range(len(list_embedding)):
        for v in tf.trainable_variables():
            set_variables.add(v.name)
        # bind the loop variable as a default argument so each Lambda keeps its own index
        bert_layer = layers.Lambda(lambda x,i=i:transformer_model(input_tensor=x,name="bert%d"%(i)),trainable=True,name="bert%d"%(i))
        list_bert.append(bert_layer(list_embedding[i]))
        # set bert_weights to trainable
        bert_weights = []
        for v in tf.trainable_variables():
            if v.name not in set_variables:
                bert_weights.append(v)
        bert_layer._trainable_weights = bert_weights
    _resize = layers.Lambda(lambda x:resize(x))(list_bert[0])
    list_w2v = _resize
    list_conv = []
    list_kernel = [2,5,8]
    for i in range(len(list_w2v)):
        list_temp = []
        for kernel in list_kernel:
            list_temp.append(layers.Conv1D(10,kernel,strides=1,padding="same",activation="relu")(list_w2v[i]))
        list_conv.append(layers.concatenate(list_temp,axis=-1))
    list_matrix = []
    for i in range(len(list_conv)):
        list_matrix.append(layers.Dense(12,activation="relu")(list_conv[i]))

    if len(list_matrix)>1:
        ave = layers.concatenate(list_matrix,axis=-1)
        dropout = layers.Dropout(0.3)(ave)
    else:
        dropout = layers.Dropout(0.3)(list_matrix[0])

    flatten = layers.Flatten()(dropout)

    matrix = layers.Dense(classes*10,activation="relu")(flatten)

    out = layers.Dense(classes,activation="softmax")(matrix)

    model = models.Model(list_input,out)

    model.compile(optimizer=optimizers.Adadelta(),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])

    model.summary()

    return model
class Attention(layers.Layer):

    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # W: (EMBED_SIZE, 1)
        # b: (MAX_TIMESTEPS, 1)
        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        self.u = self.add_weight(name="u_{:s}".format(self.name),
                                 shape=(input_shape[1], input_shape[1]),
                                 initializer="normal")
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # et: (BATCH_SIZE, MAX_TIMESTEPS)
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # at: (BATCH_SIZE, MAX_TIMESTEPS)
        at = K.dot(et, self.u)
        at = K.exp(at)
        if mask is not None:
            at *= K.cast(mask, K.floatx())
        # normalize to attention weights over the time axis
        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        atx = K.expand_dims(at, axis=-1)
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE)
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def compute_output_shape(self, input_shape):
        # output shape: (BATCH_SIZE, EMBED_SIZE)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        return super(Attention, self).get_config()
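
# Hedged usage sketch (not part of the original training code): the Attention
# layer pools a (batch, timesteps, embed) sequence down to (batch, embed).
# The layer sizes below are illustrative assumptions only.
#   inp = layers.Input(shape=(100,), dtype=tf.int32)
#   emb = layers.Embedding(5000, 60)(inp)
#   seq = layers.Bidirectional(layers.LSTM(60, return_sequences=True))(emb)  # (batch, 100, 120)
#   vec = Attention()(seq)                                                   # (batch, 120)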

def gelu(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the ReLU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf
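
# Hedged sanity check: gelu is smooth and close to relu for large |x|.
#   gelu(tf.constant([-1.0, 0.0, 1.0]))  # ~[-0.159, 0.0, 0.841]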
def shape_list(x):
    """Return list of dims, statically where possible."""
    x = tf.convert_to_tensor(x)

    # If unknown rank, return dynamic shape
    if x.get_shape().dims is None:
        return tf.shape(x)

    static = x.get_shape().as_list()
    shape = tf.shape(x)

    ret = []
    for i in range(len(static)):
        dim = static[i]
        if dim is None:
            dim = shape[i]
        ret.append(dim)
    return ret
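
# Hedged example: static dims come back as Python ints, unknown dims as
# dynamic scalar tensors.
#   t = tf.placeholder(tf.float32, shape=(None, 100, 60))
#   shape_list(t)  # [<tf.Tensor batch dim>, 100, 60]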
def get_timing_signal_1d(length,
                         channels,
                         min_timescale=1.0,
                         max_timescale=1.0e4,
                         start_index=0):
    """Gets a bunch of sinusoids of different frequencies.
    Each channel of the input Tensor is incremented by a sinusoid of a different
    frequency and phase.
    This allows attention to learn to use absolute and relative positions.
    Timing signals should be added to some precursors of both the query and the
    memory inputs to attention.
    The use of relative position is possible because sin(x+y) and cos(x+y) can be
    expressed in terms of y, sin(x) and cos(x).
    In particular, we use a geometric sequence of timescales starting with
    min_timescale and ending with max_timescale. The number of different
    timescales is equal to channels / 2. For each timescale, we
    generate the two sinusoidal signals sin(timestep/timescale) and
    cos(timestep/timescale). All of these sinusoids are concatenated in
    the channels dimension.
    Args:
        length: scalar, length of timing signal sequence.
        channels: scalar, size of timing embeddings to create. The number of
            different timescales is equal to channels / 2.
        min_timescale: a float
        max_timescale: a float
        start_index: index of first position
    Returns:
        a Tensor of timing signals [1, length, channels]
    """
    position = tf.to_float(tf.range(length) + start_index)
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (tf.to_float(num_timescales) - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
    signal = tf.reshape(signal, [1, length, channels])
    return signal
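
# Hedged example: one positional signal shared across the batch.
#   signal = get_timing_signal_1d(100, 60)  # shape (1, 100, 60)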
def add_timing_signal_1d(x,
                         min_timescale=1.0,
                         max_timescale=1.0e4,
                         start_index=0):
    """Concatenates a bunch of sinusoids of different frequencies to a Tensor.
    Each channel of the input Tensor is paired with a sinusoid of a different
    frequency and phase.
    This allows attention to learn to use absolute and relative positions.
    Timing signals should be added to some precursors of both the query and the
    memory inputs to attention.
    The use of relative position is possible because sin(x+y) and cos(x+y) can be
    expressed in terms of y, sin(x) and cos(x).
    In particular, we use a geometric sequence of timescales starting with
    min_timescale and ending with max_timescale. The number of different
    timescales is equal to channels / 2. For each timescale, we
    generate the two sinusoidal signals sin(timestep/timescale) and
    cos(timestep/timescale). All of these sinusoids are concatenated in
    the channels dimension.
    Note: unlike the usual additive formulation, this variant tiles the signal
    over the batch and concatenates it along the channel axis.
    Args:
        x: a Tensor with shape [batch, length, channels]
        min_timescale: a float
        max_timescale: a float
        start_index: index of first position
    Returns:
        a Tensor of shape [batch, length, 2 * channels].
    """
    batchs = shape_list(x)[0]
    length = shape_list(x)[1]
    channels = shape_list(x)[2]
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale,
                                  start_index)
    _signal = tf.tile(signal,[batchs,1,1])
    _concat = tf.concat([x,_signal],axis=2)
    # _concat = tf.quantized_concat(2,[x,signal],input_mins=[-10,-10],input_maxes=[10,10])[0]
    return _concat
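
# Hedged example: because the signal is concatenated rather than added,
# the channel count doubles.
#   x = tf.placeholder(tf.float32, shape=(None, 100, 60))
#   add_timing_signal_1d(x, max_timescale=1000)  # shape (?, 100, 120)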
def get_activation(activation_string):
    """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
    Args:
        activation_string: String name of the activation function.
    Returns:
        A Python function corresponding to the activation function. If
        `activation_string` is None, empty, or "linear", this will return None.
        If `activation_string` is not a string, it will return `activation_string`.
    Raises:
        ValueError: The `activation_string` does not correspond to a known
            activation.
    """

    # We assume that anything that's not a string is already an activation
    # function, so we just return it.
    if not isinstance(activation_string, six.string_types):
        return activation_string

    if not activation_string:
        return None

    act = activation_string.lower()
    if act == "linear":
        return None
    elif act == "relu":
        return tf.nn.relu
    elif act == "gelu":
        return gelu
    elif act == "tanh":
        return tf.tanh
    else:
        raise ValueError("Unsupported activation: %s" % act)
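
# Hedged examples of the mapping:
#   get_activation("gelu")    # -> the gelu function above
#   get_activation("linear")  # -> None
#   get_activation(tf.tanh)   # -> passed through unchanged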

def dropout(input_tensor, dropout_prob):
    """Perform dropout.
    Args:
        input_tensor: float Tensor.
        dropout_prob: Python float. The probability of dropping out a value (NOT of
            *keeping* a dimension as in `tf.nn.dropout`).
    Returns:
        A version of `input_tensor` with dropout applied.
    """
    if dropout_prob is None or dropout_prob == 0.0:
        return input_tensor

    output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
    return output
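
# Hedged example: the argument is the *drop* probability, converted to the
# keep probability that tf.nn.dropout expects.
#   h = dropout(h, 0.1)  # each unit kept with probability 0.9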
def layer_norm(input_tensor, name=None):
    """Run layer normalization on the last dimension of the tensor."""
    return tf.contrib.layers.layer_norm(
        inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)

def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
    """Runs layer normalization followed by dropout."""
    output_tensor = layer_norm(input_tensor, name)
    output_tensor = dropout(output_tensor, dropout_prob)
    return output_tensor

def create_initializer(initializer_range=0.02):
    """Creates a `truncated_normal_initializer` with the given range."""
    return tf.truncated_normal_initializer(stddev=initializer_range)
def reshape_to_matrix(input_tensor):
    """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
    ndims = input_tensor.shape.ndims
    if ndims < 2:
        raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                         (input_tensor.shape))
    if ndims == 2:
        return input_tensor

    width = input_tensor.shape[-1]
    output_tensor = tf.reshape(input_tensor, [-1, width])
    return output_tensor

def reshape_from_matrix(output_tensor, orig_shape_list):
    """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
    if len(orig_shape_list) == 2:
        return output_tensor

    output_shape = shape_list(output_tensor)

    orig_dims = orig_shape_list[0:-1]
    width = output_shape[-1]

    return tf.reshape(output_tensor, orig_dims + [width])
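
# Hedged example: the two helpers form a round trip around a 2D computation.
#   x = tf.placeholder(tf.float32, shape=(None, 100, 120))
#   x2d = reshape_to_matrix(x)                     # (?, 120)
#   x3d = reshape_from_matrix(x2d, shape_list(x))  # (?, 100, 120)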
def assert_rank(tensor, expected_rank, name=None):
    """Raises an exception if the tensor rank is not of the expected rank.
    Args:
        tensor: A tf.Tensor to check the rank of.
        expected_rank: Python integer or list of integers, expected rank.
        name: Optional name of the tensor for the error message.
    Raises:
        ValueError: If the expected shape doesn't match the actual shape.
    """
    if name is None:
        name = tensor.name

    expected_rank_dict = {}
    if isinstance(expected_rank, six.integer_types):
        expected_rank_dict[expected_rank] = True
    else:
        for x in expected_rank:
            expected_rank_dict[x] = True

    actual_rank = tensor.shape.ndims
    if actual_rank not in expected_rank_dict:
        scope_name = tf.get_variable_scope().name
        raise ValueError(
            "For the tensor `%s` in scope `%s`, the actual rank "
            "`%d` (shape = %s) is not equal to the expected rank `%s`" %
            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
def get_shape_list(tensor, expected_rank=None, name=None):
    """Returns a list of the shape of tensor, preferring static dimensions.
    Args:
        tensor: A tf.Tensor object to find the shape of.
        expected_rank: (optional) int. The expected rank of `tensor`. If this is
            specified and the `tensor` has a different rank, an exception will be
            thrown.
        name: Optional name of the tensor for the error message.
    Returns:
        A list of dimensions of the shape of tensor. All static dimensions will
        be returned as python integers, and dynamic dimensions will be returned
        as tf.Tensor scalars.
    """
    if name is None:
        name = tensor.name

    if expected_rank is not None:
        assert_rank(tensor, expected_rank, name)

    shape = tensor.shape.as_list()

    non_static_indexes = []
    for (index, dim) in enumerate(shape):
        if dim is None:
            non_static_indexes.append(index)

    if not non_static_indexes:
        return shape

    dyn_shape = tf.shape(tensor)
    for index in non_static_indexes:
        shape[index] = dyn_shape[index]
    return shape
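
# Hedged example:
#   t = tf.placeholder(tf.float32, shape=(None, 100, 60))
#   get_shape_list(t, expected_rank=3)  # [<dynamic batch dim>, 100, 60]
#   get_shape_list(t, expected_rank=2)  # raises ValueError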
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=10,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.
    This is an implementation of multi-headed attention based on "Attention
    is all you Need". If `from_tensor` and `to_tensor` are the same, then
    this is self-attention. Each timestep in `from_tensor` attends to the
    corresponding sequence in `to_tensor`, and returns a fixed-width vector.
    This function first projects `from_tensor` into a "query" tensor and
    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
    of tensors of length `num_attention_heads`, where each tensor is of shape
    [batch_size, seq_length, size_per_head].
    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities, then concatenated back to a single
    tensor and returned.
    In practice, the multi-headed attention is done with transposes and
    reshapes rather than actual separate tensors.
    Args:
        from_tensor: float Tensor of shape [batch_size, from_seq_length,
            from_width].
        to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
        attention_mask: (optional) int32 Tensor of shape [batch_size,
            from_seq_length, to_seq_length]. The values should be 1 or 0. The
            attention scores will effectively be set to -infinity for any positions
            in the mask that are 0, and will be unchanged for positions that are 1.
        num_attention_heads: int. Number of attention heads.
        size_per_head: int. Size of each attention head.
        query_act: (optional) Activation function for the query transform.
        key_act: (optional) Activation function for the key transform.
        value_act: (optional) Activation function for the value transform.
        attention_probs_dropout_prob: (optional) float. Dropout probability of the
            attention probabilities.
        initializer_range: float. Range of the weight initializer.
        do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
            * from_seq_length, num_attention_heads * size_per_head]. If False, the
            output will be of shape [batch_size, from_seq_length,
            num_attention_heads * size_per_head].
        batch_size: (Optional) int. If the input is 2D, this might be the batch size
            of the 3D version of the `from_tensor` and `to_tensor`.
        from_seq_length: (Optional) If the input is 2D, this might be the seq length
            of the 3D version of the `from_tensor`.
        to_seq_length: (Optional) If the input is 2D, this might be the seq length
            of the 3D version of the `to_tensor`.
    Returns:
        float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
        true, this will be of shape [batch_size * from_seq_length,
        num_attention_heads * size_per_head]).
    Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
    """

    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])

        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #     B = batch size (number of sequences)
    #     F = `from_tensor` sequence length
    #     T = `to_tensor` sequence length
    #     N = `num_attention_heads`
    #     H = `size_per_head`

    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    '''
    query_matrix = tf.get_variable(name="query",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    query_layer = tf.matmul(from_tensor_2d,query_matrix)
    if query_act is not None:
        query_layer = query_act(query_layer)

    key_matrix = tf.get_variable(name="key",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    key_layer = tf.matmul(from_tensor_2d,key_matrix)
    if key_act is not None:
        key_layer = key_act(key_layer)

    value_matrix = tf.get_variable(name="value",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    value_layer = tf.matmul(from_tensor_2d,value_matrix)
    if value_act is not None:
        value_layer = value_act(value_layer)
    '''
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))

    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))

    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))

    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)

    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                     to_seq_length, size_per_head)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))
    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    #attention_scores = tf.reshape(attention_scores,[batch_size,num_attention_heads,from_seq_length,to_seq_length])
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # `value_layer` = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])

    # `value_layer` = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_seq_length, num_attention_heads * size_per_head])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])

    return context_layer
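
# Hedged usage sketch: self-attention with the head layout used by
# transformer_model below (6 heads of size 20 -> width 120).
#   x = tf.placeholder(tf.float32, shape=(8, 100, 120))
#   ctx = attention_layer(from_tensor=x, to_tensor=x,
#                         num_attention_heads=6, size_per_head=20)
#   # ctx: (8, 100, 120) = [batch, from_seq_length, num_heads * size_per_head]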

def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=120,
                      num_hidden_layers=1,
                      num_attention_heads=6,
                      intermediate_size=256,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      name=None):
    """Multi-headed, multi-layer Transformer encoder ("Attention is All You Need").
    Note: add_timing_signal_1d concatenates the positional signal, doubling the
    channel count, so `input_tensor` must be hidden_size // 2 channels wide.
    """
    input_tensor = add_timing_signal_1d(input_tensor,max_timescale=1000)

    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                         (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    if name is not None:
        _name = str(name)+"encoder"
    else:
        _name = "encoder"
    with tf.variable_scope(_name,reuse=tf.AUTO_REUSE):
        for layer_idx in range(num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer_idx,reuse=tf.AUTO_REUSE):
                layer_input = prev_output

                with tf.variable_scope("attention",reuse=tf.AUTO_REUSE):
                    attention_heads = []
                    with tf.variable_scope("self",reuse=tf.AUTO_REUSE):
                        attention_head = attention_layer(
                            from_tensor=layer_input,
                            to_tensor=layer_input,
                            attention_mask=attention_mask,
                            num_attention_heads=num_attention_heads,
                            size_per_head=attention_head_size,
                            attention_probs_dropout_prob=attention_probs_dropout_prob,
                            initializer_range=initializer_range,
                            do_return_2d_tensor=True,
                            batch_size=batch_size,
                            from_seq_length=seq_length,
                            to_seq_length=seq_length)
                        attention_heads.append(attention_head)

                    attention_output = None
                    if len(attention_heads) == 1:
                        attention_output = attention_heads[0]
                    else:
                        # In the case where we have other sequences, we just concatenate
                        # them to the self-attention head before the projection.
                        attention_output = tf.concat(attention_heads, axis=-1)

                    # Run a linear projection of `hidden_size` then add a residual
                    # with `layer_input`.
                    with tf.variable_scope("output",reuse=tf.AUTO_REUSE):
                        attention_output = tf.layers.dense(
                            attention_output,
                            hidden_size,
                            kernel_initializer=create_initializer(initializer_range))
                        attention_output = dropout(attention_output, hidden_dropout_prob)
                        attention_output = layer_norm(attention_output + layer_input)

                # The activation is only applied to the "intermediate" hidden layer.
                with tf.variable_scope("intermediate",reuse=tf.AUTO_REUSE):
                    intermediate_output = tf.layers.dense(
                        attention_output,
                        intermediate_size,
                        activation=intermediate_act_fn,
                        kernel_initializer=create_initializer(initializer_range))

                # Down-project back to `hidden_size` then add the residual.
                with tf.variable_scope("output",reuse=tf.AUTO_REUSE):
                    layer_output = tf.layers.dense(
                        intermediate_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range))
                    layer_output = dropout(layer_output, hidden_dropout_prob)
                    layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
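
# Hedged usage sketch: with the default hidden_size=120, the embedding fed in
# must be 60 channels wide, because the concatenated timing signal doubles it.
#   emb = tf.placeholder(tf.float32, shape=(8, 100, 60))
#   enc = transformer_model(emb, name="demo")  # (8, 100, 120)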
# def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
#
#     assert len(input_shape)==3
#     list_input = []
#     for i in range(input_shape[0]):
#         list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32))
#     print(list_input)
#     list_embedding = []
#
#     embedding = layers.Embedding(len(vocab),input_shape[2],weights=[embedding_weights] if embedding_weights is not None else None,trainable=True,name="char_embeding")
#     for i in range(len(list_input)):
#         list_embedding.append(embedding(list_input[i]))
#
#     bert_layer = layers.Lambda(transformer_model,trainable=True,name="bert")
#     set_variables = set()
#     for v in tf.trainable_variables():
#         set_variables.add(v.name)
#     list_bert = []
#     for i in range(len(list_embedding)):
#         list_bert.append(bert_layer(list_embedding[i]))
#     # set bert_weights to trainable
#     bert_weights = []
#     for v in tf.trainable_variables():
#         if v.name not in set_variables:
#             bert_weights.append(v)
#     bert_layer._trainable_weights = bert_weights
#
#     list_w2v = list_bert
#     list_lstm = []
#
#     if use_am:
#         for i in range(len(list_w2v)):
#             list_lstm.append(Attention()(layers.Bidirectional(layers.LSTM(120,activation="relu",return_sequences=True))(list_w2v[i])))
#     else:
#         for i in range(len(list_w2v)):
#             list_lstm.append(layers.Bidirectional(layers.LSTM(24,activation="relu"))(list_w2v[i]))
#
#     list_matrix = []
#     for i in range(len(list_lstm)):
#         list_matrix.append(layers.Dense(classes*2,activation="relu")(list_lstm[i]))
#
#     if len(list_matrix)>1:
#         ave = layers.merge(list_matrix,mode="concat")
#         dropout = layers.Dropout(0.4)(ave)
#     else:
#         dropout = layers.Dropout(0.4)(list_matrix[0])
#
#     matrix = layers.Dense(classes*10,activation="sigmoid")(dropout)
#
#     out = layers.Dense(classes,activation="softmax")(matrix)
#
#     model = models.Model(list_input,out)
#
#     model.compile(optimizer=optimizers.Adam(lr=0.00002),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
#
#     model.summary()
#
#     return model
def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
    def resize(x):
        # split the concatenated sequence into left/center/right thirds
        _shape = shape_list(x)
        x1 = tf.reshape(x,[_shape[0],3,_shape[1]//3,_shape[2]])
        x2 = tf.transpose(x1,[1,0,2,3])
        x_l,x_c,x_r = tf.split(x2,[1,1,1],axis=0)
        return [tf.squeeze(x_l,axis=0),tf.squeeze(x_c,axis=0),tf.squeeze(x_r,axis=0)]
    def resize_input(x):
        _shape = shape_list(x)
        x1 = tf.reshape(x,[_shape[0],3,_shape[1]//3])
        x2 = tf.transpose(x1,[1,0,2])
        x_l,x_c,x_r = tf.split(x2,[1,1,1],axis=0)
        return [tf.squeeze(x_l,axis=0),tf.squeeze(x_c,axis=0),tf.squeeze(x_r,axis=0)]
    # assert len(input_shape)==3
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
    list_embedding = []
    # if len(list_input)==1:
    #     list_resizeinput = layers.Lambda(resize_input)(list_input[0])
    # else:
    #     concat_input = layers.Lambda(lambda x:tf.concat(x,axis=-1))(list_input)
    #     embedding_input = [concat_input]
    embedding_input = list_input
    embedding = layers.Embedding(len(vocab),input_shape[2],weights=[embedding_weights] if embedding_weights is not None else None,trainable=True,name="char_embeding")
    for i in range(len(embedding_input)):
        list_embedding.append(embedding(embedding_input[i]))
    set_variables = set()
    for v in tf.trainable_variables():
        set_variables.add(v.name)
    # list_bert = []
    # for i in range(len(list_embedding)):
    #     for v in tf.trainable_variables():
    #         set_variables.add(v.name)
    #     bert_layer = layers.Lambda(lambda x:transformer_model(input_tensor=x,name="bert%d"%(i)),trainable=True,name="bert%d"%(i))
    #     list_bert.append(bert_layer(list_embedding[i]))
    #     # set bert_weights to trainable
    #     bert_weights = []
    #     for v in tf.trainable_variables():
    #         if v.name not in set_variables:
    #             bert_weights.append(v)
    #     bert_layer._trainable_weights = bert_weights
    # bert_layer = layers.Lambda(lambda x:transformer_model(input_tensor=x,name="bert%d"%(i)),trainable=True,name="bert%d"%(0))
    # list_bert = []
    # for i in range(len(list_embedding)):
    #     list_bert.append(bert_layer(list_embedding[i]))
    # # set bert_weights to trainable
    # bert_weights = []
    # for v in tf.trainable_variables():
    #     if v.name not in set_variables:
    #         bert_weights.append(v)
    # bert_layer._trainable_weights = bert_weights
    # print("##",list_bert)
    # context_embedding = []
    # list_kernel = [5,8]
    # for i in range(len(list_bert)):
    #     list_temp = []
    #     for kernel in list_kernel:
    #         list_temp.append(layers.Conv1D(3,kernel,strides=1,padding="same",activation="relu")(list_bert[i]))
    #     context_embedding.append(layers.Dense(12,activation="relu")(layers.Flatten()(layers.merge(list_temp,mode="concat"))))
    # context_embedding = [layers.GlobalMaxPool1D()(item) for item in list_bert]
    # _resize = layers.Lambda(lambda x:resize(x))(list_bert[0])
    list_w2v = list_embedding
    list_lstm = []
    if use_am:
        for i in range(len(list_w2v)):
            list_lstm.append(Attention()(layers.Bidirectional(layers.LSTM(120,activation="relu",return_sequences=True))(list_w2v[i])))
    else:
        for i in range(len(list_w2v)):
            list_lstm.append(layers.Bidirectional(layers.LSTM(24,activation="relu"))(list_w2v[i]))
    # list_avg = []
    # for i in range(len(list_lstm)):
    #     list_avg.append(layers.GlobalAveragePooling1D()(list_lstm[i]))
    # list_matrix = []
    # for i in range(len(list_lstm)):
    #     list_matrix.append(layers.Dense(12,activation="relu")(list_lstm[i]))
    # list_matrix.extend(context_embedding)
    if len(list_lstm)>1:
        ave = layers.concatenate(list_lstm,axis=-1)
        dropout = layers.Dropout(0.2)(ave)
    else:
        dropout = layers.Dropout(0.2)(list_lstm[0])
    matrix = layers.Dense(48,activation="tanh")(dropout)
    out = layers.Dense(classes,activation="softmax")(matrix)
    # out = layers.Dense(classes,activation="sigmoid")(dropout)
    # out = layers.Lambda(lambda x:layers.activations.softmax(x))(out)
    model = models.Model(list_input,out)
    model.compile(optimizer=optimizers.Adam(lr=0.01),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
    model.summary()
    return model
def getBiLSTMModel_entity(input_shape,vocab,embedding_weights,classes):
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
    list_embedding = []
    embedding_input = list_input
    embedding = layers.Embedding(len(vocab),input_shape[2],weights=[embedding_weights] if embedding_weights is not None else None,trainable=True,name="char_embeding")
    for i in range(len(embedding_input)):
        list_embedding.append(embedding(embedding_input[i]))
    list_w2v = list_embedding
    list_lstm = []
    for i in range(len(list_w2v)):
        list_lstm.append(layers.Bidirectional(layers.LSTM(24,activation="relu"))(list_w2v[i]))
    # list_avg = []
    # for i in range(len(list_lstm)):
    #     list_avg.append(layers.GlobalAveragePooling1D()(list_lstm[i]))
    list_matrix = []
    for i in range(len(list_lstm)):
        list_matrix.append(layers.Dense(12,activation="relu")(list_lstm[i]))
    if len(list_matrix)>1:
        ave = layers.concatenate(list_matrix,axis=-1)
        dropout = layers.Dropout(0.2)(ave)
    else:
        dropout = layers.Dropout(0.2)(list_matrix[0])
    matrix = layers.Dense(classes*10,activation="relu")(dropout)
    out = layers.Dense(classes,activation="softmax")(matrix)
    # out = layers.Dense(classes,activation="sigmoid")(dropout)
    # out = layers.Lambda(lambda x:layers.activations.softmax(x))(out)
    model = models.Model(list_input,out)
    model.compile(optimizer=optimizers.Adam(lr=0.00001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
    model.summary()
    return model
- if __name__=="__main__":
- getTextCNNModel((3,100,60),[1,2,3,4,5],None,2)
- model = getBiLSTMModel((3,100,256),fool_char_to_id.keys(),None,3,use_am=False)
- #getBiLSTMModel_entity((20,20,3,100,60),[1,2,3,4,5],None,6)