'''
Created on 2019-04-15

@author: User
'''
from keras import layers, models, losses, optimizers
from BiddingKG.dl.common.Utils import *
from keras_contrib.layers import CRF
import numpy as np
import tensorflow as tf
import six
import math


def getTextCNNModel(input_shape=(40, 60), classes=2):
    # Three parallel inputs: left context, center span and right context.
    input_left = layers.Input(shape=input_shape)
    input_center = layers.Input(shape=input_shape)
    input_right = layers.Input(shape=input_shape)

    # TextCNN-style convolutions with several kernel sizes.
    list_kernel = [5, 10, 20]
    list_conv_left = []
    list_conv_center = []
    list_conv_right = []
    for kernel in list_kernel:
        list_conv_left.append(layers.Conv1D(filters=10, kernel_size=kernel, padding="same", activation="relu")(input_left))
        list_conv_center.append(layers.Conv1D(filters=10, kernel_size=kernel, padding="same", activation="relu")(input_center))
        list_conv_right.append(layers.Conv1D(filters=10, kernel_size=kernel, padding="same", activation="relu")(input_right))

    # Concatenate the multi-scale features of each branch (Keras 2 functional helpers).
    concat_left = layers.concatenate(list_conv_left)
    concat_center = layers.concatenate(list_conv_center)
    concat_right = layers.concatenate(list_conv_right)

    matrix_left = layers.Dense(12, activation="relu")(concat_left)
    matrix_center = layers.Dense(12, activation="relu")(concat_center)
    matrix_right = layers.Dense(12, activation="relu")(concat_right)

    # Average the three context representations.
    concat_matrix = layers.average([matrix_left, matrix_center, matrix_right])

    flatten = layers.Flatten()(concat_matrix)

    matrix = layers.Dense(12, activation="relu")(flatten)

    out = layers.Dense(classes, activation="softmax")(matrix)

    model = models.Model([input_left, input_center, input_right], out)

    # precision, recall and f1_score come from BiddingKG.dl.common.Utils (wildcard import above).
    model.compile(optimizer=optimizers.SGD(), loss=losses.categorical_crossentropy, metrics=[precision, recall, f1_score])

    model.summary()
    return model
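

# A minimal usage sketch (not part of the original module): build the three-branch
# TextCNN and push one dummy batch through it. The batch size of 2 is an arbitrary
# assumption; the shapes follow the default input_shape=(40, 60).
def _demo_textcnn():
    model = getTextCNNModel(input_shape=(40, 60), classes=2)
    dummy = np.zeros((2, 40, 60), dtype="float32")
    probs = model.predict([dummy, dummy, dummy])  # -> (2, classes) softmax scores
    return probs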


def gelu(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf
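

# A small sanity-check sketch (not from the original file): under a TF 1.x session,
# compare the tanh approximation above with the exact GELU x * Phi(x) written via tf.erf.
# The 1e-3 tolerance is an assumption about how close the approximation should be.
def _check_gelu_approximation():
    x = tf.constant([-2.0, -0.5, 0.0, 0.5, 2.0])
    exact = x * 0.5 * (1.0 + tf.erf(x / np.sqrt(2.0)))
    with tf.Session() as sess:
        approx_v, exact_v = sess.run([gelu(x), exact])
    return np.max(np.abs(approx_v - exact_v)) < 1e-3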


def shape_list(x):
    """Return list of dims, statically where possible."""
    x = tf.convert_to_tensor(x)

    # If unknown rank, return dynamic shape
    if x.get_shape().dims is None:
        return tf.shape(x)

    static = x.get_shape().as_list()
    shape = tf.shape(x)

    ret = []
    for i in range(len(static)):
        dim = static[i]
        if dim is None:
            dim = shape[i]
        ret.append(dim)
    return ret


def get_timing_signal_1d(length,
                         channels,
                         min_timescale=1.0,
                         max_timescale=1.0e4,
                         start_index=0):
    """Gets a bunch of sinusoids of different frequencies.
    Each channel of the input Tensor is incremented by a sinusoid of a different
    frequency and phase.
    This allows attention to learn to use absolute and relative positions.
    Timing signals should be added to some precursors of both the query and the
    memory inputs to attention.
    The use of relative position is possible because sin(x+y) and cos(x+y) can be
    expressed in terms of y, sin(x) and cos(x).
    In particular, we use a geometric sequence of timescales starting with
    min_timescale and ending with max_timescale. The number of different
    timescales is equal to channels / 2. For each timescale, we
    generate the two sinusoidal signals sin(timestep/timescale) and
    cos(timestep/timescale). All of these sinusoids are concatenated in
    the channels dimension.
    Args:
        length: scalar, length of timing signal sequence.
        channels: scalar, size of timing embeddings to create. The number of
            different timescales is equal to channels / 2.
        min_timescale: a float
        max_timescale: a float
        start_index: index of first position
    Returns:
        a Tensor of timing signals [1, length, channels]
    """
    position = tf.to_float(tf.range(length) + start_index)
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (tf.to_float(num_timescales) - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
    signal = tf.reshape(signal, [1, length, channels])
    return signal
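

# A hedged usage sketch (not in the original file): a 256-channel timing signal for a
# 50-step sequence, evaluated under a TF 1.x session. The concrete sizes are assumptions.
def _demo_timing_signal():
    signal = get_timing_signal_1d(length=50, channels=256)
    with tf.Session() as sess:
        value = sess.run(signal)
    return value.shape  # (1, 50, 256)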


def add_timing_signal_1d(x,
                         min_timescale=1.0,
                         max_timescale=1.0e4,
                         start_index=0):
    """Adds a bunch of sinusoids of different frequencies to a Tensor.
    Each channel of the input Tensor is incremented by a sinusoid of a different
    frequency and phase.
    This allows attention to learn to use absolute and relative positions.
    Timing signals should be added to some precursors of both the query and the
    memory inputs to attention.
    The use of relative position is possible because sin(x+y) and cos(x+y) can be
    expressed in terms of y, sin(x) and cos(x).
    In particular, we use a geometric sequence of timescales starting with
    min_timescale and ending with max_timescale. The number of different
    timescales is equal to channels / 2. For each timescale, we
    generate the two sinusoidal signals sin(timestep/timescale) and
    cos(timestep/timescale). All of these sinusoids are concatenated in
    the channels dimension.
    Args:
        x: a Tensor with shape [batch, length, channels]
        min_timescale: a float
        max_timescale: a float
        start_index: index of first position
    Returns:
        a Tensor the same shape as x.
    """
    length = shape_list(x)[1]
    channels = shape_list(x)[2]
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale,
                                  start_index)
    return x + signal


def get_activation(activation_string):
    """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
    Args:
        activation_string: String name of the activation function.
    Returns:
        A Python function corresponding to the activation function. If
        `activation_string` is None, empty, or "linear", this will return None.
        If `activation_string` is not a string, it will return `activation_string`.
    Raises:
        ValueError: The `activation_string` does not correspond to a known
            activation.
    """

    # We assume that anything that's not a string is already an activation
    # function, so we just return it.
    if not isinstance(activation_string, six.string_types):
        return activation_string

    if not activation_string:
        return None

    act = activation_string.lower()
    if act == "linear":
        return None
    elif act == "relu":
        return tf.nn.relu
    elif act == "gelu":
        return gelu
    elif act == "tanh":
        return tf.tanh
    else:
        raise ValueError("Unsupported activation: %s" % act)
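

# A brief usage sketch (not in the original file): the lookup is case-insensitive,
# "linear" resolves to None, and non-strings pass through unchanged.
def _demo_get_activation():
    assert get_activation("GELU") is gelu
    assert get_activation("linear") is None
    assert get_activation(tf.nn.relu) is tf.nn.relu
    return True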


def dropout(input_tensor, dropout_prob):
    """Perform dropout.
    Args:
        input_tensor: float Tensor.
        dropout_prob: Python float. The probability of dropping out a value (NOT of
            *keeping* a dimension as in `tf.nn.dropout`).
    Returns:
        A version of `input_tensor` with dropout applied.
    """
    if dropout_prob is None or dropout_prob == 0.0:
        return input_tensor

    output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
    return output


def layer_norm(input_tensor, name=None):
    """Run layer normalization on the last dimension of the tensor."""
    return tf.contrib.layers.layer_norm(
        inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)


def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
    """Runs layer normalization followed by dropout."""
    output_tensor = layer_norm(input_tensor, name)
    output_tensor = dropout(output_tensor, dropout_prob)
    return output_tensor


def create_initializer(initializer_range=0.02):
    """Creates a `truncated_normal_initializer` with the given range."""
    return tf.truncated_normal_initializer(stddev=initializer_range)


def reshape_to_matrix(input_tensor):
    """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
    ndims = input_tensor.shape.ndims
    if ndims < 2:
        raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                         (input_tensor.shape))
    if ndims == 2:
        return input_tensor

    width = input_tensor.shape[-1]
    output_tensor = tf.reshape(input_tensor, [-1, width])
    return output_tensor


def reshape_from_matrix(output_tensor, orig_shape_list):
    """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
    if len(orig_shape_list) == 2:
        return output_tensor

    output_shape = shape_list(output_tensor)

    orig_dims = orig_shape_list[0:-1]
    width = output_shape[-1]

    return tf.reshape(output_tensor, orig_dims + [width])


def assert_rank(tensor, expected_rank, name=None):
    """Raises an exception if the tensor rank is not of the expected rank.
    Args:
        tensor: A tf.Tensor to check the rank of.
        expected_rank: Python integer or list of integers, expected rank.
        name: Optional name of the tensor for the error message.
    Raises:
        ValueError: If the expected shape doesn't match the actual shape.
    """
    if name is None:
        name = tensor.name

    expected_rank_dict = {}
    if isinstance(expected_rank, six.integer_types):
        expected_rank_dict[expected_rank] = True
    else:
        for x in expected_rank:
            expected_rank_dict[x] = True

    actual_rank = tensor.shape.ndims
    if actual_rank not in expected_rank_dict:
        scope_name = tf.get_variable_scope().name
        raise ValueError(
            "For the tensor `%s` in scope `%s`, the actual rank "
            "`%d` (shape = %s) is not equal to the expected rank `%s`" %
            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))


def get_shape_list(tensor, expected_rank=None, name=None):
    """Returns a list of the shape of tensor, preferring static dimensions.
    Args:
        tensor: A tf.Tensor object to find the shape of.
        expected_rank: (optional) int. The expected rank of `tensor`. If this is
            specified and the `tensor` has a different rank, an exception will be
            thrown.
        name: Optional name of the tensor for the error message.
    Returns:
        A list of dimensions of the shape of tensor. All static dimensions will
        be returned as python integers, and dynamic dimensions will be returned
        as tf.Tensor scalars.
    """
    if name is None:
        name = tensor.name

    if expected_rank is not None:
        assert_rank(tensor, expected_rank, name)

    shape = tensor.shape.as_list()

    non_static_indexes = []
    for (index, dim) in enumerate(shape):
        if dim is None:
            non_static_indexes.append(index)

    if not non_static_indexes:
        return shape

    dyn_shape = tf.shape(tensor)
    for index in non_static_indexes:
        shape[index] = dyn_shape[index]
    return shape
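

# A hedged sketch (not in the original file): with a placeholder whose batch dimension
# is unknown, get_shape_list mixes a dynamic scalar Tensor with static Python ints.
# The placeholder sizes are arbitrary assumptions.
def _demo_get_shape_list():
    x = tf.placeholder(tf.float32, shape=(None, 40, 60))
    shape = get_shape_list(x, expected_rank=3)
    # shape[0] is a tf.Tensor scalar; shape[1] and shape[2] are the Python ints 40 and 60.
    return shape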


def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=10,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.
    This is an implementation of multi-headed attention based on "Attention
    is all you Need". If `from_tensor` and `to_tensor` are the same, then
    this is self-attention. Each timestep in `from_tensor` attends to the
    corresponding sequence in `to_tensor`, and returns a fixed-width vector.
    This function first projects `from_tensor` into a "query" tensor and
    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
    of tensors of length `num_attention_heads`, where each tensor is of shape
    [batch_size, seq_length, size_per_head].
    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities, then concatenated back to a single
    tensor and returned.
    In practice, the multi-headed attention is done with transposes and
    reshapes rather than actual separate tensors.
    Args:
        from_tensor: float Tensor of shape [batch_size, from_seq_length,
            from_width].
        to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
        attention_mask: (optional) int32 Tensor of shape [batch_size,
            from_seq_length, to_seq_length]. The values should be 1 or 0. The
            attention scores will effectively be set to -infinity for any positions
            in the mask that are 0, and will be unchanged for positions that are 1.
        num_attention_heads: int. Number of attention heads.
        size_per_head: int. Size of each attention head.
        query_act: (optional) Activation function for the query transform.
        key_act: (optional) Activation function for the key transform.
        value_act: (optional) Activation function for the value transform.
        attention_probs_dropout_prob: (optional) float. Dropout probability of the
            attention probabilities.
        initializer_range: float. Range of the weight initializer.
        do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
            * from_seq_length, num_attention_heads * size_per_head]. If False, the
            output will be of shape [batch_size, from_seq_length, num_attention_heads
            * size_per_head].
        batch_size: (Optional) int. If the input is 2D, this might be the batch size
            of the 3D version of the `from_tensor` and `to_tensor`.
        from_seq_length: (Optional) If the input is 2D, this might be the seq length
            of the 3D version of the `from_tensor`.
        to_seq_length: (Optional) If the input is 2D, this might be the seq length
            of the 3D version of the `to_tensor`.
    Returns:
        float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
        true, this will be of shape [batch_size * from_seq_length,
        num_attention_heads * size_per_head]).
    Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
    """

    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])

        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    # Manual projection variant (disabled):
    '''
    query_matrix = tf.get_variable(name="query",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    query_layer = tf.matmul(from_tensor_2d,query_matrix)
    if query_act is not None:
        query_layer = query_act(query_layer)

    key_matrix = tf.get_variable(name="key",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    key_layer = tf.matmul(from_tensor_2d,key_matrix)
    if key_act is not None:
        key_layer = key_act(key_layer)

    value_matrix = tf.get_variable(name="value",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    value_layer = tf.matmul(from_tensor_2d,value_matrix)
    if value_act is not None:
        value_layer = value_act(value_layer)
    '''
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))

    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))

    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))

    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)

    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                     to_seq_length, size_per_head)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))
    print(attention_scores)
    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    # attention_scores = tf.reshape(attention_scores,[batch_size,num_attention_heads,from_seq_length,to_seq_length])
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # `value_layer` = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])

    # `value_layer` = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_seq_length, num_attention_heads * size_per_head])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])

    return context_layer
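

# A hedged self-attention sketch (not part of the original file): 2 heads of size 10
# over an 8-step sequence, returning a [batch, 8, 20] context tensor. All sizes here
# are arbitrary assumptions.
def _demo_attention_layer():
    seq = tf.placeholder(tf.float32, shape=(None, 8, 20))
    with tf.variable_scope("demo_attention"):
        context = attention_layer(
            from_tensor=seq,
            to_tensor=seq,
            num_attention_heads=2,
            size_per_head=10)
    return context  # shape (?, 8, 20)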


def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=256,
                      num_hidden_layers=2,
                      num_attention_heads=2,
                      intermediate_size=128,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0,
                      attention_probs_dropout_prob=0,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Stacked Transformer encoder over `input_tensor`, with sinusoidal position
    signals added to the input (see `add_timing_signal_1d`)."""
    input_tensor = add_timing_signal_1d(input_tensor)

    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                         (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        for layer_idx in range(num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer_idx, reuse=tf.AUTO_REUSE):
                layer_input = prev_output

                with tf.variable_scope("attention", reuse=tf.AUTO_REUSE):
                    attention_heads = []
                    with tf.variable_scope("self", reuse=tf.AUTO_REUSE):
                        attention_head = attention_layer(
                            from_tensor=layer_input,
                            to_tensor=layer_input,
                            attention_mask=attention_mask,
                            num_attention_heads=num_attention_heads,
                            size_per_head=attention_head_size,
                            attention_probs_dropout_prob=attention_probs_dropout_prob,
                            initializer_range=initializer_range,
                            do_return_2d_tensor=True,
                            batch_size=batch_size,
                            from_seq_length=seq_length,
                            to_seq_length=seq_length)
                        attention_heads.append(attention_head)

                    attention_output = None
                    if len(attention_heads) == 1:
                        attention_output = attention_heads[0]
                    else:
                        # In the case where we have other sequences, we just concatenate
                        # them to the self-attention head before the projection.
                        attention_output = tf.concat(attention_heads, axis=-1)

                    # Run a linear projection of `hidden_size` then add a residual
                    # with `layer_input`.
                    with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
                        attention_output = tf.layers.dense(
                            attention_output,
                            hidden_size,
                            kernel_initializer=create_initializer(initializer_range))
                        attention_output = dropout(attention_output, hidden_dropout_prob)
                        attention_output = layer_norm(attention_output + layer_input)

                # The activation is only applied to the "intermediate" hidden layer.
                with tf.variable_scope("intermediate", reuse=tf.AUTO_REUSE):
                    intermediate_output = tf.layers.dense(
                        attention_output,
                        intermediate_size,
                        activation=intermediate_act_fn,
                        kernel_initializer=create_initializer(initializer_range))

                # Down-project back to `hidden_size` then add the residual.
                with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
                    layer_output = tf.layers.dense(
                        intermediate_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range))
                    layer_output = dropout(layer_output, hidden_dropout_prob)
                    layer_output = layer_norm(layer_output + attention_output)
                    prev_output = layer_output
                    all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
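

# A hedged usage sketch (not in the original file): encode a [batch, 20, 256] sequence
# with the 2-layer encoder above. hidden_size must match the input width (256 here);
# all other sizes are assumptions.
def _demo_transformer_model():
    x = tf.placeholder(tf.float32, shape=(None, 20, 256))
    encoded = transformer_model(x, hidden_size=256, num_hidden_layers=2,
                                num_attention_heads=2)
    return encoded  # shape (?, 20, 256)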


def getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, chunk_tags, weights):
    '''
    model = models.Sequential()
    model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
    '''
    input = layers.Input(shape=(None,))
    if weights is not None:
        embedding = layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True, weights=[weights], trainable=True)(input)
    else:
        embedding = layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)(input)

    # Optional transformer feature extractor (disabled):
    # set_v_before = set([v.name for v in tf.trainable_variables()])
    # transformer_layer = layers.Lambda(lambda x: transformer_model(x, hidden_size=get_shape_list(embedding)[-1], do_return_all_layers=False), trainable=True)
    # globalLocalFeature = transformer_layer(embedding)
    # transformer_weights = []
    # for v in tf.trainable_variables():
    #     if v.name not in set_v_before:
    #         transformer_weights.append(v)
    # transformer_layer._trainable_weights = transformer_weights
    globalLocalFeature = embedding
    bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True))(globalLocalFeature)
    bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
    crf = CRF(len(chunk_tags), sparse_target=True)
    crf_out = crf(bilstm_dense)
    model = models.Model(inputs=[input], outputs=[crf_out])
    model.summary()
    model.compile(optimizer=optimizers.Adadelta(2e-2, clipvalue=5), loss=crf.loss_function, metrics=[crf.accuracy])
    # Debug: inspect the mask propagated into the CRF (uses a private Keras API).
    import keras
    print(keras.engine.topology._collect_previous_mask(globalLocalFeature))
    return model
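

# A hedged training sketch (not in the original file): the toy vocabulary, tag set,
# padded length and random data below are all assumptions, meant only to show the
# expected input/label shapes (labels need a trailing axis for the sparse-target CRF).
def _demo_bilstm_crf():
    vocab = ["<pad>"] + [chr(i) for i in range(97, 123)]
    chunk_tags = ["O", "B-ENT", "I-ENT"]
    model = getBiLSTMCRFModel(MAX_LEN=None, vocab=vocab, EMBED_DIM=60,
                              BiRNN_UNITS=120, chunk_tags=chunk_tags, weights=None)
    x = np.random.randint(1, len(vocab), size=(8, 30))
    y = np.random.randint(0, len(chunk_tags), size=(8, 30, 1))
    model.fit(x, y, batch_size=4, epochs=1)
    return model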


def getBilstmCRF_tf(sess, MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, chunk_tags, weights):
    # Needed below for initializers.xavier_initializer and crf_log_likelihood.
    from tensorflow.contrib.layers.python.layers import initializers
    from tensorflow.contrib.crf import crf_log_likelihood

    def layer_embedding(input):
        if weights is not None:
            embedding = tf.get_variable("embedding", initializer=np.array(weights, dtype=np.float32), dtype=tf.float32)
        else:
            # Without pretrained weights the variable needs an explicit shape.
            embedding = tf.get_variable("embedding", shape=(len(vocab), EMBED_DIM), dtype=tf.float32)
        return tf.nn.embedding_lookup(params=embedding, ids=input)

    def layer_bilstm(input, length):
        with tf.variable_scope("bilstm"):
            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_UNITS, state_is_tuple=True)
            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_UNITS, state_is_tuple=True)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell, backward_cell, input, dtype=tf.float32, sequence_length=length)
            # Concatenate forward and backward outputs per timestep -> [batch, time, 2*BiRNN_UNITS].
            return tf.concat(outputs, axis=-1)

    def layer_project(input, keep_prob, num_tags, batch_size, time_step, BiRNN_UNITS):
        with tf.variable_scope("project"):
            with tf.variable_scope("hidden"):
                w_hidden = tf.get_variable(name="w_hidden", shape=(BiRNN_UNITS * 2, BiRNN_UNITS), dtype=tf.float32, initializer=initializers.xavier_initializer(), regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_hidden = tf.get_variable(name="b_hidden", shape=(BiRNN_UNITS), dtype=tf.float32, initializer=tf.zeros_initializer())
                _reshape = tf.reshape(input, shape=(-1, BiRNN_UNITS * 2))
                _hidden = tf.tanh(tf.nn.xw_plus_b(_reshape, w_hidden, b_hidden))
                # keep_prob follows tf.nn.dropout semantics (probability of keeping a unit).
                dropout_hidden = tf.nn.dropout(_hidden, keep_prob)
            with tf.variable_scope("out"):
                w_out = tf.get_variable(name="w_out", shape=(BiRNN_UNITS, num_tags), dtype=tf.float32, initializer=initializers.xavier_initializer(), regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_out = tf.get_variable(name="b_out", shape=(num_tags), dtype=tf.float32, initializer=tf.zeros_initializer())
                _pred = tf.nn.xw_plus_b(dropout_hidden, w_out, b_out)
            return tf.reshape(_pred, shape=(-1, time_step, num_tags), name="logits")

    def layer_loss(input, target, length, chunk_tags, batch_size, step_size):
        with tf.variable_scope("crf_loss"):
            # Variant with an explicit start transition (disabled):
            # small = -1000
            # start_logits = tf.concat([small*tf.ones(shape=(batch_size,1,chunk_tags)),tf.zeros(shape=(batch_size,1,1))],axis=-1)
            # new_input = tf.concat([input,small*tf.ones(shape=(batch_size,step_size,1))],axis=-1)
            # new_input = tf.concat([start_logits,new_input],axis=1)
            # new_target = tf.concat([chunk_tags*tf.ones(shape=(batch_size,1),dtype=tf.int32),target],axis=-1)
            # trans = tf.get_variable(name="transitions",shape=(chunk_tags+1,chunk_tags+1),initializer=initializers.xavier_initializer())
            # log_likelihood,trans = crf_log_likelihood(inputs=new_input,tag_indices=new_target,transition_params=trans,sequence_lengths=length+1)
            trans = tf.get_variable(name="transitions", shape=(chunk_tags, chunk_tags), initializer=initializers.xavier_initializer())
            log_likelihood, trans = crf_log_likelihood(inputs=input, tag_indices=target, transition_params=trans, sequence_lengths=length)
            return tf.reduce_mean(-log_likelihood), trans

    with sess.graph.as_default():
        char_input = tf.placeholder(name="char_input", shape=(None, None), dtype=tf.int32)
        target = tf.placeholder(name="target", shape=(None, None), dtype=tf.int32)
        length = tf.placeholder(name="lengths", shape=(None,), dtype=tf.int32)
        keepprob = tf.placeholder(name="keepprob", dtype=tf.float32)
        _embedding = layer_embedding(char_input)
        _shape = tf.shape(char_input)
        batch_size = _shape[0]
        step_size = _shape[-1]
        bilstm = layer_bilstm(_embedding, length)
        print(bilstm)
        _logits = layer_project(bilstm, keepprob, len(chunk_tags), batch_size, step_size, BiRNN_UNITS)
        crf_loss, trans = layer_loss(_logits, target, length, len(chunk_tags), batch_size, step_size)
        global_step = tf.Variable(0, trainable=False)
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(0.002)
            grads_vars = opt.compute_gradients(crf_loss)
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
            train_op = opt.apply_gradients(capped_grads_vars, global_step)
        return char_input, _logits, target, length, keepprob, crf_loss, trans, train_op
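

# A hedged single-step training sketch (not in the original file): the session, the toy
# vocabulary/tag sizes and the random batch are assumptions, meant only to show how the
# returned placeholders and ops fit together.
def _demo_bilstm_crf_tf_step():
    sess = tf.Session()
    char_input, logits, target, length, keepprob, crf_loss, trans, train_op = getBilstmCRF_tf(
        sess, MAX_LEN=None, vocab=list(range(100)), EMBED_DIM=60,
        BiRNN_UNITS=100, chunk_tags=list(range(5)), weights=None)
    with sess.graph.as_default():
        sess.run(tf.global_variables_initializer())
        x = np.random.randint(1, 100, size=(4, 30)).astype(np.int32)
        y = np.random.randint(0, 5, size=(4, 30)).astype(np.int32)
        lens = np.full((4,), 30, dtype=np.int32)
        loss, _ = sess.run([crf_loss, train_op],
                           feed_dict={char_input: x, target: y, length: lens, keepprob: 0.8})
    return loss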


if __name__ == "__main__":
    getTextCNNModel()