# models.py

'''
Created on 2019-04-22
@author: User
'''
from keras import models, layers, losses, optimizers
from keras.callbacks import ModelCheckpoint
import numpy as np
from BiddingKG.dl.common.Utils import *
import keras.backend as K
import tensorflow as tf
import math
import six
def getTextCNNModel(input_shape, vocab, embedding_weights, classes):

    def resize(x):
        # split the sequence dimension in half: [B, L, C] -> two [B, L//2, C] tensors
        _shape = shape_list(x)
        print("#$", _shape)
        x1 = tf.reshape(x, [_shape[0], 2, _shape[1] // 2, _shape[2]])
        print("--")
        x2 = tf.transpose(x1, [1, 0, 2, 3])
        x_l, x_r = tf.split(x2, [1, 1], axis=0)
        return [tf.squeeze(x_l, axis=0), tf.squeeze(x_r, axis=0)]

    def resize_input(x):
        # split the id sequence into three equal parts: [B, L] -> three [B, L//3] tensors
        _shape = shape_list(x)
        x1 = tf.reshape(x, [_shape[0], 3, _shape[1] // 3])
        x2 = tf.transpose(x1, [1, 0, 2])
        x_l, x_c, x_r = tf.split(x2, [1, 1, 1], axis=0)
        return [tf.squeeze(x_l, axis=0), tf.squeeze(x_c, axis=0), tf.squeeze(x_r, axis=0)]

    # assert len(input_shape)==3
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],), dtype=tf.int32, name="input%d" % (i)))
    print("list_input", list_input)
    list_embedding = []
    # if len(list_input)==1:
    #     list_resizeinput = layers.Lambda(resize_input)(list_input[0])
    # else:
    concat_input = layers.Lambda(lambda x: tf.concat(x, axis=-1))(list_input)
    embedding_input = [concat_input]
    # embedding_input = list_input
    embedding = layers.Embedding(len(vocab), input_shape[2],
                                 weights=[embedding_weights] if embedding_weights is not None else None,
                                 trainable=True, name="char_embeding")
    for i in range(len(embedding_input)):
        print(i)
        list_embedding.append(embedding(embedding_input[i]))
    print(list_embedding)

    set_variables = set()
    for v in tf.trainable_variables():
        set_variables.add(v.name)
    list_bert = []
    for i in range(len(list_embedding)):
        for v in tf.trainable_variables():
            set_variables.add(v.name)
        bert_layer = layers.Lambda(lambda x: transformer_model(input_tensor=x, name="bert%d" % (i)),
                                   trainable=True, name="bert%d" % (i))
        list_bert.append(bert_layer(list_embedding[i]))
        # register the transformer variables created inside the Lambda as trainable weights
        bert_weights = []
        for v in tf.trainable_variables():
            if v.name not in set_variables:
                print("++++", v.name)
                bert_weights.append(v)
        bert_layer._trainable_weights = bert_weights

    _resize = layers.Lambda(lambda x: resize(x))(list_bert[0])
    list_w2v = _resize
    list_conv = []
    list_kernel = [2, 5, 8]
    for i in range(len(list_w2v)):
        list_temp = []
        for kernel in list_kernel:
            list_temp.append(layers.Conv1D(10, kernel, strides=1, padding="same", activation="relu")(list_w2v[i]))
        # layers.merge(..., mode="concat") is Keras 1 API; layers.concatenate is the Keras 2 equivalent
        list_conv.append(layers.concatenate(list_temp))
    list_matrix = []
    for i in range(len(list_conv)):
        list_matrix.append(layers.Dense(12, activation="relu")(list_conv[i]))
    if len(list_matrix) > 1:
        ave = layers.concatenate(list_matrix)
        dropout = layers.Dropout(0.3)(ave)
    else:
        dropout = layers.Dropout(0.3)(list_matrix[0])
    flatten = layers.Flatten()(dropout)
    matrix = layers.Dense(classes * 10, activation="relu")(flatten)
    out = layers.Dense(classes, activation="softmax")(matrix)
    model = models.Model(list_input, out)
    model.compile(optimizer=optimizers.Adadelta(), loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
class Attention(layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # W: (EMBED_SIZE, 1)
        # b: (MAX_TIMESTEPS, 1)
        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        self.u = self.add_weight(name="u_{:s}".format(self.name),
                                 shape=(input_shape[1], input_shape[1]),
                                 initializer="normal")
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # et: (BATCH_SIZE, MAX_TIMESTEPS)
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # at: (BATCH_SIZE, MAX_TIMESTEPS)
        at = K.dot(et, self.u)
        at = K.exp(at)
        if mask is not None:
            at *= K.cast(mask, K.floatx())
        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        atx = K.expand_dims(at, axis=-1)
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE)
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def compute_output_shape(self, input_shape):
        # output shape: (BATCH_SIZE, EMBED_SIZE)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        return super(Attention, self).get_config()
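
# Illustrative usage sketch (not part of the original pipeline): how the
# Attention layer above typically sits on top of a recurrent encoder, collapsing
# the timestep axis into a single vector. The vocab size, sequence length, and
# layer widths below are placeholder assumptions, not values from this project.
def _demo_attention_usage():
    inp = layers.Input(shape=(100,), dtype=tf.int32)  # (BATCH, MAX_TIMESTEPS)
    emb = layers.Embedding(5000, 60)(inp)             # (BATCH, MAX_TIMESTEPS, EMBED_SIZE)
    seq = layers.Bidirectional(layers.LSTM(30, return_sequences=True))(emb)
    vec = Attention()(seq)                            # (BATCH, 60) -- timesteps collapsed
    out = layers.Dense(3, activation="softmax")(vec)
    return models.Model(inp, out)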
def gelu(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf
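
# Quick sanity-check sketch (illustrative, assumes a TF1-style session): the
# tanh approximation above should closely track the exact GELU x * Phi(x),
# where Phi is the standard normal CDF written via tf.erf.
def _demo_gelu():
    x = tf.constant([-2.0, -1.0, 0.0, 1.0, 2.0])
    exact = x * 0.5 * (1.0 + tf.erf(x / tf.sqrt(2.0)))  # exact CDF form
    with tf.Session() as sess:
        approx_v, exact_v = sess.run([gelu(x), exact])
    print(approx_v, exact_v)  # the two vectors agree to roughly 1e-3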
def shape_list(x):
    """Return list of dims, statically where possible."""
    x = tf.convert_to_tensor(x)
    # If unknown rank, return dynamic shape
    if x.get_shape().dims is None:
        return tf.shape(x)
    static = x.get_shape().as_list()
    shape = tf.shape(x)
    ret = []
    for i in range(len(static)):
        dim = static[i]
        if dim is None:
            dim = shape[i]
        ret.append(dim)
    return ret
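
# Illustrative sketch (placeholder shape is an assumption): shape_list mixes
# static python ints with dynamic scalar tensors, which is what lets the
# resize()/resize_input() helpers above reshape on a batch dimension that is
# unknown at graph-construction time.
def _demo_shape_list():
    x = tf.placeholder(tf.float32, shape=[None, 100, 60])
    print(shape_list(x))  # [<dynamic batch Tensor>, 100, 60]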
def get_timing_signal_1d(length,
                         channels,
                         min_timescale=1.0,
                         max_timescale=1.0e4,
                         start_index=0):
    """Gets a bunch of sinusoids of different frequencies.
    Each channel of the input Tensor is incremented by a sinusoid of a different
    frequency and phase.
    This allows attention to learn to use absolute and relative positions.
    Timing signals should be added to some precursors of both the query and the
    memory inputs to attention.
    The use of relative position is possible because sin(x+y) and cos(x+y) can be
    expressed in terms of y, sin(x) and cos(x).
    In particular, we use a geometric sequence of timescales starting with
    min_timescale and ending with max_timescale. The number of different
    timescales is equal to channels / 2. For each timescale, we
    generate the two sinusoidal signals sin(timestep/timescale) and
    cos(timestep/timescale). All of these sinusoids are concatenated in
    the channels dimension.
    Args:
        length: scalar, length of timing signal sequence.
        channels: scalar, size of timing embeddings to create. The number of
            different timescales is equal to channels / 2.
        min_timescale: a float
        max_timescale: a float
        start_index: index of first position
    Returns:
        a Tensor of timing signals [1, length, channels]
    """
    position = tf.to_float(tf.range(length) + start_index)
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (tf.to_float(num_timescales) - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
    signal = tf.reshape(signal, [1, length, channels])
    return signal
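
# Illustrative sketch (length/channels values are assumptions): the signal is
# [1, length, channels], with the first channels//2 columns holding
# sin(t/timescale) and the remaining columns cos(t/timescale).
def _demo_timing_signal():
    signal = get_timing_signal_1d(length=100, channels=60)
    with tf.Session() as sess:
        s = sess.run(signal)
    print(s.shape)      # (1, 100, 60)
    print(s[0, 0, :3])  # position 0: sin(0) = 0 for every timescale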
def add_timing_signal_1d(x,
                         min_timescale=1.0,
                         max_timescale=1.0e4,
                         start_index=0):
    """Concatenates a bunch of sinusoids of different frequencies to a Tensor.
    Each channel of the input Tensor is paired with a sinusoid of a different
    frequency and phase.
    This allows attention to learn to use absolute and relative positions.
    Timing signals should be added to some precursors of both the query and the
    memory inputs to attention.
    The use of relative position is possible because sin(x+y) and cos(x+y) can be
    expressed in terms of y, sin(x) and cos(x).
    In particular, we use a geometric sequence of timescales starting with
    min_timescale and ending with max_timescale. The number of different
    timescales is equal to channels / 2. For each timescale, we
    generate the two sinusoidal signals sin(timestep/timescale) and
    cos(timestep/timescale). All of these sinusoids are concatenated in
    the channels dimension.
    Args:
        x: a Tensor with shape [batch, length, channels]
        min_timescale: a float
        max_timescale: a float
        start_index: index of first position
    Returns:
        a Tensor of shape [batch, length, 2*channels]. Note that unlike the
        upstream version, this implementation concatenates the timing signal
        along the channel axis rather than adding it, so the channel count
        doubles.
    """
    batchs = shape_list(x)[0]
    length = shape_list(x)[1]
    channels = shape_list(x)[2]
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale,
                                  start_index)
    _signal = tf.tile(signal, [batchs, 1, 1])
    _concat = tf.concat([x, _signal], axis=2)
    # _concat = tf.quantized_concat(2,[x,signal],input_mins=[-10,-10],input_maxes=[10,10])[0]
    print("##", _concat)
    return _concat
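
# Illustrative sketch (shapes are assumptions): because this variant
# concatenates rather than adds, a [batch, length, channels] input comes out as
# [batch, length, 2*channels]. That is why the 60-dim char embeddings in this
# file pair with transformer_model's default hidden_size of 120.
def _demo_add_timing_signal():
    x = tf.zeros([2, 100, 60])
    y = add_timing_signal_1d(x, max_timescale=1000)
    print(y.get_shape().as_list())  # [2, 100, 120]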
def get_activation(activation_string):
    """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
    Args:
        activation_string: String name of the activation function.
    Returns:
        A Python function corresponding to the activation function. If
        `activation_string` is None, empty, or "linear", this will return None.
        If `activation_string` is not a string, it will return `activation_string`.
    Raises:
        ValueError: The `activation_string` does not correspond to a known
            activation.
    """
    # We assume that anything that's not a string is already an activation
    # function, so we just return it.
    if not isinstance(activation_string, six.string_types):
        return activation_string
    if not activation_string:
        return None
    act = activation_string.lower()
    if act == "linear":
        return None
    elif act == "relu":
        return tf.nn.relu
    elif act == "gelu":
        return gelu
    elif act == "tanh":
        return tf.tanh
    else:
        raise ValueError("Unsupported activation: %s" % act)
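
# Illustrative sketch of the mapping above; the inputs are arbitrary examples.
def _demo_get_activation():
    assert get_activation("gelu") is gelu
    assert get_activation("relu") is tf.nn.relu
    assert get_activation("linear") is None
    assert get_activation(tf.tanh) is tf.tanh  # non-strings pass through unchanged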
def dropout(input_tensor, dropout_prob):
    """Perform dropout.
    Args:
        input_tensor: float Tensor.
        dropout_prob: Python float. The probability of dropping out a value (NOT of
            *keeping* a dimension as in `tf.nn.dropout`).
    Returns:
        A version of `input_tensor` with dropout applied.
    """
    if dropout_prob is None or dropout_prob == 0.0:
        return input_tensor
    output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
    return output

def layer_norm(input_tensor, name=None):
    """Run layer normalization on the last dimension of the tensor."""
    return tf.contrib.layers.layer_norm(
        inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)

def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
    """Runs layer normalization followed by dropout."""
    output_tensor = layer_norm(input_tensor, name)
    output_tensor = dropout(output_tensor, dropout_prob)
    return output_tensor

def create_initializer(initializer_range=0.02):
    """Creates a `truncated_normal_initializer` with the given range."""
    return tf.truncated_normal_initializer(stddev=initializer_range)
def reshape_to_matrix(input_tensor):
    """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
    ndims = input_tensor.shape.ndims
    if ndims < 2:
        raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                         (input_tensor.shape))
    if ndims == 2:
        return input_tensor
    width = input_tensor.shape[-1]
    output_tensor = tf.reshape(input_tensor, [-1, width])
    return output_tensor

def reshape_from_matrix(output_tensor, orig_shape_list):
    """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
    if len(orig_shape_list) == 2:
        return output_tensor
    output_shape = shape_list(output_tensor)
    orig_dims = orig_shape_list[0:-1]
    width = output_shape[-1]
    return tf.reshape(output_tensor, orig_dims + [width])
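
# Illustrative round trip (shapes are assumptions): attention_layer below works
# on rank-2 [batch*seq, width] matrices and restores rank 3 afterwards.
def _demo_reshape_round_trip():
    x = tf.zeros([2, 100, 60])
    orig_shape = get_shape_list(x)
    m = reshape_to_matrix(x)                # [200, 60]
    y = reshape_from_matrix(m, orig_shape)  # [2, 100, 60]
    print(m.get_shape().as_list(), y.get_shape().as_list())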
def assert_rank(tensor, expected_rank, name=None):
    """Raises an exception if the tensor rank is not of the expected rank.
    Args:
        tensor: A tf.Tensor to check the rank of.
        expected_rank: Python integer or list of integers, expected rank.
        name: Optional name of the tensor for the error message.
    Raises:
        ValueError: If the expected shape doesn't match the actual shape.
    """
    if name is None:
        name = tensor.name
    expected_rank_dict = {}
    if isinstance(expected_rank, six.integer_types):
        expected_rank_dict[expected_rank] = True
    else:
        for x in expected_rank:
            expected_rank_dict[x] = True
    actual_rank = tensor.shape.ndims
    if actual_rank not in expected_rank_dict:
        scope_name = tf.get_variable_scope().name
        raise ValueError(
            "For the tensor `%s` in scope `%s`, the actual rank "
            "`%d` (shape = %s) is not equal to the expected rank `%s`" %
            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))

def get_shape_list(tensor, expected_rank=None, name=None):
    """Returns a list of the shape of tensor, preferring static dimensions.
    Args:
        tensor: A tf.Tensor object to find the shape of.
        expected_rank: (optional) int. The expected rank of `tensor`. If this is
            specified and the `tensor` has a different rank, an exception will be
            thrown.
        name: Optional name of the tensor for the error message.
    Returns:
        A list of dimensions of the shape of tensor. All static dimensions will
        be returned as python integers, and dynamic dimensions will be returned
        as tf.Tensor scalars.
    """
    if name is None:
        name = tensor.name
    if expected_rank is not None:
        assert_rank(tensor, expected_rank, name)
    shape = tensor.shape.as_list()
    non_static_indexes = []
    for (index, dim) in enumerate(shape):
        if dim is None:
            non_static_indexes.append(index)
    if not non_static_indexes:
        return shape
    dyn_shape = tf.shape(tensor)
    for index in non_static_indexes:
        shape[index] = dyn_shape[index]
    return shape
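
# Illustrative sketch (placeholder shape is an assumption): static dims come
# back as python ints, the unknown batch dim as a scalar tensor, and a wrong
# expected_rank raises a ValueError via assert_rank.
def _demo_get_shape_list():
    x = tf.placeholder(tf.float32, shape=[None, 100, 60], name="demo_x")
    batch, seq, width = get_shape_list(x, expected_rank=3)
    print(type(batch), seq, width)  # <class '...Tensor'> 100 60
    try:
        get_shape_list(x, expected_rank=2)
    except ValueError as e:
        print(e)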
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=10,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.
    This is an implementation of multi-headed attention based on "Attention
    Is All You Need". If `from_tensor` and `to_tensor` are the same, then
    this is self-attention. Each timestep in `from_tensor` attends to the
    corresponding sequence in `to_tensor`, and returns a fixed-width vector.
    This function first projects `from_tensor` into a "query" tensor and
    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
    of tensors of length `num_attention_heads`, where each tensor is of shape
    [batch_size, seq_length, size_per_head].
    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities, then concatenated back to a single
    tensor and returned.
    In practice, the multi-headed attention is done with transposes and
    reshapes rather than actual separate tensors.
    Args:
        from_tensor: float Tensor of shape [batch_size, from_seq_length,
            from_width].
        to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
        attention_mask: (optional) int32 Tensor of shape [batch_size,
            from_seq_length, to_seq_length]. The values should be 1 or 0. The
            attention scores will effectively be set to -infinity for any positions
            in the mask that are 0, and will be unchanged for positions that are 1.
        num_attention_heads: int. Number of attention heads.
        size_per_head: int. Size of each attention head.
        query_act: (optional) Activation function for the query transform.
        key_act: (optional) Activation function for the key transform.
        value_act: (optional) Activation function for the value transform.
        attention_probs_dropout_prob: (optional) float. Dropout probability of the
            attention probabilities.
        initializer_range: float. Range of the weight initializer.
        do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
            * from_seq_length, num_attention_heads * size_per_head]. If False, the
            output will be of shape [batch_size, from_seq_length,
            num_attention_heads * size_per_head].
        batch_size: (Optional) int. If the input is 2D, this might be the batch
            size of the 3D version of the `from_tensor` and `to_tensor`.
        from_seq_length: (Optional) If the input is 2D, this might be the seq
            length of the 3D version of the `from_tensor`.
        to_seq_length: (Optional) If the input is 2D, this might be the seq length
            of the 3D version of the `to_tensor`.
    Returns:
        float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
        true, this will be of shape [batch_size * from_seq_length,
        num_attention_heads * size_per_head]).
    Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")
    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`
    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    '''
    query_matrix = tf.get_variable(name="query",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    query_layer = tf.matmul(from_tensor_2d,query_matrix)
    if query_act is not None:
        query_layer = query_act(query_layer)
    key_matrix = tf.get_variable(name="key",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    key_layer = tf.matmul(from_tensor_2d,key_matrix)
    if key_act is not None:
        key_layer = key_act(key_layer)
    value_matrix = tf.get_variable(name="value",shape=(shape_list(from_tensor_2d)[-1],num_attention_heads * size_per_head),initializer=create_initializer(initializer_range))
    value_layer = tf.matmul(from_tensor_2d,value_matrix)
    if value_act is not None:
        value_layer = value_act(value_layer)
    '''
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))
    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))
    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))
    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)
    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                     to_seq_length, size_per_head)
    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))
    print(attention_scores)
    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder
    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    # attention_scores = tf.reshape(attention_scores,[batch_size,num_attention_heads,from_seq_length,to_seq_length])
    attention_probs = tf.nn.softmax(attention_scores)
    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
    # `value_layer` = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])
    # `value_layer` = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
    # `context_layer` = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)
    # `context_layer` = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_seq_length, num_attention_heads * size_per_head])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])
    return context_layer
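
# Illustrative self-attention call (all sizes are assumptions): with
# do_return_2d_tensor=True the result is [batch*from_seq, heads*size_per_head],
# matching how transformer_model below keeps activations as matrices.
def _demo_attention_layer():
    x = tf.zeros([2, 100, 120])
    with tf.variable_scope("demo_attn"):
        ctx = attention_layer(from_tensor=x, to_tensor=x,
                              num_attention_heads=6, size_per_head=20,
                              do_return_2d_tensor=True,
                              batch_size=2, from_seq_length=100, to_seq_length=100)
    print(ctx.get_shape().as_list())  # [200, 120]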
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=120,
                      num_hidden_layers=1,
                      num_attention_heads=6,
                      intermediate_size=256,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      name=None):
    """Multi-headed, multi-layer Transformer encoder over `input_tensor`,
    with sinusoidal timing signals concatenated to the input first."""
    input_tensor = add_timing_signal_1d(input_tensor, max_timescale=1000)
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))
    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]
    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                         (input_width, hidden_size))
    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)
    all_layer_outputs = []
    if name is not None:
        _name = str(name) + "encoder"
    else:
        _name = "encoder"
    with tf.variable_scope(_name, reuse=tf.AUTO_REUSE):
        for layer_idx in range(num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer_idx, reuse=tf.AUTO_REUSE):
                layer_input = prev_output
                with tf.variable_scope("attention", reuse=tf.AUTO_REUSE):
                    attention_heads = []
                    with tf.variable_scope("self", reuse=tf.AUTO_REUSE):
                        attention_head = attention_layer(
                            from_tensor=layer_input,
                            to_tensor=layer_input,
                            attention_mask=attention_mask,
                            num_attention_heads=num_attention_heads,
                            size_per_head=attention_head_size,
                            attention_probs_dropout_prob=attention_probs_dropout_prob,
                            initializer_range=initializer_range,
                            do_return_2d_tensor=True,
                            batch_size=batch_size,
                            from_seq_length=seq_length,
                            to_seq_length=seq_length)
                        attention_heads.append(attention_head)
                    attention_output = None
                    if len(attention_heads) == 1:
                        attention_output = attention_heads[0]
                    else:
                        # In the case where we have other sequences, we just concatenate
                        # them to the self-attention head before the projection.
                        attention_output = tf.concat(attention_heads, axis=-1)
                    # Run a linear projection of `hidden_size` then add a residual
                    # with `layer_input`.
                    with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
                        attention_output = tf.layers.dense(
                            attention_output,
                            hidden_size,
                            kernel_initializer=create_initializer(initializer_range))
                        attention_output = dropout(attention_output, hidden_dropout_prob)
                        attention_output = layer_norm(attention_output + layer_input)
                # The activation is only applied to the "intermediate" hidden layer.
                with tf.variable_scope("intermediate", reuse=tf.AUTO_REUSE):
                    intermediate_output = tf.layers.dense(
                        attention_output,
                        intermediate_size,
                        activation=intermediate_act_fn,
                        kernel_initializer=create_initializer(initializer_range))
                # Down-project back to `hidden_size` then add the residual.
                with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
                    layer_output = tf.layers.dense(
                        intermediate_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range))
                    layer_output = dropout(layer_output, hidden_dropout_prob)
                    layer_output = layer_norm(layer_output + attention_output)
                    prev_output = layer_output
                    all_layer_outputs.append(layer_output)
    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
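
# Illustrative end-to-end call (sizes are assumptions): a 60-dim sequence gains
# 60 timing channels inside transformer_model, so hidden_size must equal
# 2 * embedding_dim with the defaults used in this file.
def _demo_transformer_model():
    emb = tf.zeros([2, 100, 60])
    out = transformer_model(input_tensor=emb, hidden_size=120,
                            num_hidden_layers=1, num_attention_heads=6,
                            name="demo")
    print(out.get_shape().as_list())  # [2, 100, 120]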
# Earlier, transformer-based version kept for reference:
# def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
#     assert len(input_shape)==3
#     list_input = []
#     for i in range(input_shape[0]):
#         list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32))
#     print(list_input)
#     list_embedding = []
#     embedding = layers.Embedding(len(vocab),input_shape[2],weights=[embedding_weights] if embedding_weights is not None else None,trainable=True,name="char_embeding")
#     for i in range(len(list_input)):
#         list_embedding.append(embedding(list_input[i]))
#     bert_layer = layers.Lambda(transformer_model,trainable=True,name="bert")
#     set_variables = set()
#     for v in tf.trainable_variables():
#         set_variables.add(v.name)
#     list_bert = []
#     for i in range(len(list_embedding)):
#         list_bert.append(bert_layer(list_embedding[i]))
#     # set bert_weights to trainable
#     bert_weights = []
#     for v in tf.trainable_variables():
#         if v.name not in set_variables:
#             bert_weights.append(v)
#     bert_layer._trainable_weights = bert_weights
#     list_w2v = list_bert
#     list_lstm = []
#     if use_am:
#         for i in range(len(list_w2v)):
#             list_lstm.append(Attention()(layers.Bidirectional(layers.LSTM(120,activation="relu",return_sequences=True))(list_w2v[i])))
#     else:
#         for i in range(len(list_w2v)):
#             list_lstm.append(layers.Bidirectional(layers.LSTM(24,activation="relu"))(list_w2v[i]))
#     list_matrix = []
#     for i in range(len(list_lstm)):
#         list_matrix.append(layers.Dense(classes*2,activation="relu")(list_lstm[i]))
#     if len(list_matrix)>1:
#         ave = layers.merge(list_matrix,mode="concat")
#         dropout = layers.Dropout(0.4)(ave)
#     else:
#         dropout = layers.Dropout(0.4)(list_matrix[0])
#     matrix = layers.Dense(classes*10,activation="sigmoid")(dropout)
#     out = layers.Dense(classes,activation="softmax")(matrix)
#     model = models.Model(list_input,out)
#     model.compile(optimizer=optimizers.Adam(lr=0.00002),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
#     model.summary()
#     return model
def getBiLSTMModel(input_shape, vocab, embedding_weights, classes, use_am=False):

    def resize(x):
        # split the sequence into three equal parts: [B, L, C] -> three [B, L//3, C] tensors
        _shape = shape_list(x)
        print("#$", _shape)
        x1 = tf.reshape(x, [_shape[0], 3, _shape[1] // 3, _shape[2]])
        print("--")
        x2 = tf.transpose(x1, [1, 0, 2, 3])
        x_l, x_c, x_r = tf.split(x2, [1, 1, 1], axis=0)
        return [tf.squeeze(x_l, axis=0), tf.squeeze(x_c, axis=0), tf.squeeze(x_r, axis=0)]

    def resize_input(x):
        # split the id sequence into three equal parts: [B, L] -> three [B, L//3] tensors
        _shape = shape_list(x)
        x1 = tf.reshape(x, [_shape[0], 3, _shape[1] // 3])
        x2 = tf.transpose(x1, [1, 0, 2])
        x_l, x_c, x_r = tf.split(x2, [1, 1, 1], axis=0)
        return [tf.squeeze(x_l, axis=0), tf.squeeze(x_c, axis=0), tf.squeeze(x_r, axis=0)]

    # assert len(input_shape)==3
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],), dtype=tf.int32, name="input%d" % (i)))
    print("list_input", list_input)
    list_embedding = []
    # if len(list_input)==1:
    #     list_resizeinput = layers.Lambda(resize_input)(list_input[0])
    # else:
    #     concat_input = layers.Lambda(lambda x:tf.concat(x,axis=-1))(list_input)
    #     embedding_input = [concat_input]
    embedding_input = list_input
    embedding = layers.Embedding(len(vocab), input_shape[2],
                                 weights=[embedding_weights] if embedding_weights is not None else None,
                                 trainable=True, name="char_embeding")
    for i in range(len(embedding_input)):
        print(i)
        list_embedding.append(embedding(embedding_input[i]))
    print(list_embedding)

    set_variables = set()
    for v in tf.trainable_variables():
        set_variables.add(v.name)
    # (disabled) per-input transformer encoders:
    # list_bert = []
    # for i in range(len(list_embedding)):
    #     for v in tf.trainable_variables():
    #         set_variables.add(v.name)
    #     bert_layer = layers.Lambda(lambda x:transformer_model(input_tensor=x,name="bert%d"%(i)),trainable=True,name="bert%d"%(i))
    #     list_bert.append(bert_layer(list_embedding[i]))
    #     # set bert_weights to trainable
    #     bert_weights = []
    #     for v in tf.trainable_variables():
    #         if v.name not in set_variables:
    #             print("++++",v.name)
    #             bert_weights.append(v)
    #     bert_layer._trainable_weights = bert_weights
    # (disabled) a single shared transformer encoder:
    # bert_layer = layers.Lambda(lambda x:transformer_model(input_tensor=x,name="bert%d"%(i)),trainable=True,name="bert%d"%(0))
    # list_bert = []
    # for i in range(len(list_embedding)):
    #     list_bert.append(bert_layer(list_embedding[i]))
    # # set bert_weights to trainable
    # bert_weights = []
    # for v in tf.trainable_variables():
    #     if v.name not in set_variables:
    #         print("++++",v.name)
    #         bert_weights.append(v)
    # bert_layer._trainable_weights = bert_weights
    # print("##",list_bert)
    # (disabled) convolutional context embeddings:
    # context_embedding = []
    # list_kernel = [5,8]
    # for i in range(len(list_bert)):
    #     list_temp = []
    #     for kernel in list_kernel:
    #         list_temp.append(layers.Conv1D(3,kernel,strides=1,padding="same",activation="relu")(list_bert[i]))
    #     context_embedding.append(layers.Dense(12,activation="relu")(layers.Flatten()(layers.merge(list_temp,mode="concat"))))
    # context_embedding = [layers.GlobalMaxPool1D()(item) for item in list_bert]
    # _resize = layers.Lambda(lambda x:resize(x))(list_bert[0])
    list_w2v = list_embedding
    list_lstm = []
    if use_am:
        for i in range(len(list_w2v)):
            list_lstm.append(Attention()(layers.Bidirectional(
                layers.LSTM(120, activation="relu", return_sequences=True))(list_w2v[i])))
    else:
        for i in range(len(list_w2v)):
            list_lstm.append(layers.Bidirectional(layers.LSTM(24, activation="relu"))(list_w2v[i]))
    # list_avg = []
    # for i in range(len(list_lstm)):
    #     list_avg.append(layers.GlobalAveragePooling1D()(list_lstm[i]))
    # list_matrix = []
    # for i in range(len(list_lstm)):
    #     list_matrix.append(layers.Dense(12,activation="relu")(list_lstm[i]))
    # list_matrix.extend(context_embedding)
    if len(list_lstm) > 1:
        # layers.merge(..., mode="concat") is Keras 1 API; layers.concatenate is the Keras 2 equivalent
        ave = layers.concatenate(list_lstm)
        dropout = layers.Dropout(0.2)(ave)
    else:
        dropout = layers.Dropout(0.2)(list_lstm[0])
    matrix = layers.Dense(48, activation="tanh")(dropout)
    out = layers.Dense(classes, activation="softmax")(matrix)
    # out = layers.Dense(classes,activation="sigmoid")(dropout)
    # out = layers.Lambda(lambda x:layers.activations.softmax(x))(out)
    model = models.Model(list_input, out)
    model.compile(optimizer=optimizers.Adam(lr=0.01), loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
def getBiLSTMModel_entity(input_shape, vocab, embedding_weights, classes):
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],), dtype=tf.int32, name="input%d" % (i)))
    print("list_input", list_input)
    list_embedding = []
    embedding_input = list_input
    embedding = layers.Embedding(len(vocab), input_shape[2],
                                 weights=[embedding_weights] if embedding_weights is not None else None,
                                 trainable=True, name="char_embeding")
    for i in range(len(embedding_input)):
        print(i)
        list_embedding.append(embedding(embedding_input[i]))
    print(list_embedding)
    list_w2v = list_embedding
    list_lstm = []
    for i in range(len(list_w2v)):
        list_lstm.append(layers.Bidirectional(layers.LSTM(24, activation="relu"))(list_w2v[i]))
    # list_avg = []
    # for i in range(len(list_lstm)):
    #     list_avg.append(layers.GlobalAveragePooling1D()(list_lstm[i]))
    list_matrix = []
    for i in range(len(list_lstm)):
        list_matrix.append(layers.Dense(12, activation="relu")(list_lstm[i]))
    if len(list_matrix) > 1:
        ave = layers.concatenate(list_matrix)  # Keras 2 equivalent of layers.merge(mode="concat")
        dropout = layers.Dropout(0.2)(ave)
    else:
        dropout = layers.Dropout(0.2)(list_matrix[0])
    matrix = layers.Dense(classes * 10, activation="relu")(dropout)
    out = layers.Dense(classes, activation="softmax")(matrix)
    # out = layers.Dense(classes,activation="sigmoid")(dropout)
    # out = layers.Lambda(lambda x:layers.activations.softmax(x))(out)
    model = models.Model(list_input, out)
    model.compile(optimizer=optimizers.Adam(lr=0.00001), loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
if __name__ == "__main__":
    getTextCNNModel((3, 100, 60), [1, 2, 3, 4, 5], None, 2)
    model = getBiLSTMModel((3, 100, 256), fool_char_to_id.keys(), None, 3, use_am=False)
    # getBiLSTMModel_entity((20,20,3,100,60),[1,2,3,4,5],None,6)