From a19f50f4ea8e0f459b62326030ce9c2c232dedda Mon Sep 17 00:00:00 2001 From: jueliangguke <10181759+jueliangguke@users.noreply.github.com> Date: Wed, 29 May 2019 17:37:19 +0800 Subject: [PATCH 1/2] fix bugs about TransformerDecoder's arguments --- examples/vae_text/vae_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/vae_text/vae_train.py b/examples/vae_text/vae_train.py index 0f38ea02..2e636560 100644 --- a/examples/vae_text/vae_train.py +++ b/examples/vae_text/vae_train.py @@ -121,7 +121,6 @@ def _main(_): decoder_initial_state_size = decoder.cell.state_size elif config.decoder_hparams["type"] == 'transformer': decoder = tx.modules.TransformerDecoder( - embedding=decoder_embedder.embedding, hparams=config.trans_hparams) decoder_initial_state_size = tf.TensorShape( [1, config.dec_emb_hparams["dim"]]) @@ -292,6 +291,7 @@ def _cat_embedder(ids): memory=dcdr_states, decoding_strategy="infer_sample", memory_sequence_length=tf.ones(tf.shape(dcdr_states)[0]), + embedding=decoder_embedder, max_decoding_length=100, start_tokens=start_tokens, end_token=end_token) From 216302c9aa22f27f29a9365016126a2f82ed8eb1 Mon Sep 17 00:00:00 2001 From: Zhiting Hu Date: Thu, 30 May 2019 21:39:16 -0400 Subject: [PATCH 2/2] update examples/vae_text --- .gitignore | 3 + examples/transformer/transformer_main.py | 7 +- examples/vae_text/config_lstm_ptb.py | 4 +- examples/vae_text/config_lstm_yahoo.py | 4 +- examples/vae_text/config_trans_ptb.py | 97 +++++++++---------- examples/vae_text/config_trans_yahoo.py | 12 ++- examples/vae_text/vae_train.py | 91 ++++++++++------- texar/modules/embedders/position_embedders.py | 3 +- 8 files changed, 124 insertions(+), 97 deletions(-) diff --git a/.gitignore b/.gitignore index 29e1fbcf..d28b2e4f 100644 --- a/.gitignore +++ b/.gitignore @@ -257,6 +257,9 @@ simple-examples.tgz /examples/vae_text/simple-examples/ /examples/vae_text/data/ +/examples/vae_text/models/ +/examples/vae_text/yahoo.zip +/examples/vae_text/simple-examples.tgz /examples/transformer/data/ /examples/transformer/temp/ diff --git a/examples/transformer/transformer_main.py b/examples/transformer/transformer_main.py index ba54bb41..64aa5d67 100644 --- a/examples/transformer/transformer_main.py +++ b/examples/transformer/transformer_main.py @@ -75,8 +75,6 @@ def main(): # (text sequence length excluding padding) encoder_input_length = tf.reduce_sum( 1 - tf.to_int32(tf.equal(encoder_input, 0)), axis=1) - decoder_input_length = tf.reduce_sum( - 1 - tf.to_int32(tf.equal(decoder_input, 0)), axis=1) labels = tf.placeholder(tf.int64, shape=(None, None)) is_target = tf.to_float(tf.not_equal(labels, 0)) @@ -152,8 +150,9 @@ def main(): start_tokens = tf.fill([batch_size], bos_token_id) def _embedding_fn(x, y): - return tgt_embedder(x) * config_model.hidden_dim ** 0.5 + pos_embedder( - y) + x_w_embed = tgt_embedder(x) + y_p_embed = pos_embedder(y) + return x_w_embed * config_model.hidden_dim ** 0.5 + y_p_embed predictions = decoder( memory=encoder_output, diff --git a/examples/vae_text/config_lstm_ptb.py b/examples/vae_text/config_lstm_ptb.py index 36956abe..74c416e3 100644 --- a/examples/vae_text/config_lstm_ptb.py +++ b/examples/vae_text/config_lstm_ptb.py @@ -38,9 +38,7 @@ } -decoder_hparams = { - "type": "lstm" -} +decoder_type = 'lstm' enc_cell_hparams = { "type": "LSTMBlockCell", diff --git a/examples/vae_text/config_lstm_yahoo.py b/examples/vae_text/config_lstm_yahoo.py index 33b45433..b2439080 100644 --- a/examples/vae_text/config_lstm_yahoo.py +++ 
b/examples/vae_text/config_lstm_yahoo.py @@ -43,9 +43,7 @@ residual_dropout = 0.2 num_blocks = 3 -decoder_hparams = { - "type": "lstm" -} +decoder_type = 'lstm' enc_cell_hparams = { "type": "LSTMBlockCell", diff --git a/examples/vae_text/config_trans_ptb.py b/examples/vae_text/config_trans_ptb.py index 793d679d..975b25b5 100644 --- a/examples/vae_text/config_trans_ptb.py +++ b/examples/vae_text/config_trans_ptb.py @@ -11,13 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -"""VAE config. +"""Config file of VAE with Trasnformer decoder, on PTB data. """ # pylint: disable=invalid-name, too-few-public-methods, missing-docstring -dataset = "ptb" +dataset = 'ptb' num_epochs = 100 hidden_size = 256 dec_dropout_in = 0. @@ -29,10 +28,10 @@ latent_dims = 32 lr_decay_hparams = { - "init_lr": 0.001, - "threshold": 2, - "decay_factor": 0.5, - "max_decay": 5 + 'init_lr': 0.001, + 'threshold': 2, + 'decay_factor': 0.5, + 'max_decay': 5 } @@ -42,24 +41,22 @@ residual_dropout = 0.2 num_blocks = 3 -decoder_hparams = { - "type": "transformer" -} +decoder_type = 'transformer' enc_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. + 'type': 'LSTMBlockCell', + 'kwargs': { + 'num_units': hidden_size, + 'forget_bias': 0. }, - "dropout": {"output_keep_prob": 1. - enc_dropout_out}, - "num_layers": 1 + 'dropout': {'output_keep_prob': 1. - enc_dropout_out}, + 'num_layers': 1 } enc_emb_hparams = { 'name': 'lookup_table', - "dim": embed_dim, - "dropout_rate": enc_dropout_in, + 'dim': embed_dim, + 'dropout_rate': enc_dropout_in, 'initializer' : { 'type': 'random_normal_initializer', 'kwargs': { @@ -71,8 +68,8 @@ dec_emb_hparams = { 'name': 'lookup_table', - "dim": embed_dim, - "dropout_rate": dec_dropout_in, + 'dim': embed_dim, + 'dropout_rate': dec_dropout_in, 'initializer' : { 'type': 'random_normal_initializer', 'kwargs': { @@ -82,6 +79,11 @@ } } +max_pos = 200 # max sequence length in training data +dec_pos_emb_hparams = { + 'dim': hidden_size, +} + # due to the residual connection, the embed_dim should be equal to hidden_size trans_hparams = { 'output_layer_bias': False, @@ -89,9 +91,6 @@ 'residual_dropout': residual_dropout, 'num_blocks': num_blocks, 'dim': hidden_size, - 'position_embedder_hparams': { - 'dim': hidden_size, - }, 'initializer': { 'type': 'variance_scaling_initializer', 'kwargs': { @@ -138,48 +137,48 @@ # KL annealing kl_anneal_hparams = { - "warm_up": 10, - "start": 0.1 + 'warm_up': 10, + 'start': 0.1 } train_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": './simple-examples/data/ptb.train.txt', - "vocab_file": './simple-examples/data/vocab.txt' + 'num_epochs': 1, + 'batch_size': batch_size, + 'seed': 123, + 'dataset': { + 'files': './simple-examples/data/ptb.train.txt', + 'vocab_file': './simple-examples/data/vocab.txt' } } val_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": './simple-examples/data/ptb.valid.txt', - "vocab_file": './simple-examples/data/vocab.txt' + 'num_epochs': 1, + 'batch_size': batch_size, + 'seed': 123, + 'dataset': { + 'files': './simple-examples/data/ptb.valid.txt', + 'vocab_file': './simple-examples/data/vocab.txt' } } test_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "dataset": { - "files": './simple-examples/data/ptb.test.txt', - "vocab_file": 
'./simple-examples/data/vocab.txt' + 'num_epochs': 1, + 'batch_size': batch_size, + 'dataset': { + 'files': './simple-examples/data/ptb.test.txt', + 'vocab_file': './simple-examples/data/vocab.txt' } } opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.001 + 'optimizer': { + 'type': 'AdamOptimizer', + 'kwargs': { + 'learning_rate': 0.001 } }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} + 'gradient_clip': { + 'type': 'clip_by_global_norm', + 'kwargs': {'clip_norm': 5.} } } diff --git a/examples/vae_text/config_trans_yahoo.py b/examples/vae_text/config_trans_yahoo.py index d2b031ad..c081a459 100644 --- a/examples/vae_text/config_trans_yahoo.py +++ b/examples/vae_text/config_trans_yahoo.py @@ -42,9 +42,7 @@ residual_dropout = 0.2 num_blocks = 3 -decoder_hparams = { - "type": "transformer" -} +decoder_type = 'transformer' enc_cell_hparams = { "type": "LSTMBlockCell", @@ -82,6 +80,12 @@ } } + +max_pos = 200 # max sequence length in training data +dec_pos_emb_hparams = { + 'dim': hidden_size, +} + # due to the residual connection, the embed_dim should be equal to hidden_size trans_hparams = { 'output_layer_bias': False, @@ -134,7 +138,7 @@ } # KL annealing -kl_anneal_hparams={ +kl_anneal_hparams = { "warm_up": 10, "start": 0.1 } diff --git a/examples/vae_text/vae_train.py b/examples/vae_text/vae_train.py index 2e636560..da0b4b72 100644 --- a/examples/vae_text/vae_train.py +++ b/examples/vae_text/vae_train.py @@ -55,6 +55,7 @@ config = importlib.import_module(FLAGS.config) + def kl_dvg(means, logvars): """compute the KL divergence between Gaussian distribution """ @@ -93,7 +94,7 @@ def _main(_): os.makedirs(save_dir) suffix = "%s_%sDecoder.ckpt" % \ - (config.dataset, config.decoder_hparams["type"]) + (config.dataset, config.decoder_type) save_path = os.path.join(save_dir, suffix) @@ -102,25 +103,39 @@ def _main(_): (train_data.dataset_size() / config.batch_size)) # Model architecture - encoder_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.vocab.size, hparams=config.enc_emb_hparams) - decoder_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.vocab.size, hparams=config.dec_emb_hparams) - - - input_embed = encoder_embedder(data_batch["text_ids"]) - output_embed = decoder_embedder(data_batch["text_ids"][:, :-1]) - + encoder_w_embedder = tx.modules.WordEmbedder( + vocab_size=train_data.vocab.size, hparams=config.enc_emb_hparams) + input_embed = encoder_w_embedder(data_batch["text_ids"]) encoder = tx.modules.UnidirectionalRNNEncoder( hparams={"rnn_cell": config.enc_cell_hparams}) - if config.decoder_hparams["type"] == "lstm": + decoder_w_embedder = tx.modules.WordEmbedder( + vocab_size=train_data.vocab.size, hparams=config.dec_emb_hparams) + output_w_embed = decoder_w_embedder(data_batch["text_ids"][:, :-1]) + + if config.decoder_type == "lstm": + output_embed = output_w_embed + decoder = tx.modules.BasicRNNDecoder( vocab_size=train_data.vocab.size, hparams={"rnn_cell": config.dec_cell_hparams}) decoder_initial_state_size = decoder.cell.state_size - elif config.decoder_hparams["type"] == 'transformer': + elif config.decoder_type == 'transformer': + # position embedding + decoder_p_embedder = tx.modules.SinusoidsPositionEmbedder( + position_size=config.max_pos, hparams=config.dec_pos_emb_hparams) + batch_size = tf.shape(data_batch["text_ids"])[0] + max_seq_len = tf.shape(data_batch["text_ids"])[1] - 1 + batch_max_seq_len = tf.ones([batch_size], tf.int32) * max_seq_len + output_p_embed = 
decoder_p_embedder(sequence_length=batch_max_seq_len) + + output_w_embed = output_w_embed * config.hidden_size ** 0.5 + output_embed = output_w_embed + output_p_embed + + # decoder decoder = tx.modules.TransformerDecoder( + # tie word embedding with output layer + output_layer=tf.transpose(decoder_w_embedder.embedding, (1, 0)), hparams=config.trans_hparams) decoder_initial_state_size = tf.TensorShape( [1, config.dec_emb_hparams["dim"]]) @@ -133,6 +148,7 @@ def _main(_): connector_stoch = tx.modules.ReparameterizedStochasticConnector( decoder_initial_state_size) + ## encoder -> connector -> decoder _, ecdr_states = encoder( input_embed, @@ -149,7 +165,7 @@ def _main(_): dcdr_states, latent_z = connector_stoch(dst) # decoder - if config.decoder_hparams["type"] == "lstm": + if config.decoder_type == "lstm": # concat latent variable to input at every time step latent_z = tf.expand_dims(latent_z, axis=1) latent_z = tf.tile(latent_z, [1, tf.shape(output_embed)[1], 1]) @@ -224,13 +240,13 @@ def _run_epoch(sess, epoch, mode_string, display=10): fetches_ = sess.run(fetches, feed_dict=feed) - batch_size = len(fetches_["lengths"]) - num_sents += batch_size + batch_size_ = len(fetches_["lengths"]) + num_sents += batch_size_ num_words += sum(fetches_["lengths"]) - nll_ += fetches_["nll"] * batch_size - kl_loss_ += fetches_["kl_loss"] * batch_size - rc_loss_ += fetches_["rc_loss"] * batch_size + nll_ += fetches_["nll"] * batch_size_ + kl_loss_ += fetches_["kl_loss"] * batch_size_ + rc_loss_ += fetches_["rc_loss"] * batch_size_ if step % display == 0 and mode_string == 'train': print('%s: epoch %d, step %d, nll %.4f, klw: %.4f, ' \ @@ -255,7 +271,7 @@ def _run_epoch(sess, epoch, mode_string, display=10): return nll_ / num_sents, np.exp(nll_ / num_words) - def generate(sess, saver, fname=None): + def _generate(sess, saver, fname=None): if tf.train.checkpoint_exists(FLAGS.model): saver.restore(sess, FLAGS.model) else: @@ -269,16 +285,17 @@ def generate(sess, saver, fname=None): dcdr_states, latent_z = connector_stoch(dst) - # to concatenate latent variable to input word embeddings - def _cat_embedder(ids): - embedding = decoder_embedder(ids) - return tf.concat([embedding, latent_z], axis=1) - vocab = train_data.vocab - start_tokens = tf.ones(batch_size, tf.int32) * vocab.bos_token_id; - end_token = vocab.eos_token_id; + start_tokens = tf.ones(batch_size, tf.int32) * vocab.bos_token_id + end_token = vocab.eos_token_id + + if config.decoder_type == "lstm": + def _cat_embedder(ids): + """Concatenates latent variable to input word embeddings + """ + embedding = decoder_w_embedder(ids) + return tf.concat([embedding, latent_z], axis=1) - if config.decoder_hparams["type"] == "lstm": outputs, _, _ = decoder( initial_state=dcdr_states, decoding_strategy="infer_sample", @@ -287,11 +304,16 @@ def _cat_embedder(ids): start_tokens=start_tokens, end_token=end_token) else: + def _embedding_fn(ids, times): + w_embed = decoder_w_embedder(ids) + p_embed = decoder_p_embedder(times) + return w_embed * config.hidden_size ** 0.5 + p_embed + outputs, _ = decoder( memory=dcdr_states, decoding_strategy="infer_sample", memory_sequence_length=tf.ones(tf.shape(dcdr_states)[0]), - embedding=decoder_embedder, + embedding=_embedding_fn, max_decoding_length=100, start_tokens=start_tokens, end_token=end_token) @@ -299,26 +321,29 @@ def _cat_embedder(ids): sample_tokens = vocab.map_ids_to_tokens(outputs.sample_id) sess.run(tf.tables_initializer()) - mode_key = tf.estimator.ModeKeys.EVAL - feed = {tx.global_mode():mode_key} + feed = 
{tx.global_mode(): tf.estimator.ModeKeys.PREDICT} sample_tokens_ = sess.run(sample_tokens, feed_dict=feed) + if fname is None: fh = sys.stdout else: fh = open(fname, 'w', encoding='utf-8') for sent in sample_tokens_: - sent = list(sent) - end_id = sent.index(vocab.eos_token) + sent = tx.utils.compat_as_text(list(sent)) + end_id = len(sent) + if vocab.eos_token in sent: + end_id = sent.index(vocab.eos_token) fh.write(' '.join(sent[:end_id+1]) + '\n') + print('Output done') fh.close() saver = tf.train.Saver() with tf.Session() as sess: # generate samples from prior if FLAGS.mode == "predict": - generate(sess, saver, FLAGS.out) + _generate(sess, saver, FLAGS.out) return sess.run(tf.global_variables_initializer()) diff --git a/texar/modules/embedders/position_embedders.py b/texar/modules/embedders/position_embedders.py index 9f736d28..ed31c887 100644 --- a/texar/modules/embedders/position_embedders.py +++ b/texar/modules/embedders/position_embedders.py @@ -308,8 +308,9 @@ def _build(self, positions=None, sequence_length=None): `[batch_size]`. Time steps beyond the respective sequence lengths will have zero-valued embeddings. + Returns: - A `Tensor` of shape `[batch_size, position_size, dim]`. + A `Tensor` of shape `[batch_size, max_time, dim]`. """ inputs = positions if positions is None:
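
Note (editorial, not part of the patch): the two commits above drop the old `embedding=decoder_embedder.embedding` argument and instead pass the `TransformerDecoder` an explicit embedding function that scales word embeddings by sqrt(dim) and adds sinusoid position embeddings. The minimal sketch below illustrates that pattern in isolation, assuming `import texar as tx` (texar-tf) and toy sizes; the real values come from config_trans_ptb.py / config_trans_yahoo.py, and the decoder call shown in the trailing comment is the one already in the updated vae_train.py.

    import tensorflow as tf
    import texar as tx

    # Toy sizes for illustration only (assumed, not from the patch).
    vocab_size = 10000
    hidden_size = 256
    max_pos = 200   # longest position the sinusoid table must cover

    word_embedder = tx.modules.WordEmbedder(
        vocab_size=vocab_size, hparams={'dim': hidden_size})
    pos_embedder = tx.modules.SinusoidsPositionEmbedder(
        position_size=max_pos, hparams={'dim': hidden_size})

    def _embedding_fn(ids, times):
        # Scale word embeddings by sqrt(dim) and add position embeddings,
        # mirroring the training-time input construction in vae_train.py.
        w_embed = word_embedder(ids)
        p_embed = pos_embedder(times)
        return w_embed * hidden_size ** 0.5 + p_embed

    # At inference time this callable is passed to TransformerDecoder via
    # its `embedding` argument, e.g.:
    #     outputs, _ = decoder(
    #         ...,
    #         decoding_strategy='infer_sample',
    #         embedding=_embedding_fn,
    #         start_tokens=start_tokens,
    #         end_token=end_token,
    #         max_decoding_length=100)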