From 7afafdfd1b7b71ecb1d032cabc70a5a9b2973b38 Mon Sep 17 00:00:00 2001 From: Yaming Yang Date: Sun, 22 Mar 2020 12:07:07 +0000 Subject: [PATCH 01/12] Add evalutaion scripts for TextNAS. --- examples/nas/textnas/dataloader.py | 3 +- examples/nas/textnas/eval_arc.py | 571 ++++++++++++++++++++++++++++ examples/nas/textnas/macro_child.py | 223 +++++++++++ examples/nas/textnas/run_sst2.sh | 69 ++++ examples/nas/textnas/run_sst5.sh | 69 ++++ examples/nas/textnas/train_sst2.sh | 69 ++++ examples/nas/textnas/train_sst5.sh | 69 ++++ 7 files changed, 1072 insertions(+), 1 deletion(-) create mode 100644 examples/nas/textnas/eval_arc.py create mode 100644 examples/nas/textnas/macro_child.py create mode 100755 examples/nas/textnas/run_sst2.sh create mode 100755 examples/nas/textnas/run_sst5.sh create mode 100755 examples/nas/textnas/train_sst2.sh create mode 100755 examples/nas/textnas/train_sst5.sh diff --git a/examples/nas/textnas/dataloader.py b/examples/nas/textnas/dataloader.py index e5a4ed363f..083f1c7413 100644 --- a/examples/nas/textnas/dataloader.py +++ b/examples/nas/textnas/dataloader.py @@ -241,7 +241,8 @@ def init_trainable_embedding(embedding_path, word_id_dict, embed_dim=300): embedding = np.random.random([len(word_id_dict), embed_dim]).astype(np.float32) / 2.0 - 0.25 embedding[0] = np.zeros(embed_dim) # PAD embedding[1] = (np.random.rand(embed_dim) - 0.5) / 2 # UNK - for word, idx in word_id_dict.items(): + for word in sorted(word_id_dict.keys()): + idx = word_id_dict[word] if idx == 0 or idx == 1: continue if word in word_embed_model["mapping"]: diff --git a/examples/nas/textnas/eval_arc.py b/examples/nas/textnas/eval_arc.py new file mode 100644 index 0000000000..63a79d1672 --- /dev/null +++ b/examples/nas/textnas/eval_arc.py @@ -0,0 +1,571 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import pickle +import shutil +import sys +import random +import math + +import time +import datetime +import argparse +import distutils.util + +import numpy as np +import torch +from torch import nn +from torch import optim +from torch.utils.data import DataLoader +import torch.nn.functional as Func + +from macro_child import MacroChild +from dataloader import read_data_sst + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--reset_output_dir", + type=distutils.util.strtobool, + default=True, + help="Whether to clean the output dir if existed. (default: %(default)s)") + parser.add_argument( + "--embedding_model", + type=str, + default="glove", + help="Embedding type. (default: %(default)s)") + parser.add_argument( + "--child_fixed_arc", + type=str, + required=True, + help="Architecture description. (default: %(default)s)") + parser.add_argument( + "--data_path", + type=str, + default="data", + help="Directory containing the dataset and embedding file. (default: %(default)s)") + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="The output directory. (default: %(default)s)") + parser.add_argument( + "--child_lr_decay_scheme", + type=str, + default="cosine", + help="Learning rate annealing strategy, only 'cosine' supported. (default: %(default)s)") + parser.add_argument( + "--batch_size", + type=int, + default=128, + help="Number of samples each batch for training. (default: %(default)s)") + parser.add_argument( + "--eval_batch_size", + type=int, + default=128, + help="Number of samples each batch for evaluation. 
(default: %(default)s)") + parser.add_argument( + "--class_num", + type=int, + default=5, + help="The number of categories. (default: %(default)s)") + parser.add_argument( + "--global_seed", + type=int, + default=1234, + help="Seed for reproduction. (default: %(default)s)") + parser.add_argument( + "--max_input_length", + type=int, + default=64, + help="The maximum length of the sentence. (default: %(default)s)") + parser.add_argument( + "--num_epochs", + type=int, + default=10, + help="The number of training epochs. (default: %(default)s)") + parser.add_argument( + "--child_num_layers", + type=int, + default=24, + help="The layer number of the architecture. (default: %(default)s)") + parser.add_argument( + "--child_out_filters", + type=int, + default=256, + help="The dimension of hidden states. (default: %(default)s)") + parser.add_argument( + "--child_out_filters_scale", + type=int, + default=1, + help="The scale of hidden state dimension. (default: %(default)s)") + parser.add_argument( + "--child_lr_T_0", + type=int, + default=10, + help="The length of one cycle. (default: %(default)s)") + parser.add_argument( + "--child_lr_T_mul", + type=int, + default=2, + help="The multiplication factor per cycle. (default: %(default)s)") + parser.add_argument( + "--min_count", + type=int, + default=1, + help="The threshold to cut off low frequent words. (default: %(default)s)") + parser.add_argument( + "--num_last_layer_output", + type=int, + default=0, + help="The last n layers as output, 0 for all. (default: %(default)s)") + parser.add_argument( + "--train_ratio", + type=float, + default=1.0, + help="The sample ratio for the training set. (default: %(default)s)") + parser.add_argument( + "--valid_ratio", + type=float, + default=1.0, + help="The sample ratio for the dev set. (default: %(default)s)") + parser.add_argument( + "--child_grad_bound", + type=float, + default=5.0, + help="The threshold for gradient clipping. (default: %(default)s)") + parser.add_argument( + "--child_lr", + type=float, + default=0.02, + help="The initial learning rate. (default: %(default)s)") + parser.add_argument( + "--cnn_keep_prob", + type=float, + default=0.8, + help="Keep prob for cnn layer. (default: %(default)s)") + parser.add_argument( + "--final_output_keep_prob", + type=float, + default=1.0, + help="Keep prob for the last output layer. (default: %(default)s)") + parser.add_argument( + "--lstm_out_keep_prob", + type=float, + default=0.8, + help="Keep prob for the RNN layer. (default: %(default)s)") + parser.add_argument( + "--embed_keep_prob", + type=float, + default=0.8, + help="Keep prob for the embedding layer. (default: %(default)s)") + parser.add_argument( + "--attention_keep_prob", + type=float, + default=0.8, + help="Keep prob for the self-attention layer. (default: %(default)s)") + parser.add_argument( + "--child_l2_reg", + type=float, + default=3e-6, + help="Weight decay factor. (default: %(default)s)") + parser.add_argument( + "--child_lr_max", + type=float, + default=0.002, + help="The max learning rate. (default: %(default)s)") + parser.add_argument( + "--child_lr_min", + type=float, + default=0.001, + help="The min learning rate. (default: %(default)s)") + parser.add_argument( + "--child_optim_algo", + type=str, + default="adam", + help="Optimization algorithm. (default: %(default)s)") + parser.add_argument( + "--checkpoint_dir", + type=str, + default="best_checkpoint", + help="Path for saved checkpoints. 
(default: %(default)s)") + parser.add_argument( + "--output_type", + type=str, + default="avg_pool", + help="Opertor type for the time steps reduction. (default: %(default)s)") + parser.add_argument( + "--multi_path", + type=distutils.util.strtobool, + default=False, + help="Search for multiple path in the architecture. (default: %(default)s)") + parser.add_argument( + "--is_binary", + type=distutils.util.strtobool, + default=False, + help="Binary label for sst dataset. (default: %(default)s)") + parser.add_argument( + "--all_layer_output", + type=distutils.util.strtobool, + default=True, + help="Use all layers as output. (default: %(default)s)") + parser.add_argument( + "--output_linear_combine", + type=distutils.util.strtobool, + default=True, + help="Combine all the layers in linear way. (default: %(default)s)") + parser.add_argument( + "--is_mask", + type=distutils.util.strtobool, + default=True, + help="Apply mask. (default: %(default)s)") + parser.add_argument( + "--fixed_seed", + type=distutils.util.strtobool, + default=True, + help="Fix the seed. (default: %(default)s)") + parser.add_argument( + "--load_checkpoint", + type=distutils.util.strtobool, + default=False, + help="Wether to load checkpoint. (default: %(default)s)") + parser.add_argument( + "--log_every", + type=int, + default=50, + help="How many steps to log. (default: %(default)s)") + parser.add_argument( + "--eval_every_epochs", + type=int, + default=1, + help="How many epochs to eval. (default: %(default)s)") + + global FLAGS + + FLAGS = parser.parse_args() + + +def set_random_seed(seed): + print("-" * 80) + print("set random seed for data reading: {}".format(seed)) + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + + +def get_model(embedding, num_layers): + print("num layers: {0}".format(num_layers)) + assert FLAGS.child_fixed_arc is not None, "Architecture should be provided." 
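+    # The architecture arrives as one flat string of space-separated
+    # integers -- the same string the run scripts accumulate in $fixed_arc.
+    # A sketch of the layout, as read from MacroChild: with multi_path on,
+    # layer i owns (i + 2) integers: an input choice, an operator id, and
+    # i skip-connection bits. Input choice k picks the (k + 1)-th most
+    # recent layer (the stem convolution for layer 0).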
+ + child_model = MacroChild( + embedding=embedding, + fixed_arc=FLAGS.child_fixed_arc, + out_filters_scale=FLAGS.child_out_filters_scale, + num_layers=num_layers, + out_filters=FLAGS.child_out_filters, + cnn_keep_prob=FLAGS.cnn_keep_prob, + final_output_keep_prob=FLAGS.final_output_keep_prob, + lstm_out_keep_prob=FLAGS.lstm_out_keep_prob, + embed_keep_prob=FLAGS.embed_keep_prob, + attention_keep_prob=FLAGS.attention_keep_prob, + multi_path=FLAGS.multi_path, + embedding_model=FLAGS.embedding_model, + all_layer_output=FLAGS.all_layer_output, + output_linear_combine=FLAGS.output_linear_combine, + num_last_layer_output=FLAGS.num_last_layer_output, + is_mask=FLAGS.is_mask, + output_type=FLAGS.output_type, + class_num=FLAGS.class_num) + + return child_model + + +def print_arc(arc, num_layers): + start = 0 + for i in range(0, num_layers): + end = start + i + 1 + if FLAGS.multi_path: + end += 1 + out_str = "fixed_arc=\"$fixed_arc {0}\"".format(np.reshape(arc[start: end], [-1])) + out_str = out_str.replace("[", "").replace("]", "") + print(out_str) + + start = end + + +def eval_once(child_model, eval_set, criterion, valid_dataloader=None, test_dataloader=None): + if eval_set == "test": + assert test_dataloader is not None + dataloader = test_dataloader + elif eval_set == "valid": + assert valid_dataloader is not None + dataloader = valid_dataloader + else: + raise NotImplementedError("Unknown eval_set '{}'".format(eval_set)) + + tot_acc = 0 + tot = 0 + losses = [] + + with torch.no_grad(): # save memory + for batch in dataloader: + (sent_ids, mask), labels = batch + + sent_ids = sent_ids.cuda() + mask = mask.cuda() + labels = labels.cuda() + + logits = child_model(sent_ids, mask) # run + + loss = criterion(logits, labels.long()) + loss = loss.mean() + preds = logits.argmax(dim=1).long() + acc = torch.eq(preds, labels.long()).long().sum().item() + + losses.append(loss) + tot_acc += acc + tot += len(labels) + + losses = torch.tensor(losses) + loss = losses.mean() + if tot > 0: + final_acc = float(tot_acc) / tot + else: + final_acc = 0 + print("Error in calculating final_acc") + return final_acc, loss + + +def print_user_flags(FLAGS, line_limit=80): + print("-" * 80) + + log_strings = "" + for flag_name in sorted(vars(FLAGS)): + value = "{}".format(getattr(FLAGS, flag_name)) + log_string = flag_name + log_string += "." 
* (line_limit - len(flag_name) - len(value)) + log_string += value + log_strings = log_strings + log_string + log_strings = log_strings + "\n" + print(log_strings) + + +def count_model_params(trainable_params): + num_vars = 0 + for var in trainable_params: + num_vars += np.prod([dim for dim in var.size()]) + return num_vars + + +def update_lr( + optimizer, + epoch, + l2_reg=1e-4, + lr_warmup_val=None, + lr_init=0.1, + lr_decay_scheme="cosine", + lr_max=0.002, + lr_min=0.000000001, + lr_T_0=4, + lr_T_mul=1, + sync_replicas=False, + num_aggregate=None, + num_replicas=None): + if lr_decay_scheme == "cosine": + assert lr_max is not None, "Need lr_max to use lr_cosine" + assert lr_min is not None, "Need lr_min to use lr_cosine" + assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine" + assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine" + + T_i = lr_T_0 + t_epoch = epoch + last_reset = 0 + while True: + t_epoch -= T_i + if t_epoch < 0: + break + last_reset += T_i + T_i *= lr_T_mul + + T_curr = epoch - last_reset + + def _update(): + rate = T_curr / T_i * 3.1415926 + lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + math.cos(rate)) + return lr + + learning_rate = _update() + else: + raise ValueError("Unknown learning rate decay scheme {}".format(lr_decay_scheme)) + + #update lr in optimizer + for params_group in optimizer.param_groups: + params_group['lr'] = learning_rate + return learning_rate + +def train(data_path, output_dir, num_layers): + print("Build dataloader") + train_dataset, valid_dataset, test_dataset, embedding = \ + read_data_sst(data_path, + FLAGS.max_input_length, + FLAGS.min_count, + train_ratio=FLAGS.train_ratio, + valid_ratio=FLAGS.valid_ratio, + is_binary=FLAGS.is_binary) + train_dataloader = DataLoader(train_dataset, batch_size=FLAGS.batch_size, shuffle=True, pin_memory=True) + test_dataloader = DataLoader(test_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True) + valid_dataloader = DataLoader(valid_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True) + + print("Build model") + print("-" * 80) + child_model = get_model(embedding, num_layers) + print("Finish build model") + + for name, var in child_model.named_parameters(): + print(name, var.size(), var.requires_grad) # output all params + + num_vars = count_model_params(child_model.parameters()) + print("Model has {} params".format(num_vars)) + + for m in child_model.modules(): # initializer + if isinstance(m, (nn.Conv1d, nn.Linear)): + nn.init.xavier_uniform_(m.weight) + + criterion = nn.CrossEntropyLoss() + + # get optimizer + if FLAGS.child_optim_algo == "adam": + optimizer = optim.Adam(child_model.parameters(), eps=1e-3, weight_decay=FLAGS.child_l2_reg) # with L2 + else: + raise ValueError("Unknown optim_algo {}".format(optim_algo)) + + child_model.cuda() + criterion.cuda() + + fixed_arc = np.array([int(x) for x in FLAGS.child_fixed_arc.split(" ") if x]) + print_arc(fixed_arc, num_layers) + + print("Start training") + print("-" * 80) + start_time = time.time() + step = 0 + + # save path + model_save_path = os.path.join(FLAGS.output_dir, "model.pth") + best_model_save_path = os.path.join(FLAGS.output_dir, "best_model.pth") + best_acc = 0 + start_epoch = 0 + if FLAGS.load_checkpoint: + if os.path.isfile(model_save_path): + checkpoint = torch.load(model_save_path, map_location = torch.device('cpu')) + step = checkpoint['step'] + start_epoch = checkpoint['epoch'] + child_model.load_state_dict(checkpoint['child_model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + + 
for epoch in range(start_epoch, FLAGS.num_epochs): + lr = update_lr(optimizer, + epoch, + l2_reg=FLAGS.child_l2_reg, + lr_warmup_val=None, + lr_init=FLAGS.child_lr, + lr_decay_scheme=FLAGS.child_lr_decay_scheme, + lr_max=FLAGS.child_lr_max, + lr_min=FLAGS.child_lr_min, + lr_T_0=FLAGS.child_lr_T_0, + lr_T_mul=FLAGS.child_lr_T_mul) + child_model.train() + for batch in train_dataloader: + (sent_ids, mask), labels = batch + + sent_ids = sent_ids.cuda() + mask = mask.cuda() + labels = labels.cuda() + + step += 1 + + logits = child_model(sent_ids, mask) # run + + loss = criterion(logits, labels.long()) + loss = loss.mean() + preds = logits.argmax(dim=1).long() + acc = torch.eq(preds, labels.long()).long().sum().item() + + optimizer.zero_grad() + loss.backward() + grad_norm = 0 + trainable_params = child_model.parameters() + + assert FLAGS.child_grad_bound is not None, "Need grad_bound to clip gradients." + # compute the gradient norm value + grad_norm = nn.utils.clip_grad_norm_(trainable_params, 99999999) + for param in trainable_params: + nn.utils.clip_grad_norm_(param, grad_bound) #clip grad + + optimizer.step() + + if step % FLAGS.log_every == 0: + curr_time = time.time() + log_string = "" + log_string += "epoch={:<6d}".format(epoch) + log_string += "ch_step={:<6d}".format(step) + log_string += " loss={:<8.6f}".format(loss) + log_string += " lr={:<8.4f}".format(lr) + log_string += " |g|={:<8.4f}".format(grad_norm) + log_string += " tr_acc={:<3d}/{:>3d}".format(acc, logits.size()[0]) + log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60) + print(log_string) + epoch += 1 + save_state = { + 'step' : step, + 'epoch' : epoch, + 'child_model_state_dict' : child_model.state_dict(), + 'optimizer_state_dict' : optimizer.state_dict()} + torch.save(save_state, model_save_path) + child_model.eval() + print("Epoch {}: Eval".format(epoch)) + eval_acc, eval_loss = eval_once(child_model, "test", criterion, test_dataloader=test_dataloader) + print("ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss)) + if eval_acc > best_acc: + best_acc = eval_acc + print("Save best model") + save_state = { + 'step' : step, + 'epoch' : epoch, + 'child_model_state_dict' : child_model.state_dict(), + 'optimizer_state_dict' : optimizer.state_dict()} + torch.save(save_state, best_model_save_path) + + return eval_acc + + +def main(): + parse_args() + print("-" * 80) + if not os.path.isdir(FLAGS.output_dir): + print("Path {} does not exist. Creating.".format(FLAGS.output_dir)) + os.makedirs(FLAGS.output_dir) + elif FLAGS.reset_output_dir: + print("Path {} exists. Remove and remake.".format(FLAGS.output_dir)) + shutil.rmtree(FLAGS.output_dir, ignore_errors=True) + os.makedirs(FLAGS.output_dir) + print("-" * 80) + log_file = os.path.join(FLAGS.output_dir, "stdout") + print("Logging to {}".format(log_file)) + + print_user_flags(FLAGS) + + if FLAGS.fixed_seed: + set_random_seed(FLAGS.global_seed) + + train(FLAGS.data_path, FLAGS.output_dir, FLAGS.child_num_layers) + + +if __name__ == "__main__": + main() diff --git a/examples/nas/textnas/macro_child.py b/examples/nas/textnas/macro_child.py new file mode 100644 index 0000000000..a74f24f5bc --- /dev/null +++ b/examples/nas/textnas/macro_child.py @@ -0,0 +1,223 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
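+
+# Operator ids referenced by the flat architecture encoding (see
+# make_fixed_layer below): 0-3 select ConvBN with kernel size 1/3/5/7,
+# 4 average pooling, 5 max pooling, 6 the RNN op from ops.py, and
+# 7 self-attention. Per layer the encoding stores an input choice (only
+# when multi_path is on), the operator id, and one skip bit per earlier
+# layer; a skip bit of 1 adds that layer as a residual input, and any
+# layer that sums more than one input is followed by an extra BatchNorm
+# (see run_fixed_layer).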
+ +import os +import sys + +import numpy as np +import torch +from torch import nn +import torch.nn.functional as Func + +from ops import * +from utils import GlobalAvgPool, GlobalMaxPool + +class MacroChild(nn.Module): + def __init__(self, + embedding, + fixed_arc=None, + out_filters_scale=1, + num_layers=2, + out_filters=24, + cnn_keep_prob=1.0, + final_output_keep_prob=1.0, + lstm_out_keep_prob=1.0, + embed_keep_prob=1.0, + attention_keep_prob=1.0, + multi_path=False, + embedding_model="none", + all_layer_output=False, + output_linear_combine=False, + num_last_layer_output=0, + is_mask=False, + output_type="avg_pool", + class_num=5, + *args, + **kwargs): + super(MacroChild, self).__init__() + + self.fixed_arc = fixed_arc + self.all_layer_output = all_layer_output + self.output_linear_combine = output_linear_combine + self.num_last_layer_output = max(num_last_layer_output, 0) + self.is_mask = is_mask + self.output_type = output_type + self.multi_path = multi_path + self.embedding_model = embedding_model + self.out_filters = out_filters * out_filters_scale + self.num_layers = num_layers + self.class_num = class_num + self.cnn_keep_prob = cnn_keep_prob + self.final_output_keep_prob = final_output_keep_prob + self.lstm_out_keep_prob = lstm_out_keep_prob + self.embed_keep_prob = embed_keep_prob + self.attention_keep_prob = attention_keep_prob + + fixed_arc = np.array([int(x) for x in self.fixed_arc.split(" ") if x]) + self.sample_arc = fixed_arc + + layers = [] + + out_filters = self.out_filters + if self.embedding_model == "glove": + self.embedding = nn.Parameter(embedding) + else: + raise NotImplementedError("Unknown embedding_model '{}'".format(embedding_model)) + + self.init_conv = ConvBN(1, self.embedding.size()[1], out_filters, cnn_keep_prob, False, True) + + for layer_id in range(self.num_layers): + layers.append(self.make_fixed_layer(layer_id, out_filters)) + self.layers = nn.ModuleList(layers) + + if self.all_layer_output and self.output_linear_combine: # use linear_combine + self._linear_combine = LinearCombine(self.num_layers) + self.linear_out = nn.Linear(out_filters, self.class_num) + + self.embed_dropout= nn.Dropout(p=(1 - embed_keep_prob)) + self.output_dropout= nn.Dropout(p=(1 - final_output_keep_prob)) + + if self.output_type == "avg_pool": + self.output_pool = GlobalAvgPool() + elif self.output_type == "max_pool": + self.output_pool = GlobalMaxPool() + else: + raise ValueError("Unsupported output type.") + + def forward(self, sent_ids, mask): + seq = Func.embedding(sent_ids.long(), self.embedding) + seq = self.embed_dropout(seq) + + seq = torch.transpose(seq, 1, 2) # from (N, L, C) -> (N, C, L) + + x = self.init_conv(seq, mask) + + start_idx = 0 + prev_layers = [] + final_flags = [] + + for layer_id in range(self.num_layers): # run layers + layer = self.layers[layer_id] + x = self.run_fixed_layer(x, mask, prev_layers, layer, layer_id, start_idx, + final_flags=final_flags) # run needed branches + prev_layers.append(x) + final_flags.append(1) + + start_idx += 1 + layer_id + if self.multi_path: + start_idx += 1 + + final_layers = [] + final_layers_idx = [] + for i in range(0, len(prev_layers)): + if self.all_layer_output: + if self.num_last_layer_output == 0: + final_layers.append(prev_layers[i]) + final_layers_idx.append(i) + elif i >= max((len(prev_layers) - self.num_last_layer_output), 0): + final_layers.append(prev_layers[i]) + final_layers_idx.append(i) + else: + final_layers.append(final_flags[i] * prev_layers[i]) + + if self.all_layer_output and 
self.output_linear_combine: # all layer ooutput and use linear_combine + x = self._linear_combine(torch.stack(final_layers)) + else: + x = sum(final_layers) + if not self.all_layer_output: + x /= sum(final_flags) + else: + x /= len(final_layers) + + x = self.output_pool(x, mask) + x = self.output_dropout(x) + x = self.linear_out(x) + return x + + def make_fixed_layer(self, layer_id, out_filters): + size = [1, 3, 5, 7] + separables = [False, False, False, False] + + branches = [] + + if self.multi_path: + branch_id = (layer_id + 1) * (layer_id + 2) // 2 + else: + branch_id = (layer_id) * (layer_id + 1) // 2 + + bn_flag = False + for i in range(layer_id): + if self.sample_arc[branch_id + 1 + i] == 1: + bn_flag = True + branch_id = self.sample_arc[branch_id] + + for operation_id in [0, 1, 2, 3]: # conv_opt + if branch_id == operation_id: + filter_size = size[operation_id] + separable = separables[operation_id] + op = ConvBN(filter_size, out_filters, out_filters, self.cnn_keep_prob, False, True) + branches.append(op) + if branch_id == 4: + branches.append(AvgPool(3, False, True)) + elif branch_id == 5: + branches.append(MaxPool(3, False, True)) + elif branch_id == 6: + branches.append(RNN(out_filters, self.lstm_out_keep_prob)) + elif branch_id == 7: + branches.append(Attention(out_filters, 4, self.attention_keep_prob, self.is_mask)) + + branches = nn.ModuleList(branches) + bn = None + if bn_flag: + bn = BatchNorm(self.out_filters, False, True) + + return nn.ModuleList([branches, bn]) + + def run_fixed_layer(self, x, mask, prev_layers, layers, layer_id, start_idx, final_flags): + layer = layers[0] + bn = layers[1] + + if len(prev_layers) > 0: + if self.multi_path: + pre_layer_id = self.sample_arc[start_idx] + num_pre_layers = len(prev_layers) + if num_pre_layers > 5: + num_pre_layers = 5 + if pre_layer_id >= num_pre_layers: + final_flags[-1] = 0 + inputs = prev_layers[-1] + else: + layer_idx = len(prev_layers) - 1 - pre_layer_id + final_flags[layer_idx] = 0 + inputs = prev_layers[layer_idx] + else: + inputs = prev_layers[-1] + final_flags[-1] = 0 + else: + inputs = x + + if self.multi_path: + start_idx += 1 + + branches = [] + # run branch op + branch_id = 0 + branches.append(layer[branch_id](inputs, mask)) + + if layer_id == 0: + out = sum(branches) + else: + skip_start = start_idx + 1 + skip = self.sample_arc[skip_start:skip_start + layer_id] + + res_layers = [] + for i in range(layer_id): + if skip[i] == 1: + res_layers.append(prev_layers[i]) + final_flags[i] = 0 + prev = branches + res_layers + out = sum(prev) # tensor sum + if len(prev) > 1: + out = bn(out, mask) + + return out diff --git a/examples/nas/textnas/run_sst2.sh b/examples/nas/textnas/run_sst2.sh new file mode 100755 index 0000000000..6918f7a099 --- /dev/null +++ b/examples/nas/textnas/run_sst2.sh @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
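+#
+# Each fixed_arc line below encodes one of the 24 layers, following the
+# scheme in macro_child.py: an input choice, then an operator id (0-3 conv
+# with kernel 1/3/5/7, 4 avg pool, 5 max pool, 6 RNN, 7 attention), then
+# one skip bit per earlier layer. For instance, "3 5 0 1 0 1 1 0 1 1"
+# makes layer 8 max-pool (op 5) the output of layer 4 (input choice 3 =
+# the 4th most recent layer), with residual inputs where a bit is 1.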
+ +export PYTHONPATH="$(pwd)" +export CUDA_VISIBLE_DEVICES=0 + +fixed_arc="$fixed_arc 0 5" +fixed_arc="$fixed_arc 1 7 0" +fixed_arc="$fixed_arc 1 3 0 0" +fixed_arc="$fixed_arc 3 6 0 1 1" +fixed_arc="$fixed_arc 1 1 0 1 0 0" +fixed_arc="$fixed_arc 0 1 0 0 1 0 1" +fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" +fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" +fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" +fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" +fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" +fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" +fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" +fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" +fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" +fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" +fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" +fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" +fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" +fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" +fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" +fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" + +python eval_arc.py \ + --train_ratio=1.0 \ + --valid_ratio=1.0 \ + --min_count=1 \ + --is_mask=True \ + --is_binary=True \ + --embedding_model="glove" \ + --child_lr_decay_scheme="cosine" \ + --data_path="data" \ + --class_num=2 \ + --child_optim_algo="adam" \ + --output_dir="output_sst2" \ + --global_seed=1234 \ + --max_input_length=64 \ + --batch_size=128 \ + --eval_batch_size=128 \ + --num_epochs=10 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_num_layers=24 \ + --child_out_filters=256 \ + --child_l2_reg=1e-6 \ + --cnn_keep_prob=0.8 \ + --final_output_keep_prob=1.0 \ + --embed_keep_prob=0.8 \ + --lstm_out_keep_prob=0.8 \ + --attention_keep_prob=0.8 \ + --child_lr=0.02 \ + --child_lr_max=0.002 \ + --child_lr_min=5e-6 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --multi_path=True \ + --child_fixed_arc="${fixed_arc}" \ + --fixed_seed=True \ + --all_layer_output=True \ + --output_linear_combine=True \ + "$@" diff --git a/examples/nas/textnas/run_sst5.sh b/examples/nas/textnas/run_sst5.sh new file mode 100755 index 0000000000..b26df14b9b --- /dev/null +++ b/examples/nas/textnas/run_sst5.sh @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
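+#
+# SST-5 is the five-way fine-grained task, hence --class_num=5 below.
+# Judging from the flag's help text in eval_arc.py, --is_binary=True
+# selects the binarized SST labels, so it presumably ought to be False
+# for the five-class setup (the train_sst5.sh script is changed exactly
+# this way later in the series).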
+ +export PYTHONPATH="$(pwd)" +export CUDA_VISIBLE_DEVICES=0 + +fixed_arc="$fixed_arc 0 5" +fixed_arc="$fixed_arc 1 7 0" +fixed_arc="$fixed_arc 1 3 0 0" +fixed_arc="$fixed_arc 3 6 0 1 1" +fixed_arc="$fixed_arc 1 1 0 1 0 0" +fixed_arc="$fixed_arc 0 1 0 0 1 0 1" +fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" +fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" +fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" +fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" +fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" +fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" +fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" +fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" +fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" +fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" +fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" +fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" +fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" +fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" +fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" +fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" + +python eval_arc.py \ + --train_ratio=1.0 \ + --valid_ratio=1.0 \ + --min_count=1 \ + --is_mask=True \ + --is_binary=True \ + --embedding_model="glove" \ + --child_lr_decay_scheme="cosine" \ + --data_path="data" \ + --class_num=5 \ + --child_optim_algo="adam" \ + --output_dir="output_sst5" \ + --global_seed=1234 \ + --max_input_length=64 \ + --batch_size=256 \ + --eval_batch_size=128 \ + --num_epochs=10 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_num_layers=24 \ + --child_out_filters=256 \ + --child_l2_reg=1e-6 \ + --cnn_keep_prob=0.8 \ + --final_output_keep_prob=1.0 \ + --embed_keep_prob=0.8 \ + --lstm_out_keep_prob=0.8 \ + --attention_keep_prob=0.8 \ + --child_lr=0.02 \ + --child_lr_max=0.002 \ + --child_lr_min=0.0002 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --multi_path=True \ + --child_fixed_arc="${fixed_arc}" \ + --fixed_seed=True \ + --all_layer_output=True \ + --output_linear_combine=True \ + "$@" diff --git a/examples/nas/textnas/train_sst2.sh b/examples/nas/textnas/train_sst2.sh new file mode 100755 index 0000000000..6918f7a099 --- /dev/null +++ b/examples/nas/textnas/train_sst2.sh @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
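+#
+# The trailing "$@" forwards any extra command-line arguments to
+# eval_arc.py, so individual flags can be overridden without editing the
+# script (argparse keeps the last occurrence of a repeated flag). A
+# hypothetical invocation, values purely illustrative:
+#
+#   ./train_sst2.sh --num_epochs=20 --output_dir=output_sst2_long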
+ +export PYTHONPATH="$(pwd)" +export CUDA_VISIBLE_DEVICES=0 + +fixed_arc="$fixed_arc 0 5" +fixed_arc="$fixed_arc 1 7 0" +fixed_arc="$fixed_arc 1 3 0 0" +fixed_arc="$fixed_arc 3 6 0 1 1" +fixed_arc="$fixed_arc 1 1 0 1 0 0" +fixed_arc="$fixed_arc 0 1 0 0 1 0 1" +fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" +fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" +fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" +fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" +fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" +fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" +fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" +fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" +fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" +fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" +fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" +fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" +fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" +fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" +fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" +fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" + +python eval_arc.py \ + --train_ratio=1.0 \ + --valid_ratio=1.0 \ + --min_count=1 \ + --is_mask=True \ + --is_binary=True \ + --embedding_model="glove" \ + --child_lr_decay_scheme="cosine" \ + --data_path="data" \ + --class_num=2 \ + --child_optim_algo="adam" \ + --output_dir="output_sst2" \ + --global_seed=1234 \ + --max_input_length=64 \ + --batch_size=128 \ + --eval_batch_size=128 \ + --num_epochs=10 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_num_layers=24 \ + --child_out_filters=256 \ + --child_l2_reg=1e-6 \ + --cnn_keep_prob=0.8 \ + --final_output_keep_prob=1.0 \ + --embed_keep_prob=0.8 \ + --lstm_out_keep_prob=0.8 \ + --attention_keep_prob=0.8 \ + --child_lr=0.02 \ + --child_lr_max=0.002 \ + --child_lr_min=5e-6 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --multi_path=True \ + --child_fixed_arc="${fixed_arc}" \ + --fixed_seed=True \ + --all_layer_output=True \ + --output_linear_combine=True \ + "$@" diff --git a/examples/nas/textnas/train_sst5.sh b/examples/nas/textnas/train_sst5.sh new file mode 100755 index 0000000000..b26df14b9b --- /dev/null +++ b/examples/nas/textnas/train_sst5.sh @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +export PYTHONPATH="$(pwd)" +export CUDA_VISIBLE_DEVICES=0 + +fixed_arc="$fixed_arc 0 5" +fixed_arc="$fixed_arc 1 7 0" +fixed_arc="$fixed_arc 1 3 0 0" +fixed_arc="$fixed_arc 3 6 0 1 1" +fixed_arc="$fixed_arc 1 1 0 1 0 0" +fixed_arc="$fixed_arc 0 1 0 0 1 0 1" +fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" +fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" +fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" +fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" +fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" +fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" +fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" +fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" +fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" +fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" +fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" +fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" +fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" +fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" +fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" +fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" + +python eval_arc.py \ + --train_ratio=1.0 \ + --valid_ratio=1.0 \ + --min_count=1 \ + --is_mask=True \ + --is_binary=True \ + --embedding_model="glove" \ + --child_lr_decay_scheme="cosine" \ + --data_path="data" \ + --class_num=5 \ + --child_optim_algo="adam" \ + --output_dir="output_sst5" \ + --global_seed=1234 \ + --max_input_length=64 \ + --batch_size=256 \ + --eval_batch_size=128 \ + --num_epochs=10 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_num_layers=24 \ + --child_out_filters=256 \ + --child_l2_reg=1e-6 \ + --cnn_keep_prob=0.8 \ + --final_output_keep_prob=1.0 \ + --embed_keep_prob=0.8 \ + --lstm_out_keep_prob=0.8 \ + --attention_keep_prob=0.8 \ + --child_lr=0.02 \ + --child_lr_max=0.002 \ + --child_lr_min=0.0002 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --multi_path=True \ + --child_fixed_arc="${fixed_arc}" \ + --fixed_seed=True \ + --all_layer_output=True \ + --output_linear_combine=True \ + "$@" From 931ca9254b3237fbf47288f32e71dfca66ef10ac Mon Sep 17 00:00:00 2001 From: Yaming Yang Date: Sun, 22 Mar 2020 12:13:05 +0000 Subject: [PATCH 02/12] Remove duplicated run_sst2.sh and run_sst5.sh. --- examples/nas/textnas/run_sst2.sh | 69 -------------------------------- examples/nas/textnas/run_sst5.sh | 69 -------------------------------- 2 files changed, 138 deletions(-) delete mode 100755 examples/nas/textnas/run_sst2.sh delete mode 100755 examples/nas/textnas/run_sst5.sh diff --git a/examples/nas/textnas/run_sst2.sh b/examples/nas/textnas/run_sst2.sh deleted file mode 100755 index 6918f7a099..0000000000 --- a/examples/nas/textnas/run_sst2.sh +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -export PYTHONPATH="$(pwd)" -export CUDA_VISIBLE_DEVICES=0 - -fixed_arc="$fixed_arc 0 5" -fixed_arc="$fixed_arc 1 7 0" -fixed_arc="$fixed_arc 1 3 0 0" -fixed_arc="$fixed_arc 3 6 0 1 1" -fixed_arc="$fixed_arc 1 1 0 1 0 0" -fixed_arc="$fixed_arc 0 1 0 0 1 0 1" -fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" -fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" -fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" -fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" -fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" -fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" -fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" -fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" -fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" -fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" -fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" -fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" -fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" -fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" -fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" -fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" -fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" -fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" - -python eval_arc.py \ - --train_ratio=1.0 \ - --valid_ratio=1.0 \ - --min_count=1 \ - --is_mask=True \ - --is_binary=True \ - --embedding_model="glove" \ - --child_lr_decay_scheme="cosine" \ - --data_path="data" \ - --class_num=2 \ - --child_optim_algo="adam" \ - --output_dir="output_sst2" \ - --global_seed=1234 \ - --max_input_length=64 \ - --batch_size=128 \ - --eval_batch_size=128 \ - --num_epochs=10 \ - --log_every=50 \ - --eval_every_epochs=1 \ - --child_num_layers=24 \ - --child_out_filters=256 \ - --child_l2_reg=1e-6 \ - --cnn_keep_prob=0.8 \ - --final_output_keep_prob=1.0 \ - --embed_keep_prob=0.8 \ - --lstm_out_keep_prob=0.8 \ - --attention_keep_prob=0.8 \ - --child_lr=0.02 \ - --child_lr_max=0.002 \ - --child_lr_min=5e-6 \ - --child_lr_T_0=10 \ - --child_lr_T_mul=2 \ - --multi_path=True \ - --child_fixed_arc="${fixed_arc}" \ - --fixed_seed=True \ - --all_layer_output=True \ - --output_linear_combine=True \ - "$@" diff --git a/examples/nas/textnas/run_sst5.sh b/examples/nas/textnas/run_sst5.sh deleted file mode 100755 index b26df14b9b..0000000000 --- a/examples/nas/textnas/run_sst5.sh +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -export PYTHONPATH="$(pwd)" -export CUDA_VISIBLE_DEVICES=0 - -fixed_arc="$fixed_arc 0 5" -fixed_arc="$fixed_arc 1 7 0" -fixed_arc="$fixed_arc 1 3 0 0" -fixed_arc="$fixed_arc 3 6 0 1 1" -fixed_arc="$fixed_arc 1 1 0 1 0 0" -fixed_arc="$fixed_arc 0 1 0 0 1 0 1" -fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" -fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" -fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" -fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" -fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" -fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" -fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" -fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" -fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" -fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" -fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" -fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" -fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" -fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" -fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" -fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" -fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" -fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" - -python eval_arc.py \ - --train_ratio=1.0 \ - --valid_ratio=1.0 \ - --min_count=1 \ - --is_mask=True \ - --is_binary=True \ - --embedding_model="glove" \ - --child_lr_decay_scheme="cosine" \ - --data_path="data" \ - --class_num=5 \ - --child_optim_algo="adam" \ - --output_dir="output_sst5" \ - --global_seed=1234 \ - --max_input_length=64 \ - --batch_size=256 \ - --eval_batch_size=128 \ - --num_epochs=10 \ - --log_every=50 \ - --eval_every_epochs=1 \ - --child_num_layers=24 \ - --child_out_filters=256 \ - --child_l2_reg=1e-6 \ - --cnn_keep_prob=0.8 \ - --final_output_keep_prob=1.0 \ - --embed_keep_prob=0.8 \ - --lstm_out_keep_prob=0.8 \ - --attention_keep_prob=0.8 \ - --child_lr=0.02 \ - --child_lr_max=0.002 \ - --child_lr_min=0.0002 \ - --child_lr_T_0=10 \ - --child_lr_T_mul=2 \ - --multi_path=True \ - --child_fixed_arc="${fixed_arc}" \ - --fixed_seed=True \ - --all_layer_output=True \ - --output_linear_combine=True \ - "$@" From 5160ac0c9e90f43f2895460331866c92b81d4d32 Mon Sep 17 00:00:00 2001 From: Yaming Yang Date: Sun, 22 Mar 2020 12:41:32 +0000 Subject: [PATCH 03/12] Minor fixes. 
---
 examples/nas/textnas/train_sst2.sh | 2 +-
 examples/nas/textnas/train_sst5.sh | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/nas/textnas/train_sst2.sh b/examples/nas/textnas/train_sst2.sh
index 6918f7a099..c3f24a2be7 100755
--- a/examples/nas/textnas/train_sst2.sh
+++ b/examples/nas/textnas/train_sst2.sh
@@ -29,7 +29,7 @@ fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0"
 fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1"
 fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0"
 
-python eval_arc.py \
+python -u eval_arc.py \
     --train_ratio=1.0 \
     --valid_ratio=1.0 \
     --min_count=1 \
diff --git a/examples/nas/textnas/train_sst5.sh b/examples/nas/textnas/train_sst5.sh
index b26df14b9b..5a59476b6b 100755
--- a/examples/nas/textnas/train_sst5.sh
+++ b/examples/nas/textnas/train_sst5.sh
@@ -29,12 +29,12 @@ fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0"
 fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1"
 fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0"
 
-python eval_arc.py \
+python -u eval_arc.py \
     --train_ratio=1.0 \
     --valid_ratio=1.0 \
     --min_count=1 \
     --is_mask=True \
-    --is_binary=True \
+    --is_binary=False \
     --embedding_model="glove" \
     --child_lr_decay_scheme="cosine" \
     --data_path="data" \

From 5137dcd4d2400a8b16a7625483d9ac73e2ec50b0 Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Mon, 23 Mar 2020 06:12:17 +0000
Subject: [PATCH 04/12] Bug fix.

---
 examples/nas/textnas/eval_arc.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/nas/textnas/eval_arc.py b/examples/nas/textnas/eval_arc.py
index 63a79d1672..980151baff 100644
--- a/examples/nas/textnas/eval_arc.py
+++ b/examples/nas/textnas/eval_arc.py
@@ -443,7 +443,7 @@ def train(data_path, output_dir, num_layers):
     if FLAGS.child_optim_algo == "adam":
         optimizer = optim.Adam(child_model.parameters(), eps=1e-3, weight_decay=FLAGS.child_l2_reg) # with L2
     else:
-        raise ValueError("Unknown optim_algo {}".format(optim_algo))
+        raise ValueError("Unknown optim_algo {}".format(FLAGS.child_optim_algo))
 
     child_model.cuda()
     criterion.cuda()
@@ -506,7 +506,7 @@ def train(data_path, output_dir, num_layers):
         # compute the gradient norm value
         grad_norm = nn.utils.clip_grad_norm_(trainable_params, 99999999)
         for param in trainable_params:
-            nn.utils.clip_grad_norm_(param, grad_bound) #clip grad
+            nn.utils.clip_grad_norm_(param, FLAGS.child_grad_bound) # clip grad
 
         optimizer.step()
 
@@ -521,6 +521,7 @@ def train(data_path, output_dir, num_layers):
             log_string += " tr_acc={:<3d}/{:>3d}".format(acc, logits.size()[0])
             log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60)
             print(log_string)
+
     epoch += 1
     save_state = {

From d02bc9773b75cb490d46c45d3cb17675bbb85115 Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Wed, 25 Mar 2020 02:24:58 +0000
Subject: [PATCH 05/12] Adjust the running parameters.

--- examples/nas/textnas/eval_arc.py | 1 + examples/nas/textnas/train_sst2.sh | 6 +++--- examples/nas/textnas/train_sst5.sh | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/nas/textnas/eval_arc.py b/examples/nas/textnas/eval_arc.py index 980151baff..c61edc86f1 100644 --- a/examples/nas/textnas/eval_arc.py +++ b/examples/nas/textnas/eval_arc.py @@ -409,6 +409,7 @@ def _update(): params_group['lr'] = learning_rate return learning_rate + def train(data_path, output_dir, num_layers): print("Build dataloader") train_dataset, valid_dataset, test_dataset, embedding = \ diff --git a/examples/nas/textnas/train_sst2.sh b/examples/nas/textnas/train_sst2.sh index c3f24a2be7..40cc43aa16 100755 --- a/examples/nas/textnas/train_sst2.sh +++ b/examples/nas/textnas/train_sst2.sh @@ -43,14 +43,14 @@ python -u eval_arc.py \ --output_dir="output_sst2" \ --global_seed=1234 \ --max_input_length=64 \ - --batch_size=128 \ + --batch_size=256 \ --eval_batch_size=128 \ --num_epochs=10 \ --log_every=50 \ --eval_every_epochs=1 \ --child_num_layers=24 \ - --child_out_filters=256 \ - --child_l2_reg=1e-6 \ + --child_out_filters=128 \ + --child_l2_reg=2e-5 \ --cnn_keep_prob=0.8 \ --final_output_keep_prob=1.0 \ --embed_keep_prob=0.8 \ diff --git a/examples/nas/textnas/train_sst5.sh b/examples/nas/textnas/train_sst5.sh index 5a59476b6b..79da3518f6 100755 --- a/examples/nas/textnas/train_sst5.sh +++ b/examples/nas/textnas/train_sst5.sh @@ -58,7 +58,7 @@ python -u eval_arc.py \ --attention_keep_prob=0.8 \ --child_lr=0.02 \ --child_lr_max=0.002 \ - --child_lr_min=0.0002 \ + --child_lr_min=1e-6 \ --child_lr_T_0=10 \ --child_lr_T_mul=2 \ --multi_path=True \ From eefe0e05cb4cfb0b610fbe6fc117e998c7ec63f4 Mon Sep 17 00:00:00 2001 From: Yaming Yang Date: Sun, 29 Mar 2020 13:45:25 +0000 Subject: [PATCH 06/12] Adopted to nni interface. --- examples/nas/textnas/macro_child.py | 223 ------------------ .../nas/textnas/{eval_arc.py => retrain.py} | 153 +++++------- examples/nas/textnas/run_retrain.sh | 41 ++++ examples/nas/textnas/train_sst2.sh | 69 ------ examples/nas/textnas/train_sst5.sh | 69 ------ 5 files changed, 99 insertions(+), 456 deletions(-) delete mode 100644 examples/nas/textnas/macro_child.py rename examples/nas/textnas/{eval_arc.py => retrain.py} (80%) create mode 100755 examples/nas/textnas/run_retrain.sh delete mode 100755 examples/nas/textnas/train_sst2.sh delete mode 100755 examples/nas/textnas/train_sst5.sh diff --git a/examples/nas/textnas/macro_child.py b/examples/nas/textnas/macro_child.py deleted file mode 100644 index a74f24f5bc..0000000000 --- a/examples/nas/textnas/macro_child.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import os -import sys - -import numpy as np -import torch -from torch import nn -import torch.nn.functional as Func - -from ops import * -from utils import GlobalAvgPool, GlobalMaxPool - -class MacroChild(nn.Module): - def __init__(self, - embedding, - fixed_arc=None, - out_filters_scale=1, - num_layers=2, - out_filters=24, - cnn_keep_prob=1.0, - final_output_keep_prob=1.0, - lstm_out_keep_prob=1.0, - embed_keep_prob=1.0, - attention_keep_prob=1.0, - multi_path=False, - embedding_model="none", - all_layer_output=False, - output_linear_combine=False, - num_last_layer_output=0, - is_mask=False, - output_type="avg_pool", - class_num=5, - *args, - **kwargs): - super(MacroChild, self).__init__() - - self.fixed_arc = fixed_arc - self.all_layer_output = all_layer_output - self.output_linear_combine = output_linear_combine - self.num_last_layer_output = max(num_last_layer_output, 0) - self.is_mask = is_mask - self.output_type = output_type - self.multi_path = multi_path - self.embedding_model = embedding_model - self.out_filters = out_filters * out_filters_scale - self.num_layers = num_layers - self.class_num = class_num - self.cnn_keep_prob = cnn_keep_prob - self.final_output_keep_prob = final_output_keep_prob - self.lstm_out_keep_prob = lstm_out_keep_prob - self.embed_keep_prob = embed_keep_prob - self.attention_keep_prob = attention_keep_prob - - fixed_arc = np.array([int(x) for x in self.fixed_arc.split(" ") if x]) - self.sample_arc = fixed_arc - - layers = [] - - out_filters = self.out_filters - if self.embedding_model == "glove": - self.embedding = nn.Parameter(embedding) - else: - raise NotImplementedError("Unknown embedding_model '{}'".format(embedding_model)) - - self.init_conv = ConvBN(1, self.embedding.size()[1], out_filters, cnn_keep_prob, False, True) - - for layer_id in range(self.num_layers): - layers.append(self.make_fixed_layer(layer_id, out_filters)) - self.layers = nn.ModuleList(layers) - - if self.all_layer_output and self.output_linear_combine: # use linear_combine - self._linear_combine = LinearCombine(self.num_layers) - self.linear_out = nn.Linear(out_filters, self.class_num) - - self.embed_dropout= nn.Dropout(p=(1 - embed_keep_prob)) - self.output_dropout= nn.Dropout(p=(1 - final_output_keep_prob)) - - if self.output_type == "avg_pool": - self.output_pool = GlobalAvgPool() - elif self.output_type == "max_pool": - self.output_pool = GlobalMaxPool() - else: - raise ValueError("Unsupported output type.") - - def forward(self, sent_ids, mask): - seq = Func.embedding(sent_ids.long(), self.embedding) - seq = self.embed_dropout(seq) - - seq = torch.transpose(seq, 1, 2) # from (N, L, C) -> (N, C, L) - - x = self.init_conv(seq, mask) - - start_idx = 0 - prev_layers = [] - final_flags = [] - - for layer_id in range(self.num_layers): # run layers - layer = self.layers[layer_id] - x = self.run_fixed_layer(x, mask, prev_layers, layer, layer_id, start_idx, - final_flags=final_flags) # run needed branches - prev_layers.append(x) - final_flags.append(1) - - start_idx += 1 + layer_id - if self.multi_path: - start_idx += 1 - - final_layers = [] - final_layers_idx = [] - for i in range(0, len(prev_layers)): - if self.all_layer_output: - if self.num_last_layer_output == 0: - final_layers.append(prev_layers[i]) - final_layers_idx.append(i) - elif i >= max((len(prev_layers) - self.num_last_layer_output), 0): - final_layers.append(prev_layers[i]) - final_layers_idx.append(i) - else: - final_layers.append(final_flags[i] * prev_layers[i]) - - if self.all_layer_output and 
self.output_linear_combine: # all layer ooutput and use linear_combine - x = self._linear_combine(torch.stack(final_layers)) - else: - x = sum(final_layers) - if not self.all_layer_output: - x /= sum(final_flags) - else: - x /= len(final_layers) - - x = self.output_pool(x, mask) - x = self.output_dropout(x) - x = self.linear_out(x) - return x - - def make_fixed_layer(self, layer_id, out_filters): - size = [1, 3, 5, 7] - separables = [False, False, False, False] - - branches = [] - - if self.multi_path: - branch_id = (layer_id + 1) * (layer_id + 2) // 2 - else: - branch_id = (layer_id) * (layer_id + 1) // 2 - - bn_flag = False - for i in range(layer_id): - if self.sample_arc[branch_id + 1 + i] == 1: - bn_flag = True - branch_id = self.sample_arc[branch_id] - - for operation_id in [0, 1, 2, 3]: # conv_opt - if branch_id == operation_id: - filter_size = size[operation_id] - separable = separables[operation_id] - op = ConvBN(filter_size, out_filters, out_filters, self.cnn_keep_prob, False, True) - branches.append(op) - if branch_id == 4: - branches.append(AvgPool(3, False, True)) - elif branch_id == 5: - branches.append(MaxPool(3, False, True)) - elif branch_id == 6: - branches.append(RNN(out_filters, self.lstm_out_keep_prob)) - elif branch_id == 7: - branches.append(Attention(out_filters, 4, self.attention_keep_prob, self.is_mask)) - - branches = nn.ModuleList(branches) - bn = None - if bn_flag: - bn = BatchNorm(self.out_filters, False, True) - - return nn.ModuleList([branches, bn]) - - def run_fixed_layer(self, x, mask, prev_layers, layers, layer_id, start_idx, final_flags): - layer = layers[0] - bn = layers[1] - - if len(prev_layers) > 0: - if self.multi_path: - pre_layer_id = self.sample_arc[start_idx] - num_pre_layers = len(prev_layers) - if num_pre_layers > 5: - num_pre_layers = 5 - if pre_layer_id >= num_pre_layers: - final_flags[-1] = 0 - inputs = prev_layers[-1] - else: - layer_idx = len(prev_layers) - 1 - pre_layer_id - final_flags[layer_idx] = 0 - inputs = prev_layers[layer_idx] - else: - inputs = prev_layers[-1] - final_flags[-1] = 0 - else: - inputs = x - - if self.multi_path: - start_idx += 1 - - branches = [] - # run branch op - branch_id = 0 - branches.append(layer[branch_id](inputs, mask)) - - if layer_id == 0: - out = sum(branches) - else: - skip_start = start_idx + 1 - skip = self.sample_arc[skip_start:skip_start + layer_id] - - res_layers = [] - for i in range(layer_id): - if skip[i] == 1: - res_layers.append(prev_layers[i]) - final_flags[i] = 0 - prev = branches + res_layers - out = sum(prev) # tensor sum - if len(prev) > 1: - out = bn(out, mask) - - return out diff --git a/examples/nas/textnas/eval_arc.py b/examples/nas/textnas/retrain.py similarity index 80% rename from examples/nas/textnas/eval_arc.py rename to examples/nas/textnas/retrain.py index c61edc86f1..ab8f5c661c 100644 --- a/examples/nas/textnas/eval_arc.py +++ b/examples/nas/textnas/retrain.py @@ -1,10 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
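+# A note on the SGDR-style schedule carried over from eval_arc.py
+# (update_lr below): within a restart cycle of length T_i,
+#     lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * T_curr / T_i))
+# and every finished cycle restarts with T_i multiplied by lr_T_mul.
+# Worked example with the defaults T_0=10, T_mul=2: epoch 12 falls in the
+# second cycle (T_i=20, T_curr=2), giving
+#     lr ~= lr_min + 0.976 * (lr_max - lr_min).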
+import sys import os +import logging import pickle import shutil -import sys import random import math @@ -20,10 +21,14 @@ from torch.utils.data import DataLoader import torch.nn.functional as Func -from macro_child import MacroChild +from model import Model +from nni.nas.pytorch.fixed import apply_fixed_architecture from dataloader import read_data_sst +logger = logging.getLogger("nni.textnas") + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( @@ -31,16 +36,11 @@ def parse_args(): type=distutils.util.strtobool, default=True, help="Whether to clean the output dir if existed. (default: %(default)s)") - parser.add_argument( - "--embedding_model", - type=str, - default="glove", - help="Embedding type. (default: %(default)s)") parser.add_argument( "--child_fixed_arc", type=str, required=True, - help="Architecture description. (default: %(default)s)") + help="Architecture json file. (default: %(default)s)") parser.add_argument( "--data_path", type=str, @@ -116,11 +116,6 @@ def parse_args(): type=int, default=1, help="The threshold to cut off low frequent words. (default: %(default)s)") - parser.add_argument( - "--num_last_layer_output", - type=int, - default=0, - help="The last n layers as output, 0 for all. (default: %(default)s)") parser.add_argument( "--train_ratio", type=float, @@ -194,7 +189,7 @@ def parse_args(): parser.add_argument( "--output_type", type=str, - default="avg_pool", + default="avg", help="Opertor type for the time steps reduction. (default: %(default)s)") parser.add_argument( "--multi_path", @@ -207,15 +202,10 @@ def parse_args(): default=False, help="Binary label for sst dataset. (default: %(default)s)") parser.add_argument( - "--all_layer_output", - type=distutils.util.strtobool, - default=True, - help="Use all layers as output. (default: %(default)s)") - parser.add_argument( - "--output_linear_combine", + "--is_cuda", type=distutils.util.strtobool, default=True, - help="Combine all the layers in linear way. (default: %(default)s)") + help="Specify the device type. (default: %(default)s)") parser.add_argument( "--is_mask", type=distutils.util.strtobool, @@ -248,58 +238,40 @@ def parse_args(): def set_random_seed(seed): - print("-" * 80) - print("set random seed for data reading: {}".format(seed)) + logger.info("set random seed for data reading: {}".format(seed)) random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.backends.cudnn.deterministic = True + if FLAGS.is_cuda: + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True def get_model(embedding, num_layers): - print("num layers: {0}".format(num_layers)) + logger.info("num layers: {0}".format(num_layers)) assert FLAGS.child_fixed_arc is not None, "Architecture should be provided." 
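+    # --child_fixed_arc now points at a JSON file exported by the NNI
+    # search phase (run_retrain.sh uses checkpoints/architecture_00.json);
+    # apply_fixed_architecture() below reads it and pins each mutable
+    # choice declared in Model to the recorded decision. A minimal sketch
+    # of the resulting call contract (note the single-tuple input used
+    # throughout this script):
+    #
+    #   model = get_model(embedding, num_layers)  # architecture already fixed
+    #   logits = model((sent_ids, mask))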
- child_model = MacroChild( + child_model = Model( embedding=embedding, - fixed_arc=FLAGS.child_fixed_arc, - out_filters_scale=FLAGS.child_out_filters_scale, + hidden_units=FLAGS.child_out_filters_scale * FLAGS.child_out_filters, num_layers=num_layers, - out_filters=FLAGS.child_out_filters, + num_classes=FLAGS.class_num, + choose_from_k=5 if FLAGS.multi_path else 1, + lstm_keep_prob=FLAGS.lstm_out_keep_prob, cnn_keep_prob=FLAGS.cnn_keep_prob, - final_output_keep_prob=FLAGS.final_output_keep_prob, - lstm_out_keep_prob=FLAGS.lstm_out_keep_prob, + att_keep_prob=FLAGS.attention_keep_prob, + att_mask=FLAGS.is_mask, embed_keep_prob=FLAGS.embed_keep_prob, - attention_keep_prob=FLAGS.attention_keep_prob, - multi_path=FLAGS.multi_path, - embedding_model=FLAGS.embedding_model, - all_layer_output=FLAGS.all_layer_output, - output_linear_combine=FLAGS.output_linear_combine, - num_last_layer_output=FLAGS.num_last_layer_output, - is_mask=FLAGS.is_mask, - output_type=FLAGS.output_type, - class_num=FLAGS.class_num) + final_output_keep_prob=FLAGS.final_output_keep_prob, + global_pool=FLAGS.output_type) + apply_fixed_architecture(child_model, FLAGS.child_fixed_arc) return child_model -def print_arc(arc, num_layers): - start = 0 - for i in range(0, num_layers): - end = start + i + 1 - if FLAGS.multi_path: - end += 1 - out_str = "fixed_arc=\"$fixed_arc {0}\"".format(np.reshape(arc[start: end], [-1])) - out_str = out_str.replace("[", "").replace("]", "") - print(out_str) - - start = end - - -def eval_once(child_model, eval_set, criterion, valid_dataloader=None, test_dataloader=None): +def eval_once(child_model, device, eval_set, criterion, valid_dataloader=None, test_dataloader=None): if eval_set == "test": assert test_dataloader is not None dataloader = test_dataloader @@ -317,11 +289,11 @@ def eval_once(child_model, eval_set, criterion, valid_dataloader=None, test_data for batch in dataloader: (sent_ids, mask), labels = batch - sent_ids = sent_ids.cuda() - mask = mask.cuda() - labels = labels.cuda() + sent_ids = sent_ids.to(device, non_blocking=True) + mask = mask.to(device, non_blocking=True) + labels = labels.to(device, non_blocking=True) - logits = child_model(sent_ids, mask) # run + logits = child_model((sent_ids, mask)) # run loss = criterion(logits, labels.long()) loss = loss.mean() @@ -338,14 +310,12 @@ def eval_once(child_model, eval_set, criterion, valid_dataloader=None, test_data final_acc = float(tot_acc) / tot else: final_acc = 0 - print("Error in calculating final_acc") + logger.info("Error in calculating final_acc") return final_acc, loss def print_user_flags(FLAGS, line_limit=80): - print("-" * 80) - - log_strings = "" + log_strings = "\n" + "-" * line_limit + "\n" for flag_name in sorted(vars(FLAGS)): value = "{}".format(getattr(FLAGS, flag_name)) log_string = flag_name @@ -353,7 +323,8 @@ def print_user_flags(FLAGS, line_limit=80): log_string += value log_strings = log_strings + log_string log_strings = log_strings + "\n" - print(log_strings) + log_strings += "-" * line_limit + logger.info(log_strings) def count_model_params(trainable_params): @@ -410,8 +381,8 @@ def _update(): return learning_rate -def train(data_path, output_dir, num_layers): - print("Build dataloader") +def train(device, data_path, output_dir, num_layers): + logger.info("Build dataloader") train_dataset, valid_dataset, test_dataset, embedding = \ read_data_sst(data_path, FLAGS.max_input_length, @@ -423,16 +394,15 @@ def train(data_path, output_dir, num_layers): test_dataloader = DataLoader(test_dataset, 
     test_dataloader = DataLoader(test_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True)
     valid_dataloader = DataLoader(valid_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True)
-    print("Build model")
-    print("-" * 80)
+    logger.info("Build model")
     child_model = get_model(embedding, num_layers)
-    print("Finish build model")
+    logger.info("Finish build model")
 
-    for name, var in child_model.named_parameters():
-        print(name, var.size(), var.requires_grad)  # output all params
+    #for name, var in child_model.named_parameters():
+    #    logger.info(name, var.size(), var.requires_grad)  # output all params
 
     num_vars = count_model_params(child_model.parameters())
-    print("Model has {} params".format(num_vars))
+    logger.info("Model has {} params".format(num_vars))
 
     for m in child_model.modules():  # initializer
         if isinstance(m, (nn.Conv1d, nn.Linear)):
@@ -446,14 +416,10 @@
     else:
         raise ValueError("Unknown optim_algo {}".format(FLAGS.child_optim_algo))
 
-    child_model.cuda()
-    criterion.cuda()
-
-    fixed_arc = np.array([int(x) for x in FLAGS.child_fixed_arc.split(" ") if x])
-    print_arc(fixed_arc, num_layers)
+    child_model.to(device)
+    criterion.to(device)
 
-    print("Start training")
-    print("-" * 80)
+    logger.info("Start training")
     start_time = time.time()
     step = 0
@@ -485,13 +451,13 @@
         for batch in train_dataloader:
             (sent_ids, mask), labels = batch
-            sent_ids = sent_ids.cuda()
-            mask = mask.cuda()
-            labels = labels.cuda()
+            sent_ids = sent_ids.to(device, non_blocking=True)
+            mask = mask.to(device, non_blocking=True)
+            labels = labels.to(device, non_blocking=True)
 
             step += 1
 
-            logits = child_model(sent_ids, mask)  # run
+            logits = child_model((sent_ids, mask))  # run
 
             loss = criterion(logits, labels.long())
             loss = loss.mean()
@@ -521,7 +487,7 @@
             log_string += " |g|={:<8.4f}".format(grad_norm)
             log_string += " tr_acc={:<3d}/{:>3d}".format(acc, logits.size()[0])
             log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60)
-            print(log_string)
+            logger.info(log_string)
 
         epoch += 1
         save_state = {
@@ -531,12 +497,12 @@
             'optimizer_state_dict' : optimizer.state_dict()}
         torch.save(save_state, model_save_path)
         child_model.eval()
-        print("Epoch {}: Eval".format(epoch))
-        eval_acc, eval_loss = eval_once(child_model, "test", criterion, test_dataloader=test_dataloader)
-        print("ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss))
+        logger.info("Epoch {}: Eval".format(epoch))
+        eval_acc, eval_loss = eval_once(child_model, device, "test", criterion, test_dataloader=test_dataloader)
+        logger.info("ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss))
         if eval_acc > best_acc:
             best_acc = eval_acc
-            print("Save best model")
+            logger.info("Save best model")
             save_state = {
                 'step' : step,
                 'epoch' : epoch,
@@ -549,24 +515,21 @@
 
 def main():
     parse_args()
-    print("-" * 80)
 
     if not os.path.isdir(FLAGS.output_dir):
-        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
+        logger.info("Path {} does not exist. Creating.".format(FLAGS.output_dir))
         os.makedirs(FLAGS.output_dir)
     elif FLAGS.reset_output_dir:
-        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
+        logger.info("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
         shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
         os.makedirs(FLAGS.output_dir)
 
-    print("-" * 80)
-    log_file = os.path.join(FLAGS.output_dir, "stdout")
-    print("Logging to {}".format(log_file))
 
     print_user_flags(FLAGS)
 
     if FLAGS.fixed_seed:
         set_random_seed(FLAGS.global_seed)
 
-    train(FLAGS.data_path, FLAGS.output_dir, FLAGS.child_num_layers)
+    device = torch.device("cuda" if FLAGS.is_cuda else "cpu")
+    train(device, FLAGS.data_path, FLAGS.output_dir, FLAGS.child_num_layers)
 
 
 if __name__ == "__main__":
diff --git a/examples/nas/textnas/run_retrain.sh b/examples/nas/textnas/run_retrain.sh
new file mode 100755
index 0000000000..5c8ea66ae9
--- /dev/null
+++ b/examples/nas/textnas/run_retrain.sh
@@ -0,0 +1,41 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+export PYTHONPATH="$(pwd)"
+export CUDA_VISIBLE_DEVICES=0
+
+python -u retrain.py \
+  --train_ratio=1.0 \
+  --valid_ratio=1.0 \
+  --min_count=1 \
+  --is_mask=True \
+  --is_binary=True \
+  --child_lr_decay_scheme="cosine" \
+  --data_path="data" \
+  --class_num=2 \
+  --child_optim_algo="adam" \
+  --output_dir="output_sst2" \
+  --global_seed=1234 \
+  --max_input_length=64 \
+  --batch_size=128 \
+  --eval_batch_size=128 \
+  --num_epochs=10 \
+  --log_every=50 \
+  --eval_every_epochs=1 \
+  --child_num_layers=24 \
+  --child_out_filters=256 \
+  --child_l2_reg=1e-6 \
+  --cnn_keep_prob=0.8 \
+  --final_output_keep_prob=1.0 \
+  --embed_keep_prob=0.8 \
+  --lstm_out_keep_prob=0.8 \
+  --attention_keep_prob=0.8 \
+  --child_lr=0.02 \
+  --child_lr_max=0.002 \
+  --child_lr_min=5e-6 \
+  --child_lr_T_0=10 \
+  --child_lr_T_mul=2 \
+  --multi_path=True \
+  --child_fixed_arc="./checkpoints/architecture_00.json" \
+  --fixed_seed=True \
+  "$@"
diff --git a/examples/nas/textnas/train_sst2.sh b/examples/nas/textnas/train_sst2.sh
deleted file mode 100755
index 40cc43aa16..0000000000
--- a/examples/nas/textnas/train_sst2.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-export PYTHONPATH="$(pwd)"
-export CUDA_VISIBLE_DEVICES=0
-
-fixed_arc="$fixed_arc 0 5"
-fixed_arc="$fixed_arc 1 7 0"
-fixed_arc="$fixed_arc 1 3 0 0"
-fixed_arc="$fixed_arc 3 6 0 1 1"
-fixed_arc="$fixed_arc 1 1 0 1 0 0"
-fixed_arc="$fixed_arc 0 1 0 0 1 0 1"
-fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1"
-fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0"
-fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1"
-fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1"
-fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0"
-fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0"
-fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0"
-fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1"
-fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0"
-fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1"
-fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0"
-fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0"
-fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1"
-fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0"
-fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0"
-fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0"
-fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1"
-fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0"
-
-python -u eval_arc.py \
-  --train_ratio=1.0 \
-  --valid_ratio=1.0 \
-  --min_count=1 \
-  --is_mask=True \
-  --is_binary=True \
-  --embedding_model="glove" \
-  --child_lr_decay_scheme="cosine" \
-  --data_path="data" \
-  --class_num=2 \
-  --child_optim_algo="adam" \
-  --output_dir="output_sst2" \
-  --global_seed=1234 \
-  --max_input_length=64 \
-  --batch_size=256 \
-  --eval_batch_size=128 \
-  --num_epochs=10 \
-  --log_every=50 \
-  --eval_every_epochs=1 \
-  --child_num_layers=24 \
-  --child_out_filters=128 \
-  --child_l2_reg=2e-5 \
-  --cnn_keep_prob=0.8 \
-  --final_output_keep_prob=1.0 \
-  --embed_keep_prob=0.8 \
-  --lstm_out_keep_prob=0.8 \
-  --attention_keep_prob=0.8 \
-  --child_lr=0.02 \
-  --child_lr_max=0.002 \
-  --child_lr_min=5e-6 \
-  --child_lr_T_0=10 \
-  --child_lr_T_mul=2 \
-  --multi_path=True \
-  --child_fixed_arc="${fixed_arc}" \
-  --fixed_seed=True \
-  --all_layer_output=True \
-  --output_linear_combine=True \
-  "$@"
diff --git a/examples/nas/textnas/train_sst5.sh b/examples/nas/textnas/train_sst5.sh
deleted file mode 100755
index 79da3518f6..0000000000
--- a/examples/nas/textnas/train_sst5.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-export PYTHONPATH="$(pwd)"
-export CUDA_VISIBLE_DEVICES=0
-
-fixed_arc="$fixed_arc 0 5"
-fixed_arc="$fixed_arc 1 7 0"
-fixed_arc="$fixed_arc 1 3 0 0"
-fixed_arc="$fixed_arc 3 6 0 1 1"
-fixed_arc="$fixed_arc 1 1 0 1 0 0"
-fixed_arc="$fixed_arc 0 1 0 0 1 0 1"
-fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1"
-fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0"
-fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1"
-fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1"
-fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0"
-fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0"
-fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0"
-fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1"
-fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0"
-fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1"
-fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0"
-fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0"
-fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1"
-fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0"
-fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0"
-fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0"
-fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1"
-fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0"
-
-python -u eval_arc.py \
-  --train_ratio=1.0 \
-  --valid_ratio=1.0 \
-  --min_count=1 \
-  --is_mask=True \
-  --is_binary=False \
-  --embedding_model="glove" \
-  --child_lr_decay_scheme="cosine" \
-  --data_path="data" \
-  --class_num=5 \
-  --child_optim_algo="adam" \
-  --output_dir="output_sst5" \
-  --global_seed=1234 \
-  --max_input_length=64 \
-  --batch_size=256 \
-  --eval_batch_size=128 \
-  --num_epochs=10 \
-  --log_every=50 \
-  --eval_every_epochs=1 \
-  --child_num_layers=24 \
-  --child_out_filters=256 \
-  --child_l2_reg=1e-6 \
-  --cnn_keep_prob=0.8 \
-  --final_output_keep_prob=1.0 \
-  --embed_keep_prob=0.8 \
-  --lstm_out_keep_prob=0.8 \
-  --attention_keep_prob=0.8 \
-  --child_lr=0.02 \
-  --child_lr_max=0.002 \
-  --child_lr_min=1e-6 \
-  --child_lr_T_0=10 \
-  --child_lr_T_mul=2 \
-  --multi_path=True \
-  --child_fixed_arc="${fixed_arc}" \
-  --fixed_seed=True \
-  --all_layer_output=True \
-  --output_linear_combine=True \
-  "$@"

From 0f9a7c6bb6bf54d8d44ce74edd9a740ebf9aa70a Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Thu, 2 Apr 2020 07:30:26 +0000
Subject: [PATCH 07/12] Add doc of TextNAS.

---
 docs/en_US/NAS/TextNAS.md               |  56 +++++++
 examples/nas/textnas/README.md          |   6 +-
 examples/nas/textnas/README_zh_CN.md    |   6 +-
 examples/nas/textnas/arc/final_arc.json | 212 ++++++++++++++++++++++++
 examples/nas/textnas/run_retrain.sh     |   2 +-
 5 files changed, 279 insertions(+), 3 deletions(-)
 create mode 100644 docs/en_US/NAS/TextNAS.md
 create mode 100644 examples/nas/textnas/arc/final_arc.json

diff --git a/docs/en_US/NAS/TextNAS.md b/docs/en_US/NAS/TextNAS.md
new file mode 100644
index 0000000000..3e6303c2ce
--- /dev/null
+++ b/docs/en_US/NAS/TextNAS.md
@@ -0,0 +1,56 @@
+# TextNAS
+
+## Introduction
+
+This is the implementation of the TextNAS algorithm proposed in the paper [TextNAS: A Neural Architecture Search Space tailored for Text Representation](https://arxiv.org/pdf/1912.10729.pdf). TextNAS is a neural architecture search algorithm tailored for text representation. More specifically, it is based on a novel search space consisting of operators widely adopted to solve various NLP tasks, and it supports multi-path ensemble within a single network to balance the width and depth of the architecture.
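+
+As an illustration of how one such searchable layer behaves, the sketch below chooses a single operator from a candidate set in plain PyTorch. It is a simplified sketch, not the actual NNI implementation: `CandidateLayer`, its reduced operator set, and the externally supplied `chosen` index are invented for this example, and the real candidate set (listed below) also includes a bi-directional GRU and self-attention.
+
+```python
+import torch
+from torch import nn
+
+class CandidateLayer(nn.Module):
+    """One searchable layer: exactly one candidate operator is active."""
+    def __init__(self, hidden_units=256):
+        super().__init__()
+        # Length-preserving convolutions with different filter sizes,
+        # plus two pooling operators.
+        self.candidates = nn.ModuleList(
+            [nn.Conv1d(hidden_units, hidden_units, kernel_size=k, padding=k // 2)
+             for k in (1, 3, 5, 7)]
+            + [nn.MaxPool1d(kernel_size=3, stride=1, padding=1),
+               nn.AvgPool1d(kernel_size=3, stride=1, padding=1)])
+
+    def forward(self, x, chosen):
+        # During search a controller samples `chosen`; after search it is fixed.
+        return self.candidates[chosen](x)
+
+layer = CandidateLayer()
+out = layer(torch.randn(2, 256, 64), chosen=1)  # (batch, hidden, seq_len)
+print(out.shape)  # torch.Size([2, 256, 64])
+```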
+
+The search space of TextNAS contains:
+
+* 1-D convolutional operator with filter size 1, 3, 5, 7
+* recurrent operator (bi-directional GRU)
+* self-attention operator
+* pooling operator (max/average)
+
+Following the ENAS algorithm, TextNAS also utilizes parameter sharing to accelerate the search and adopts a reinforcement-learning controller for architecture sampling and generation. Please refer to the paper for more details of TextNAS.
+
+## Examples
+
+### Search Space
+
+[Example code](https://github.com/microsoft/nni/tree/master/examples/nas/textnas)
+
+```bash
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
+git clone https://github.com/Microsoft/nni.git
+
+# search the best architecture
+cd examples/nas/textnas
+
+# view more options for search
+python3 search.py -h
+```
+
+### Retrain
+
+```bash
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
+git clone https://github.com/Microsoft/nni.git
+
+# enter the code folder
+cd examples/nas/textnas
+
+# default to retrain on sst-2
+sh run_retrain.sh
+```
+
+## Reference
+
+### PyTorch
+
+```eval_rst
+.. autoclass:: nni.nas.pytorch.enas.EnasTrainer
+    :members:
+
+.. autoclass:: nni.nas.pytorch.enas.EnasMutator
+    :members:
+```
diff --git a/examples/nas/textnas/README.md b/examples/nas/textnas/README.md
index fb261ad04d..f8ebe24afd 100644
--- a/examples/nas/textnas/README.md
+++ b/examples/nas/textnas/README.md
@@ -42,4 +42,8 @@ By default, 20 sampled architectures will be exported into `checkpoints` directo
 
 ## Retrain
 
-Not ready.
+```
+sh run_retrain.sh
+```
+
+By default, the script will retrain the architecture provided by the author on the SST-2 dataset.
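+
+To retrain a different architecture, point `--child_fixed_arc` at its exported JSON file. Conceptually, the script rebuilds the child network and then freezes its choices, roughly as in the sketch below. This is a simplified illustration assuming the default values from `run_retrain.sh`; the random embedding matrix is only a placeholder for the pretrained matrix that `dataloader.read_data_sst` actually returns.
+
+```python
+import numpy as np
+from nni.nas.pytorch.fixed import apply_fixed_architecture
+from model import Model  # the TextNAS child network in this folder
+
+embedding = np.random.random([10000, 300]).astype(np.float32)  # placeholder
+model = Model(embedding=embedding, hidden_units=256, num_layers=24,
+              num_classes=2, choose_from_k=5, lstm_keep_prob=0.8,
+              cnn_keep_prob=0.8, att_keep_prob=0.8, att_mask=True,
+              embed_keep_prob=0.8, final_output_keep_prob=1.0,
+              global_pool="avg")
+# Fix every LayerChoice/InputChoice according to the exported architecture.
+apply_fixed_architecture(model, "arc/final_arc.json")
+```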
diff --git a/examples/nas/textnas/README_zh_CN.md b/examples/nas/textnas/README_zh_CN.md
index 22bbbb4c9b..f9ad379939 100644
--- a/examples/nas/textnas/README_zh_CN.md
+++ b/examples/nas/textnas/README_zh_CN.md
@@ -42,4 +42,8 @@ python search.py
 
 ## 重新训练
 
-待完成
+```
+sh run_retrain.sh
+```
+
+默认情况下，会在SST-2数据集上重新训练作者提供的结构。
diff --git a/examples/nas/textnas/arc/final_arc.json b/examples/nas/textnas/arc/final_arc.json
new file mode 100644
index 0000000000..c1e12c2d4b
--- /dev/null
+++ b/examples/nas/textnas/arc/final_arc.json
@@ -0,0 +1,212 @@
+{
+  "LayerChoice1": [
+    false, false, false, false, false, true, false, false
+  ],
+  "InputChoice2": [
+    true
+  ],
+  "LayerChoice3": [
+    false, false, false, false, false, false, false, true
+  ],
+  "InputChoice4": [
+    false
+  ],
+  "InputChoice5": [
+    true, false
+  ],
+  "LayerChoice6": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice7": [
+    false, false
+  ],
+  "InputChoice8": [
+    false, false, true
+  ],
+  "LayerChoice9": [
+    false, false, false, false, false, false, true, false
+  ],
+  "InputChoice10": [
+    false, true, true
+  ],
+  "InputChoice11": [
+    false, false, true, false
+  ],
+  "LayerChoice12": [
+    false, true, false, false, false, false, false, false
+  ],
+  "InputChoice13": [
+    false, true, false, false
+  ],
+  "InputChoice14": [
+    false, false, false, false, true
+  ],
+  "LayerChoice15": [
+    false, true, false, false, false, false, false, false
+  ],
+  "InputChoice16": [
+    false, false, true, false, true
+  ],
+  "InputChoice17": [
+    false, false, false, false, true
+  ],
+  "LayerChoice18": [
+    true, false, false, false, false, false, false, false
+  ],
+  "InputChoice19": [
+    false, false, true, true, true, true
+  ],
+  "InputChoice20": [
+    true, false, false, false, false
+  ],
+  "LayerChoice21": [
+    false, false, false, false, false, false, true, false
+  ],
+  "InputChoice22": [
+    false, true, true, false, false, false, false
+  ],
+  "InputChoice23": [
+    false, true, false, false, false
+  ],
+  "LayerChoice24": [
+    false, false, false, false, false, true, false, false
+  ],
+  "InputChoice25": [
+    false, true, false, true, true, false, true, true
+  ],
+  "InputChoice26": [
+    false, false, true, false, false
+  ],
+  "LayerChoice27": [
+    false, false, false, false, false, true, false, false
+  ],
+  "InputChoice28": [
+    false, false, false, false, false, true, false, true, true
+  ],
+  "InputChoice29": [
+    true, false, false, false, false
+  ],
+  "LayerChoice30": [
+    false, false, false, false, false, false, false, true
+  ],
+  "InputChoice31": [
+    true, true, false, false, true, false, false, true, true, false
+  ],
+  "InputChoice32": [
+    true, false, false, false, false
+  ],
+  "LayerChoice33": [
+    false, false, false, false, true, false, false, false
+  ],
+  "InputChoice34": [
+    true, false, false, true, true, true, true, false, false, false, false
+  ],
+  "InputChoice35": [
+    false, false, false, true, false
+  ],
+  "LayerChoice36": [
+    false, true, false, false, false, false, false, false
+  ],
+  "InputChoice37": [
+    true, true, false, true, false, true, false, false, true, false, false, false
+  ],
+  "InputChoice38": [
+    false, false, false, true, false
+  ],
+  "LayerChoice39": [
+    false, false, true, false, false, false, false, false
+  ],
+  "InputChoice40": [
+    true, true, false, false, false, false, true, false, false, true, true, false, true
+  ],
+  "InputChoice41": [
+    false, false, false, true, false
+  ],
+  "LayerChoice42": [
+    true, false, false, false, false, false, false, false
+  ],
+  "InputChoice43": [
+    false, false, true, false, false, false, true, true, true, false, true, true, false, false
+  ],
+  "InputChoice44": [
+    false, false, false, false, true
+  ],
+  "LayerChoice45": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice46": [
+    true, false, false, false, false, false, true, false, false, false, true, true, false, false, true
+  ],
+  "InputChoice47": [
+    false, false, false, true, false
+  ],
+  "LayerChoice48": [
+    false, false, true, false, false, false, false, false
+  ],
+  "InputChoice49": [
+    false, false, false, false, false, false, false, false, false, true, true, false, true, false, true, false
+  ],
+  "InputChoice50": [
+    false, false, false, false, true
+  ],
+  "LayerChoice51": [
+    false, false, false, false, true, false, false, false
+  ],
+  "InputChoice52": [
+    false, true, true, true, true, false, false, true, false, true, false, false, false, false, true, false, false
+  ],
+  "InputChoice53": [
+    false, false, true, false, false
+  ],
+  "LayerChoice54": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice55": [
+    false, false, false, false, false, true, false, false, false, false, false, false, false, true, true, true, false, true
+  ],
+  "InputChoice56": [
+    false, false, true, false, false
+  ],
+  "LayerChoice57": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice58": [
+    false, false, false, true, false, false, false, false, false, false, true, false, false, false, true, false, false, false, false
+  ],
+  "InputChoice59": [
+    false, true, false, false, false
+  ],
+  "LayerChoice60": [
+    false, false, false, false, false, true, false, false
+  ],
+  "InputChoice61": [
+    true, true, false, false, false, false, false, false, false, false, true, true, false, false, true, true, true, true, false, false
+  ],
+  "InputChoice62": [
+    true, false, false, false, false
+  ],
+  "LayerChoice63": [
+    false, false, false, false, false, false, false, true
+  ],
+  "InputChoice64": [
+    false, true, true, true, false, false, false, true, false, true, true, true, true, false, true, false, false, false, false, false, false
+  ],
+  "InputChoice65": [
+    false, false, false, false, true
+  ],
+  "LayerChoice66": [
+    false, false, false, false, false, false, false, true
+  ],
+  "InputChoice67": [
+    false, false, true, true, true, true, false, true, false, true, true, false, false, false, false, true, false, false, false, false, false, true
+  ],
+  "InputChoice68": [
+    false, false, false, true, false
+  ],
+  "LayerChoice69": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice70": [
+    true, false, false, true, false, false, false, true, false, false, false, false, true, false, false, false, true, false, false, false, false, false, false
+  ]
+}
diff --git a/examples/nas/textnas/run_retrain.sh b/examples/nas/textnas/run_retrain.sh
index 5c8ea66ae9..b9306dbed2 100755
--- a/examples/nas/textnas/run_retrain.sh
+++ b/examples/nas/textnas/run_retrain.sh
@@ -36,6 +36,6 @@ python -u retrain.py \
   --child_lr_T_0=10 \
   --child_lr_T_mul=2 \
   --multi_path=True \
-  --child_fixed_arc="./checkpoints/architecture_00.json" \
+  --child_fixed_arc="./arc/final_arc.json" \
   --fixed_seed=True \
   "$@"

From cc06ecdfc8fbc173ead4d2366ce10960821a0c1e Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Thu, 2 Apr 2020 07:50:24 +0000
Subject: [PATCH 08/12] Undo modification of the Chinese document.
---
 examples/nas/textnas/README_zh_CN.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/nas/textnas/README_zh_CN.md b/examples/nas/textnas/README_zh_CN.md
index f9ad379939..22bbbb4c9b 100644
--- a/examples/nas/textnas/README_zh_CN.md
+++ b/examples/nas/textnas/README_zh_CN.md
@@ -42,8 +42,4 @@ python search.py
 
 ## 重新训练
 
-```
-sh run_retrain.sh
-```
-
-默认情况下，会在SST-2数据集上重新训练作者提供的结构。
+待完成

From 4c1b0f28902294741eef3dd0053a62e98fd5e6d1 Mon Sep 17 00:00:00 2001
From: Yuge Zhang
Date: Thu, 2 Apr 2020 16:43:53 +0800
Subject: [PATCH 09/12] Incorporate previous doc

---
 docs/en_US/NAS/TextNAS.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/docs/en_US/NAS/TextNAS.md b/docs/en_US/NAS/TextNAS.md
index 3e6303c2ce..d67647b88c 100644
--- a/docs/en_US/NAS/TextNAS.md
+++ b/docs/en_US/NAS/TextNAS.md
@@ -13,6 +13,32 @@ The search space of TextNAS contains:
 
 Following the ENAS algorithm, TextNAS also utilizes parameter sharing to accelerate the search and adopts a reinforcement-learning controller for architecture sampling and generation. Please refer to the paper for more details of TextNAS.
 
+## Preparation
+
+Prepare the word vectors and SST dataset, and organize them in the data directory as shown below:
+
+```
+textnas
+├── data
+│   ├── sst
+│   │   └── trees
+│   │       ├── dev.txt
+│   │       ├── test.txt
+│   │       └── train.txt
+│   └── glove.840B.300d.txt
+├── dataloader.py
+├── model.py
+├── ops.py
+├── README.md
+├── search.py
+└── utils.py
+```
+
+The following links might be helpful for finding and downloading the corresponding datasets:
+
+* [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)
+* [Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank](https://nlp.stanford.edu/sentiment/)
+
 ## Examples
 
 ### Search Space
@@ -30,6 +56,10 @@ cd examples/nas/textnas
 python3 search.py -h
 ```
 
+After each search epoch, 10 sampled architectures will be tested directly. Their performance is expected to be 40% - 42% after 10 epochs.
+
+By default, 20 sampled architectures will be exported into the `checkpoints` directory for the next step.
+
 ### Retrain
 
 ```bash

From 7f8e7472c069d3d67f9581d614fdd583cf2bae2a Mon Sep 17 00:00:00 2001
From: Yuge Zhang
Date: Thu, 2 Apr 2020 17:03:00 +0800
Subject: [PATCH 10/12] Add direct link to dataset

---
 docs/en_US/NAS/TextNAS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/en_US/NAS/TextNAS.md b/docs/en_US/NAS/TextNAS.md
index d67647b88c..2bdc64c241 100644
--- a/docs/en_US/NAS/TextNAS.md
+++ b/docs/en_US/NAS/TextNAS.md
@@ -37,7 +37,9 @@ textnas
 The following links might be helpful for finding and downloading the corresponding datasets:
 
 * [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)
+  * [glove.840B.300d.txt](http://nlp.stanford.edu/data/glove.840B.300d.zip)
 * [Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank](https://nlp.stanford.edu/sentiment/)
+  * [trainDevTestTrees_PTB.zip](https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip)
 
 ## Examples

From bf90867955bfa87975882b84f676badf8031588b Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Thu, 2 Apr 2020 13:32:05 +0000
Subject: [PATCH 11/12] Change to py3 and add doc to the index.
---
 docs/en_US/nas.rst                  | 1 +
 examples/nas/textnas/run_retrain.sh | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/en_US/nas.rst b/docs/en_US/nas.rst
index 0a56caa742..f5a06c5c9a 100644
--- a/docs/en_US/nas.rst
+++ b/docs/en_US/nas.rst
@@ -26,5 +26,6 @@ For details, please refer to the following tutorials:
     SPOS
     CDARTS
     ProxylessNAS
+    TextNAS
     Customize a NAS Algorithm
     API Reference
diff --git a/examples/nas/textnas/run_retrain.sh b/examples/nas/textnas/run_retrain.sh
index b9306dbed2..1f02121e31 100755
--- a/examples/nas/textnas/run_retrain.sh
+++ b/examples/nas/textnas/run_retrain.sh
@@ -4,7 +4,7 @@
 export PYTHONPATH="$(pwd)"
 export CUDA_VISIBLE_DEVICES=0
 
-python -u retrain.py \
+python3 -u retrain.py \
   --train_ratio=1.0 \
   --valid_ratio=1.0 \
   --min_count=1 \

From 56b804cd7d1d9b05fdad971f412fd99108f03ba7 Mon Sep 17 00:00:00 2001
From: QuanluZhang
Date: Fri, 3 Apr 2020 17:10:23 +0800
Subject: [PATCH 12/12] Update TextNAS.md

---
 docs/en_US/NAS/TextNAS.md | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/docs/en_US/NAS/TextNAS.md b/docs/en_US/NAS/TextNAS.md
index 2bdc64c241..7c455534ec 100644
--- a/docs/en_US/NAS/TextNAS.md
+++ b/docs/en_US/NAS/TextNAS.md
@@ -77,12 +77,4 @@ sh run_retrain.sh
 
 ## Reference
 
-### PyTorch
-
-```eval_rst
-.. autoclass:: nni.nas.pytorch.enas.EnasTrainer
-    :members:
-
-.. autoclass:: nni.nas.pytorch.enas.EnasMutator
-    :members:
-```
+TextNAS directly uses EnasTrainer; please refer to [ENAS](./ENAS.md) for the trainer APIs.
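+
+For orientation, the search entry point (`search.py` in the example folder) wires the child model into the ENAS trainer roughly as sketched below. This is an unverified sketch: the `Model(...)` and dataset constructions are elided, the keyword set is abbreviated, and the exact `EnasTrainer` signature should be taken from the ENAS reference above.
+
+```python
+import torch
+from torch import nn
+from nni.nas.pytorch import enas
+
+model = ...      # the TextNAS child network from model.py
+train_set = ...  # SST datasets from dataloader.read_data_sst
+valid_set = ...
+
+def accuracy(output, target):
+    # Metric reported during search.
+    return {"acc": (output.argmax(1) == target).float().mean().item()}
+
+def reward(output, target):
+    # Reward fed back to the reinforcement-learning controller.
+    return (output.argmax(1) == target).float().mean()
+
+trainer = enas.EnasTrainer(model,
+                           loss=nn.CrossEntropyLoss(),
+                           metrics=accuracy,
+                           reward_function=reward,
+                           optimizer=torch.optim.Adam(model.parameters()),
+                           num_epochs=10,
+                           dataset_train=train_set,
+                           dataset_valid=valid_set)
+trainer.train()
+```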