From 7afafdfd1b7b71ecb1d032cabc70a5a9b2973b38 Mon Sep 17 00:00:00 2001 From: Yaming Yang Date: Sun, 22 Mar 2020 12:07:07 +0000 Subject: [PATCH 01/12] Add evalutaion scripts for TextNAS. --- examples/nas/textnas/dataloader.py | 3 +- examples/nas/textnas/eval_arc.py | 571 ++++++++++++++++++++++++++++ examples/nas/textnas/macro_child.py | 223 +++++++++++ examples/nas/textnas/run_sst2.sh | 69 ++++ examples/nas/textnas/run_sst5.sh | 69 ++++ examples/nas/textnas/train_sst2.sh | 69 ++++ examples/nas/textnas/train_sst5.sh | 69 ++++ 7 files changed, 1072 insertions(+), 1 deletion(-) create mode 100644 examples/nas/textnas/eval_arc.py create mode 100644 examples/nas/textnas/macro_child.py create mode 100755 examples/nas/textnas/run_sst2.sh create mode 100755 examples/nas/textnas/run_sst5.sh create mode 100755 examples/nas/textnas/train_sst2.sh create mode 100755 examples/nas/textnas/train_sst5.sh diff --git a/examples/nas/textnas/dataloader.py b/examples/nas/textnas/dataloader.py index e5a4ed363f..083f1c7413 100644 --- a/examples/nas/textnas/dataloader.py +++ b/examples/nas/textnas/dataloader.py @@ -241,7 +241,8 @@ def init_trainable_embedding(embedding_path, word_id_dict, embed_dim=300): embedding = np.random.random([len(word_id_dict), embed_dim]).astype(np.float32) / 2.0 - 0.25 embedding[0] = np.zeros(embed_dim) # PAD embedding[1] = (np.random.rand(embed_dim) - 0.5) / 2 # UNK - for word, idx in word_id_dict.items(): + for word in sorted(word_id_dict.keys()): + idx = word_id_dict[word] if idx == 0 or idx == 1: continue if word in word_embed_model["mapping"]: diff --git a/examples/nas/textnas/eval_arc.py b/examples/nas/textnas/eval_arc.py new file mode 100644 index 0000000000..63a79d1672 --- /dev/null +++ b/examples/nas/textnas/eval_arc.py @@ -0,0 +1,571 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import pickle +import shutil +import sys +import random +import math + +import time +import datetime +import argparse +import distutils.util + +import numpy as np +import torch +from torch import nn +from torch import optim +from torch.utils.data import DataLoader +import torch.nn.functional as Func + +from macro_child import MacroChild +from dataloader import read_data_sst + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--reset_output_dir", + type=distutils.util.strtobool, + default=True, + help="Whether to clean the output dir if existed. (default: %(default)s)") + parser.add_argument( + "--embedding_model", + type=str, + default="glove", + help="Embedding type. (default: %(default)s)") + parser.add_argument( + "--child_fixed_arc", + type=str, + required=True, + help="Architecture description. (default: %(default)s)") + parser.add_argument( + "--data_path", + type=str, + default="data", + help="Directory containing the dataset and embedding file. (default: %(default)s)") + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="The output directory. (default: %(default)s)") + parser.add_argument( + "--child_lr_decay_scheme", + type=str, + default="cosine", + help="Learning rate annealing strategy, only 'cosine' supported. (default: %(default)s)") + parser.add_argument( + "--batch_size", + type=int, + default=128, + help="Number of samples each batch for training. (default: %(default)s)") + parser.add_argument( + "--eval_batch_size", + type=int, + default=128, + help="Number of samples each batch for evaluation. 
(default: %(default)s)") + parser.add_argument( + "--class_num", + type=int, + default=5, + help="The number of categories. (default: %(default)s)") + parser.add_argument( + "--global_seed", + type=int, + default=1234, + help="Seed for reproduction. (default: %(default)s)") + parser.add_argument( + "--max_input_length", + type=int, + default=64, + help="The maximum length of the sentence. (default: %(default)s)") + parser.add_argument( + "--num_epochs", + type=int, + default=10, + help="The number of training epochs. (default: %(default)s)") + parser.add_argument( + "--child_num_layers", + type=int, + default=24, + help="The layer number of the architecture. (default: %(default)s)") + parser.add_argument( + "--child_out_filters", + type=int, + default=256, + help="The dimension of hidden states. (default: %(default)s)") + parser.add_argument( + "--child_out_filters_scale", + type=int, + default=1, + help="The scale of hidden state dimension. (default: %(default)s)") + parser.add_argument( + "--child_lr_T_0", + type=int, + default=10, + help="The length of one cycle. (default: %(default)s)") + parser.add_argument( + "--child_lr_T_mul", + type=int, + default=2, + help="The multiplication factor per cycle. (default: %(default)s)") + parser.add_argument( + "--min_count", + type=int, + default=1, + help="The threshold to cut off low frequent words. (default: %(default)s)") + parser.add_argument( + "--num_last_layer_output", + type=int, + default=0, + help="The last n layers as output, 0 for all. (default: %(default)s)") + parser.add_argument( + "--train_ratio", + type=float, + default=1.0, + help="The sample ratio for the training set. (default: %(default)s)") + parser.add_argument( + "--valid_ratio", + type=float, + default=1.0, + help="The sample ratio for the dev set. (default: %(default)s)") + parser.add_argument( + "--child_grad_bound", + type=float, + default=5.0, + help="The threshold for gradient clipping. (default: %(default)s)") + parser.add_argument( + "--child_lr", + type=float, + default=0.02, + help="The initial learning rate. (default: %(default)s)") + parser.add_argument( + "--cnn_keep_prob", + type=float, + default=0.8, + help="Keep prob for cnn layer. (default: %(default)s)") + parser.add_argument( + "--final_output_keep_prob", + type=float, + default=1.0, + help="Keep prob for the last output layer. (default: %(default)s)") + parser.add_argument( + "--lstm_out_keep_prob", + type=float, + default=0.8, + help="Keep prob for the RNN layer. (default: %(default)s)") + parser.add_argument( + "--embed_keep_prob", + type=float, + default=0.8, + help="Keep prob for the embedding layer. (default: %(default)s)") + parser.add_argument( + "--attention_keep_prob", + type=float, + default=0.8, + help="Keep prob for the self-attention layer. (default: %(default)s)") + parser.add_argument( + "--child_l2_reg", + type=float, + default=3e-6, + help="Weight decay factor. (default: %(default)s)") + parser.add_argument( + "--child_lr_max", + type=float, + default=0.002, + help="The max learning rate. (default: %(default)s)") + parser.add_argument( + "--child_lr_min", + type=float, + default=0.001, + help="The min learning rate. (default: %(default)s)") + parser.add_argument( + "--child_optim_algo", + type=str, + default="adam", + help="Optimization algorithm. (default: %(default)s)") + parser.add_argument( + "--checkpoint_dir", + type=str, + default="best_checkpoint", + help="Path for saved checkpoints. 
(default: %(default)s)") + parser.add_argument( + "--output_type", + type=str, + default="avg_pool", + help="Opertor type for the time steps reduction. (default: %(default)s)") + parser.add_argument( + "--multi_path", + type=distutils.util.strtobool, + default=False, + help="Search for multiple path in the architecture. (default: %(default)s)") + parser.add_argument( + "--is_binary", + type=distutils.util.strtobool, + default=False, + help="Binary label for sst dataset. (default: %(default)s)") + parser.add_argument( + "--all_layer_output", + type=distutils.util.strtobool, + default=True, + help="Use all layers as output. (default: %(default)s)") + parser.add_argument( + "--output_linear_combine", + type=distutils.util.strtobool, + default=True, + help="Combine all the layers in linear way. (default: %(default)s)") + parser.add_argument( + "--is_mask", + type=distutils.util.strtobool, + default=True, + help="Apply mask. (default: %(default)s)") + parser.add_argument( + "--fixed_seed", + type=distutils.util.strtobool, + default=True, + help="Fix the seed. (default: %(default)s)") + parser.add_argument( + "--load_checkpoint", + type=distutils.util.strtobool, + default=False, + help="Wether to load checkpoint. (default: %(default)s)") + parser.add_argument( + "--log_every", + type=int, + default=50, + help="How many steps to log. (default: %(default)s)") + parser.add_argument( + "--eval_every_epochs", + type=int, + default=1, + help="How many epochs to eval. (default: %(default)s)") + + global FLAGS + + FLAGS = parser.parse_args() + + +def set_random_seed(seed): + print("-" * 80) + print("set random seed for data reading: {}".format(seed)) + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + + +def get_model(embedding, num_layers): + print("num layers: {0}".format(num_layers)) + assert FLAGS.child_fixed_arc is not None, "Architecture should be provided." 
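+    # The architecture arrives as one flat string of space-separated
+    # integers -- the same string the run scripts accumulate in $fixed_arc.
+    # A sketch of the layout, as read from MacroChild: with multi_path on,
+    # layer i owns (i + 2) integers: an input choice, an operator id, and
+    # i skip-connection bits. Input choice k picks the (k + 1)-th most
+    # recent layer (the stem convolution for layer 0).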
+ + child_model = MacroChild( + embedding=embedding, + fixed_arc=FLAGS.child_fixed_arc, + out_filters_scale=FLAGS.child_out_filters_scale, + num_layers=num_layers, + out_filters=FLAGS.child_out_filters, + cnn_keep_prob=FLAGS.cnn_keep_prob, + final_output_keep_prob=FLAGS.final_output_keep_prob, + lstm_out_keep_prob=FLAGS.lstm_out_keep_prob, + embed_keep_prob=FLAGS.embed_keep_prob, + attention_keep_prob=FLAGS.attention_keep_prob, + multi_path=FLAGS.multi_path, + embedding_model=FLAGS.embedding_model, + all_layer_output=FLAGS.all_layer_output, + output_linear_combine=FLAGS.output_linear_combine, + num_last_layer_output=FLAGS.num_last_layer_output, + is_mask=FLAGS.is_mask, + output_type=FLAGS.output_type, + class_num=FLAGS.class_num) + + return child_model + + +def print_arc(arc, num_layers): + start = 0 + for i in range(0, num_layers): + end = start + i + 1 + if FLAGS.multi_path: + end += 1 + out_str = "fixed_arc=\"$fixed_arc {0}\"".format(np.reshape(arc[start: end], [-1])) + out_str = out_str.replace("[", "").replace("]", "") + print(out_str) + + start = end + + +def eval_once(child_model, eval_set, criterion, valid_dataloader=None, test_dataloader=None): + if eval_set == "test": + assert test_dataloader is not None + dataloader = test_dataloader + elif eval_set == "valid": + assert valid_dataloader is not None + dataloader = valid_dataloader + else: + raise NotImplementedError("Unknown eval_set '{}'".format(eval_set)) + + tot_acc = 0 + tot = 0 + losses = [] + + with torch.no_grad(): # save memory + for batch in dataloader: + (sent_ids, mask), labels = batch + + sent_ids = sent_ids.cuda() + mask = mask.cuda() + labels = labels.cuda() + + logits = child_model(sent_ids, mask) # run + + loss = criterion(logits, labels.long()) + loss = loss.mean() + preds = logits.argmax(dim=1).long() + acc = torch.eq(preds, labels.long()).long().sum().item() + + losses.append(loss) + tot_acc += acc + tot += len(labels) + + losses = torch.tensor(losses) + loss = losses.mean() + if tot > 0: + final_acc = float(tot_acc) / tot + else: + final_acc = 0 + print("Error in calculating final_acc") + return final_acc, loss + + +def print_user_flags(FLAGS, line_limit=80): + print("-" * 80) + + log_strings = "" + for flag_name in sorted(vars(FLAGS)): + value = "{}".format(getattr(FLAGS, flag_name)) + log_string = flag_name + log_string += "." 
* (line_limit - len(flag_name) - len(value)) + log_string += value + log_strings = log_strings + log_string + log_strings = log_strings + "\n" + print(log_strings) + + +def count_model_params(trainable_params): + num_vars = 0 + for var in trainable_params: + num_vars += np.prod([dim for dim in var.size()]) + return num_vars + + +def update_lr( + optimizer, + epoch, + l2_reg=1e-4, + lr_warmup_val=None, + lr_init=0.1, + lr_decay_scheme="cosine", + lr_max=0.002, + lr_min=0.000000001, + lr_T_0=4, + lr_T_mul=1, + sync_replicas=False, + num_aggregate=None, + num_replicas=None): + if lr_decay_scheme == "cosine": + assert lr_max is not None, "Need lr_max to use lr_cosine" + assert lr_min is not None, "Need lr_min to use lr_cosine" + assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine" + assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine" + + T_i = lr_T_0 + t_epoch = epoch + last_reset = 0 + while True: + t_epoch -= T_i + if t_epoch < 0: + break + last_reset += T_i + T_i *= lr_T_mul + + T_curr = epoch - last_reset + + def _update(): + rate = T_curr / T_i * 3.1415926 + lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + math.cos(rate)) + return lr + + learning_rate = _update() + else: + raise ValueError("Unknown learning rate decay scheme {}".format(lr_decay_scheme)) + + #update lr in optimizer + for params_group in optimizer.param_groups: + params_group['lr'] = learning_rate + return learning_rate + +def train(data_path, output_dir, num_layers): + print("Build dataloader") + train_dataset, valid_dataset, test_dataset, embedding = \ + read_data_sst(data_path, + FLAGS.max_input_length, + FLAGS.min_count, + train_ratio=FLAGS.train_ratio, + valid_ratio=FLAGS.valid_ratio, + is_binary=FLAGS.is_binary) + train_dataloader = DataLoader(train_dataset, batch_size=FLAGS.batch_size, shuffle=True, pin_memory=True) + test_dataloader = DataLoader(test_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True) + valid_dataloader = DataLoader(valid_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True) + + print("Build model") + print("-" * 80) + child_model = get_model(embedding, num_layers) + print("Finish build model") + + for name, var in child_model.named_parameters(): + print(name, var.size(), var.requires_grad) # output all params + + num_vars = count_model_params(child_model.parameters()) + print("Model has {} params".format(num_vars)) + + for m in child_model.modules(): # initializer + if isinstance(m, (nn.Conv1d, nn.Linear)): + nn.init.xavier_uniform_(m.weight) + + criterion = nn.CrossEntropyLoss() + + # get optimizer + if FLAGS.child_optim_algo == "adam": + optimizer = optim.Adam(child_model.parameters(), eps=1e-3, weight_decay=FLAGS.child_l2_reg) # with L2 + else: + raise ValueError("Unknown optim_algo {}".format(optim_algo)) + + child_model.cuda() + criterion.cuda() + + fixed_arc = np.array([int(x) for x in FLAGS.child_fixed_arc.split(" ") if x]) + print_arc(fixed_arc, num_layers) + + print("Start training") + print("-" * 80) + start_time = time.time() + step = 0 + + # save path + model_save_path = os.path.join(FLAGS.output_dir, "model.pth") + best_model_save_path = os.path.join(FLAGS.output_dir, "best_model.pth") + best_acc = 0 + start_epoch = 0 + if FLAGS.load_checkpoint: + if os.path.isfile(model_save_path): + checkpoint = torch.load(model_save_path, map_location = torch.device('cpu')) + step = checkpoint['step'] + start_epoch = checkpoint['epoch'] + child_model.load_state_dict(checkpoint['child_model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + + 
for epoch in range(start_epoch, FLAGS.num_epochs): + lr = update_lr(optimizer, + epoch, + l2_reg=FLAGS.child_l2_reg, + lr_warmup_val=None, + lr_init=FLAGS.child_lr, + lr_decay_scheme=FLAGS.child_lr_decay_scheme, + lr_max=FLAGS.child_lr_max, + lr_min=FLAGS.child_lr_min, + lr_T_0=FLAGS.child_lr_T_0, + lr_T_mul=FLAGS.child_lr_T_mul) + child_model.train() + for batch in train_dataloader: + (sent_ids, mask), labels = batch + + sent_ids = sent_ids.cuda() + mask = mask.cuda() + labels = labels.cuda() + + step += 1 + + logits = child_model(sent_ids, mask) # run + + loss = criterion(logits, labels.long()) + loss = loss.mean() + preds = logits.argmax(dim=1).long() + acc = torch.eq(preds, labels.long()).long().sum().item() + + optimizer.zero_grad() + loss.backward() + grad_norm = 0 + trainable_params = child_model.parameters() + + assert FLAGS.child_grad_bound is not None, "Need grad_bound to clip gradients." + # compute the gradient norm value + grad_norm = nn.utils.clip_grad_norm_(trainable_params, 99999999) + for param in trainable_params: + nn.utils.clip_grad_norm_(param, grad_bound) #clip grad + + optimizer.step() + + if step % FLAGS.log_every == 0: + curr_time = time.time() + log_string = "" + log_string += "epoch={:<6d}".format(epoch) + log_string += "ch_step={:<6d}".format(step) + log_string += " loss={:<8.6f}".format(loss) + log_string += " lr={:<8.4f}".format(lr) + log_string += " |g|={:<8.4f}".format(grad_norm) + log_string += " tr_acc={:<3d}/{:>3d}".format(acc, logits.size()[0]) + log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60) + print(log_string) + epoch += 1 + save_state = { + 'step' : step, + 'epoch' : epoch, + 'child_model_state_dict' : child_model.state_dict(), + 'optimizer_state_dict' : optimizer.state_dict()} + torch.save(save_state, model_save_path) + child_model.eval() + print("Epoch {}: Eval".format(epoch)) + eval_acc, eval_loss = eval_once(child_model, "test", criterion, test_dataloader=test_dataloader) + print("ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss)) + if eval_acc > best_acc: + best_acc = eval_acc + print("Save best model") + save_state = { + 'step' : step, + 'epoch' : epoch, + 'child_model_state_dict' : child_model.state_dict(), + 'optimizer_state_dict' : optimizer.state_dict()} + torch.save(save_state, best_model_save_path) + + return eval_acc + + +def main(): + parse_args() + print("-" * 80) + if not os.path.isdir(FLAGS.output_dir): + print("Path {} does not exist. Creating.".format(FLAGS.output_dir)) + os.makedirs(FLAGS.output_dir) + elif FLAGS.reset_output_dir: + print("Path {} exists. Remove and remake.".format(FLAGS.output_dir)) + shutil.rmtree(FLAGS.output_dir, ignore_errors=True) + os.makedirs(FLAGS.output_dir) + print("-" * 80) + log_file = os.path.join(FLAGS.output_dir, "stdout") + print("Logging to {}".format(log_file)) + + print_user_flags(FLAGS) + + if FLAGS.fixed_seed: + set_random_seed(FLAGS.global_seed) + + train(FLAGS.data_path, FLAGS.output_dir, FLAGS.child_num_layers) + + +if __name__ == "__main__": + main() diff --git a/examples/nas/textnas/macro_child.py b/examples/nas/textnas/macro_child.py new file mode 100644 index 0000000000..a74f24f5bc --- /dev/null +++ b/examples/nas/textnas/macro_child.py @@ -0,0 +1,223 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
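+
+# Operator ids referenced by the flat architecture encoding (see
+# make_fixed_layer below): 0-3 select ConvBN with kernel size 1/3/5/7,
+# 4 average pooling, 5 max pooling, 6 the RNN op from ops.py, and
+# 7 self-attention. Per layer the encoding stores an input choice (only
+# when multi_path is on), the operator id, and one skip bit per earlier
+# layer; a skip bit of 1 adds that layer as a residual input, and any
+# layer that sums more than one input is followed by an extra BatchNorm
+# (see run_fixed_layer).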
+ +import os +import sys + +import numpy as np +import torch +from torch import nn +import torch.nn.functional as Func + +from ops import * +from utils import GlobalAvgPool, GlobalMaxPool + +class MacroChild(nn.Module): + def __init__(self, + embedding, + fixed_arc=None, + out_filters_scale=1, + num_layers=2, + out_filters=24, + cnn_keep_prob=1.0, + final_output_keep_prob=1.0, + lstm_out_keep_prob=1.0, + embed_keep_prob=1.0, + attention_keep_prob=1.0, + multi_path=False, + embedding_model="none", + all_layer_output=False, + output_linear_combine=False, + num_last_layer_output=0, + is_mask=False, + output_type="avg_pool", + class_num=5, + *args, + **kwargs): + super(MacroChild, self).__init__() + + self.fixed_arc = fixed_arc + self.all_layer_output = all_layer_output + self.output_linear_combine = output_linear_combine + self.num_last_layer_output = max(num_last_layer_output, 0) + self.is_mask = is_mask + self.output_type = output_type + self.multi_path = multi_path + self.embedding_model = embedding_model + self.out_filters = out_filters * out_filters_scale + self.num_layers = num_layers + self.class_num = class_num + self.cnn_keep_prob = cnn_keep_prob + self.final_output_keep_prob = final_output_keep_prob + self.lstm_out_keep_prob = lstm_out_keep_prob + self.embed_keep_prob = embed_keep_prob + self.attention_keep_prob = attention_keep_prob + + fixed_arc = np.array([int(x) for x in self.fixed_arc.split(" ") if x]) + self.sample_arc = fixed_arc + + layers = [] + + out_filters = self.out_filters + if self.embedding_model == "glove": + self.embedding = nn.Parameter(embedding) + else: + raise NotImplementedError("Unknown embedding_model '{}'".format(embedding_model)) + + self.init_conv = ConvBN(1, self.embedding.size()[1], out_filters, cnn_keep_prob, False, True) + + for layer_id in range(self.num_layers): + layers.append(self.make_fixed_layer(layer_id, out_filters)) + self.layers = nn.ModuleList(layers) + + if self.all_layer_output and self.output_linear_combine: # use linear_combine + self._linear_combine = LinearCombine(self.num_layers) + self.linear_out = nn.Linear(out_filters, self.class_num) + + self.embed_dropout= nn.Dropout(p=(1 - embed_keep_prob)) + self.output_dropout= nn.Dropout(p=(1 - final_output_keep_prob)) + + if self.output_type == "avg_pool": + self.output_pool = GlobalAvgPool() + elif self.output_type == "max_pool": + self.output_pool = GlobalMaxPool() + else: + raise ValueError("Unsupported output type.") + + def forward(self, sent_ids, mask): + seq = Func.embedding(sent_ids.long(), self.embedding) + seq = self.embed_dropout(seq) + + seq = torch.transpose(seq, 1, 2) # from (N, L, C) -> (N, C, L) + + x = self.init_conv(seq, mask) + + start_idx = 0 + prev_layers = [] + final_flags = [] + + for layer_id in range(self.num_layers): # run layers + layer = self.layers[layer_id] + x = self.run_fixed_layer(x, mask, prev_layers, layer, layer_id, start_idx, + final_flags=final_flags) # run needed branches + prev_layers.append(x) + final_flags.append(1) + + start_idx += 1 + layer_id + if self.multi_path: + start_idx += 1 + + final_layers = [] + final_layers_idx = [] + for i in range(0, len(prev_layers)): + if self.all_layer_output: + if self.num_last_layer_output == 0: + final_layers.append(prev_layers[i]) + final_layers_idx.append(i) + elif i >= max((len(prev_layers) - self.num_last_layer_output), 0): + final_layers.append(prev_layers[i]) + final_layers_idx.append(i) + else: + final_layers.append(final_flags[i] * prev_layers[i]) + + if self.all_layer_output and 
self.output_linear_combine: # all layer ooutput and use linear_combine + x = self._linear_combine(torch.stack(final_layers)) + else: + x = sum(final_layers) + if not self.all_layer_output: + x /= sum(final_flags) + else: + x /= len(final_layers) + + x = self.output_pool(x, mask) + x = self.output_dropout(x) + x = self.linear_out(x) + return x + + def make_fixed_layer(self, layer_id, out_filters): + size = [1, 3, 5, 7] + separables = [False, False, False, False] + + branches = [] + + if self.multi_path: + branch_id = (layer_id + 1) * (layer_id + 2) // 2 + else: + branch_id = (layer_id) * (layer_id + 1) // 2 + + bn_flag = False + for i in range(layer_id): + if self.sample_arc[branch_id + 1 + i] == 1: + bn_flag = True + branch_id = self.sample_arc[branch_id] + + for operation_id in [0, 1, 2, 3]: # conv_opt + if branch_id == operation_id: + filter_size = size[operation_id] + separable = separables[operation_id] + op = ConvBN(filter_size, out_filters, out_filters, self.cnn_keep_prob, False, True) + branches.append(op) + if branch_id == 4: + branches.append(AvgPool(3, False, True)) + elif branch_id == 5: + branches.append(MaxPool(3, False, True)) + elif branch_id == 6: + branches.append(RNN(out_filters, self.lstm_out_keep_prob)) + elif branch_id == 7: + branches.append(Attention(out_filters, 4, self.attention_keep_prob, self.is_mask)) + + branches = nn.ModuleList(branches) + bn = None + if bn_flag: + bn = BatchNorm(self.out_filters, False, True) + + return nn.ModuleList([branches, bn]) + + def run_fixed_layer(self, x, mask, prev_layers, layers, layer_id, start_idx, final_flags): + layer = layers[0] + bn = layers[1] + + if len(prev_layers) > 0: + if self.multi_path: + pre_layer_id = self.sample_arc[start_idx] + num_pre_layers = len(prev_layers) + if num_pre_layers > 5: + num_pre_layers = 5 + if pre_layer_id >= num_pre_layers: + final_flags[-1] = 0 + inputs = prev_layers[-1] + else: + layer_idx = len(prev_layers) - 1 - pre_layer_id + final_flags[layer_idx] = 0 + inputs = prev_layers[layer_idx] + else: + inputs = prev_layers[-1] + final_flags[-1] = 0 + else: + inputs = x + + if self.multi_path: + start_idx += 1 + + branches = [] + # run branch op + branch_id = 0 + branches.append(layer[branch_id](inputs, mask)) + + if layer_id == 0: + out = sum(branches) + else: + skip_start = start_idx + 1 + skip = self.sample_arc[skip_start:skip_start + layer_id] + + res_layers = [] + for i in range(layer_id): + if skip[i] == 1: + res_layers.append(prev_layers[i]) + final_flags[i] = 0 + prev = branches + res_layers + out = sum(prev) # tensor sum + if len(prev) > 1: + out = bn(out, mask) + + return out diff --git a/examples/nas/textnas/run_sst2.sh b/examples/nas/textnas/run_sst2.sh new file mode 100755 index 0000000000..6918f7a099 --- /dev/null +++ b/examples/nas/textnas/run_sst2.sh @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
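+#
+# Each fixed_arc line below encodes one of the 24 layers, following the
+# scheme in macro_child.py: an input choice, then an operator id (0-3 conv
+# with kernel 1/3/5/7, 4 avg pool, 5 max pool, 6 RNN, 7 attention), then
+# one skip bit per earlier layer. For instance, "3 5 0 1 0 1 1 0 1 1"
+# makes layer 8 max-pool (op 5) the output of layer 4 (input choice 3 =
+# the 4th most recent layer), with residual inputs where a bit is 1.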
+ +export PYTHONPATH="$(pwd)" +export CUDA_VISIBLE_DEVICES=0 + +fixed_arc="$fixed_arc 0 5" +fixed_arc="$fixed_arc 1 7 0" +fixed_arc="$fixed_arc 1 3 0 0" +fixed_arc="$fixed_arc 3 6 0 1 1" +fixed_arc="$fixed_arc 1 1 0 1 0 0" +fixed_arc="$fixed_arc 0 1 0 0 1 0 1" +fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" +fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" +fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" +fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" +fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" +fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" +fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" +fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" +fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" +fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" +fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" +fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" +fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" +fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" +fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" +fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" + +python eval_arc.py \ + --train_ratio=1.0 \ + --valid_ratio=1.0 \ + --min_count=1 \ + --is_mask=True \ + --is_binary=True \ + --embedding_model="glove" \ + --child_lr_decay_scheme="cosine" \ + --data_path="data" \ + --class_num=2 \ + --child_optim_algo="adam" \ + --output_dir="output_sst2" \ + --global_seed=1234 \ + --max_input_length=64 \ + --batch_size=128 \ + --eval_batch_size=128 \ + --num_epochs=10 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_num_layers=24 \ + --child_out_filters=256 \ + --child_l2_reg=1e-6 \ + --cnn_keep_prob=0.8 \ + --final_output_keep_prob=1.0 \ + --embed_keep_prob=0.8 \ + --lstm_out_keep_prob=0.8 \ + --attention_keep_prob=0.8 \ + --child_lr=0.02 \ + --child_lr_max=0.002 \ + --child_lr_min=5e-6 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --multi_path=True \ + --child_fixed_arc="${fixed_arc}" \ + --fixed_seed=True \ + --all_layer_output=True \ + --output_linear_combine=True \ + "$@" diff --git a/examples/nas/textnas/run_sst5.sh b/examples/nas/textnas/run_sst5.sh new file mode 100755 index 0000000000..b26df14b9b --- /dev/null +++ b/examples/nas/textnas/run_sst5.sh @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
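+#
+# SST-5 is the five-way fine-grained task, hence --class_num=5 below.
+# Judging from the flag's help text in eval_arc.py, --is_binary=True
+# selects the binarized SST labels, so it presumably ought to be False
+# for the five-class setup (the train_sst5.sh script is changed exactly
+# this way later in the series).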
+ +export PYTHONPATH="$(pwd)" +export CUDA_VISIBLE_DEVICES=0 + +fixed_arc="$fixed_arc 0 5" +fixed_arc="$fixed_arc 1 7 0" +fixed_arc="$fixed_arc 1 3 0 0" +fixed_arc="$fixed_arc 3 6 0 1 1" +fixed_arc="$fixed_arc 1 1 0 1 0 0" +fixed_arc="$fixed_arc 0 1 0 0 1 0 1" +fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" +fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" +fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" +fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" +fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" +fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" +fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" +fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" +fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" +fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" +fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" +fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" +fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" +fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" +fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" +fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" + +python eval_arc.py \ + --train_ratio=1.0 \ + --valid_ratio=1.0 \ + --min_count=1 \ + --is_mask=True \ + --is_binary=True \ + --embedding_model="glove" \ + --child_lr_decay_scheme="cosine" \ + --data_path="data" \ + --class_num=5 \ + --child_optim_algo="adam" \ + --output_dir="output_sst5" \ + --global_seed=1234 \ + --max_input_length=64 \ + --batch_size=256 \ + --eval_batch_size=128 \ + --num_epochs=10 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_num_layers=24 \ + --child_out_filters=256 \ + --child_l2_reg=1e-6 \ + --cnn_keep_prob=0.8 \ + --final_output_keep_prob=1.0 \ + --embed_keep_prob=0.8 \ + --lstm_out_keep_prob=0.8 \ + --attention_keep_prob=0.8 \ + --child_lr=0.02 \ + --child_lr_max=0.002 \ + --child_lr_min=0.0002 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --multi_path=True \ + --child_fixed_arc="${fixed_arc}" \ + --fixed_seed=True \ + --all_layer_output=True \ + --output_linear_combine=True \ + "$@" diff --git a/examples/nas/textnas/train_sst2.sh b/examples/nas/textnas/train_sst2.sh new file mode 100755 index 0000000000..6918f7a099 --- /dev/null +++ b/examples/nas/textnas/train_sst2.sh @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
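+#
+# The trailing "$@" forwards any extra command-line arguments to
+# eval_arc.py, so individual flags can be overridden without editing the
+# script (argparse keeps the last occurrence of a repeated flag). A
+# hypothetical invocation, values purely illustrative:
+#
+#   ./train_sst2.sh --num_epochs=20 --output_dir=output_sst2_long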
+ +export PYTHONPATH="$(pwd)" +export CUDA_VISIBLE_DEVICES=0 + +fixed_arc="$fixed_arc 0 5" +fixed_arc="$fixed_arc 1 7 0" +fixed_arc="$fixed_arc 1 3 0 0" +fixed_arc="$fixed_arc 3 6 0 1 1" +fixed_arc="$fixed_arc 1 1 0 1 0 0" +fixed_arc="$fixed_arc 0 1 0 0 1 0 1" +fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" +fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" +fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" +fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" +fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" +fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" +fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" +fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" +fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" +fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" +fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" +fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" +fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" +fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" +fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" +fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" + +python eval_arc.py \ + --train_ratio=1.0 \ + --valid_ratio=1.0 \ + --min_count=1 \ + --is_mask=True \ + --is_binary=True \ + --embedding_model="glove" \ + --child_lr_decay_scheme="cosine" \ + --data_path="data" \ + --class_num=2 \ + --child_optim_algo="adam" \ + --output_dir="output_sst2" \ + --global_seed=1234 \ + --max_input_length=64 \ + --batch_size=128 \ + --eval_batch_size=128 \ + --num_epochs=10 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_num_layers=24 \ + --child_out_filters=256 \ + --child_l2_reg=1e-6 \ + --cnn_keep_prob=0.8 \ + --final_output_keep_prob=1.0 \ + --embed_keep_prob=0.8 \ + --lstm_out_keep_prob=0.8 \ + --attention_keep_prob=0.8 \ + --child_lr=0.02 \ + --child_lr_max=0.002 \ + --child_lr_min=5e-6 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --multi_path=True \ + --child_fixed_arc="${fixed_arc}" \ + --fixed_seed=True \ + --all_layer_output=True \ + --output_linear_combine=True \ + "$@" diff --git a/examples/nas/textnas/train_sst5.sh b/examples/nas/textnas/train_sst5.sh new file mode 100755 index 0000000000..b26df14b9b --- /dev/null +++ b/examples/nas/textnas/train_sst5.sh @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +export PYTHONPATH="$(pwd)" +export CUDA_VISIBLE_DEVICES=0 + +fixed_arc="$fixed_arc 0 5" +fixed_arc="$fixed_arc 1 7 0" +fixed_arc="$fixed_arc 1 3 0 0" +fixed_arc="$fixed_arc 3 6 0 1 1" +fixed_arc="$fixed_arc 1 1 0 1 0 0" +fixed_arc="$fixed_arc 0 1 0 0 1 0 1" +fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" +fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" +fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" +fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" +fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" +fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" +fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" +fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" +fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" +fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" +fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" +fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" +fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" +fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" +fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" +fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" +fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" + +python eval_arc.py \ + --train_ratio=1.0 \ + --valid_ratio=1.0 \ + --min_count=1 \ + --is_mask=True \ + --is_binary=True \ + --embedding_model="glove" \ + --child_lr_decay_scheme="cosine" \ + --data_path="data" \ + --class_num=5 \ + --child_optim_algo="adam" \ + --output_dir="output_sst5" \ + --global_seed=1234 \ + --max_input_length=64 \ + --batch_size=256 \ + --eval_batch_size=128 \ + --num_epochs=10 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_num_layers=24 \ + --child_out_filters=256 \ + --child_l2_reg=1e-6 \ + --cnn_keep_prob=0.8 \ + --final_output_keep_prob=1.0 \ + --embed_keep_prob=0.8 \ + --lstm_out_keep_prob=0.8 \ + --attention_keep_prob=0.8 \ + --child_lr=0.02 \ + --child_lr_max=0.002 \ + --child_lr_min=0.0002 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --multi_path=True \ + --child_fixed_arc="${fixed_arc}" \ + --fixed_seed=True \ + --all_layer_output=True \ + --output_linear_combine=True \ + "$@" From 931ca9254b3237fbf47288f32e71dfca66ef10ac Mon Sep 17 00:00:00 2001 From: Yaming Yang Date: Sun, 22 Mar 2020 12:13:05 +0000 Subject: [PATCH 02/12] Remove duplicated run_sst2.sh and run_sst5.sh. --- examples/nas/textnas/run_sst2.sh | 69 -------------------------------- examples/nas/textnas/run_sst5.sh | 69 -------------------------------- 2 files changed, 138 deletions(-) delete mode 100755 examples/nas/textnas/run_sst2.sh delete mode 100755 examples/nas/textnas/run_sst5.sh diff --git a/examples/nas/textnas/run_sst2.sh b/examples/nas/textnas/run_sst2.sh deleted file mode 100755 index 6918f7a099..0000000000 --- a/examples/nas/textnas/run_sst2.sh +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -export PYTHONPATH="$(pwd)" -export CUDA_VISIBLE_DEVICES=0 - -fixed_arc="$fixed_arc 0 5" -fixed_arc="$fixed_arc 1 7 0" -fixed_arc="$fixed_arc 1 3 0 0" -fixed_arc="$fixed_arc 3 6 0 1 1" -fixed_arc="$fixed_arc 1 1 0 1 0 0" -fixed_arc="$fixed_arc 0 1 0 0 1 0 1" -fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" -fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" -fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" -fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" -fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" -fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" -fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" -fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" -fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" -fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" -fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" -fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" -fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" -fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" -fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" -fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" -fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" -fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" - -python eval_arc.py \ - --train_ratio=1.0 \ - --valid_ratio=1.0 \ - --min_count=1 \ - --is_mask=True \ - --is_binary=True \ - --embedding_model="glove" \ - --child_lr_decay_scheme="cosine" \ - --data_path="data" \ - --class_num=2 \ - --child_optim_algo="adam" \ - --output_dir="output_sst2" \ - --global_seed=1234 \ - --max_input_length=64 \ - --batch_size=128 \ - --eval_batch_size=128 \ - --num_epochs=10 \ - --log_every=50 \ - --eval_every_epochs=1 \ - --child_num_layers=24 \ - --child_out_filters=256 \ - --child_l2_reg=1e-6 \ - --cnn_keep_prob=0.8 \ - --final_output_keep_prob=1.0 \ - --embed_keep_prob=0.8 \ - --lstm_out_keep_prob=0.8 \ - --attention_keep_prob=0.8 \ - --child_lr=0.02 \ - --child_lr_max=0.002 \ - --child_lr_min=5e-6 \ - --child_lr_T_0=10 \ - --child_lr_T_mul=2 \ - --multi_path=True \ - --child_fixed_arc="${fixed_arc}" \ - --fixed_seed=True \ - --all_layer_output=True \ - --output_linear_combine=True \ - "$@" diff --git a/examples/nas/textnas/run_sst5.sh b/examples/nas/textnas/run_sst5.sh deleted file mode 100755 index b26df14b9b..0000000000 --- a/examples/nas/textnas/run_sst5.sh +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -export PYTHONPATH="$(pwd)" -export CUDA_VISIBLE_DEVICES=0 - -fixed_arc="$fixed_arc 0 5" -fixed_arc="$fixed_arc 1 7 0" -fixed_arc="$fixed_arc 1 3 0 0" -fixed_arc="$fixed_arc 3 6 0 1 1" -fixed_arc="$fixed_arc 1 1 0 1 0 0" -fixed_arc="$fixed_arc 0 1 0 0 1 0 1" -fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1" -fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0" -fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1" -fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1" -fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0" -fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0" -fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0" -fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1" -fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0" -fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1" -fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0" -fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0" -fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1" -fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0" -fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0" -fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0" -fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1" -fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0" - -python eval_arc.py \ - --train_ratio=1.0 \ - --valid_ratio=1.0 \ - --min_count=1 \ - --is_mask=True \ - --is_binary=True \ - --embedding_model="glove" \ - --child_lr_decay_scheme="cosine" \ - --data_path="data" \ - --class_num=5 \ - --child_optim_algo="adam" \ - --output_dir="output_sst5" \ - --global_seed=1234 \ - --max_input_length=64 \ - --batch_size=256 \ - --eval_batch_size=128 \ - --num_epochs=10 \ - --log_every=50 \ - --eval_every_epochs=1 \ - --child_num_layers=24 \ - --child_out_filters=256 \ - --child_l2_reg=1e-6 \ - --cnn_keep_prob=0.8 \ - --final_output_keep_prob=1.0 \ - --embed_keep_prob=0.8 \ - --lstm_out_keep_prob=0.8 \ - --attention_keep_prob=0.8 \ - --child_lr=0.02 \ - --child_lr_max=0.002 \ - --child_lr_min=0.0002 \ - --child_lr_T_0=10 \ - --child_lr_T_mul=2 \ - --multi_path=True \ - --child_fixed_arc="${fixed_arc}" \ - --fixed_seed=True \ - --all_layer_output=True \ - --output_linear_combine=True \ - "$@" From 5160ac0c9e90f43f2895460331866c92b81d4d32 Mon Sep 17 00:00:00 2001 From: Yaming Yang Date: Sun, 22 Mar 2020 12:41:32 +0000 Subject: [PATCH 03/12] Minor fixes. 
---
 examples/nas/textnas/train_sst2.sh | 2 +-
 examples/nas/textnas/train_sst5.sh | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/nas/textnas/train_sst2.sh b/examples/nas/textnas/train_sst2.sh
index 6918f7a099..c3f24a2be7 100755
--- a/examples/nas/textnas/train_sst2.sh
+++ b/examples/nas/textnas/train_sst2.sh
@@ -29,7 +29,7 @@ fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0"
 fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1"
 fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0"
 
-python eval_arc.py \
+python -u eval_arc.py \
     --train_ratio=1.0 \
     --valid_ratio=1.0 \
     --min_count=1 \
diff --git a/examples/nas/textnas/train_sst5.sh b/examples/nas/textnas/train_sst5.sh
index b26df14b9b..5a59476b6b 100755
--- a/examples/nas/textnas/train_sst5.sh
+++ b/examples/nas/textnas/train_sst5.sh
@@ -29,12 +29,12 @@ fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0"
 fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1"
 fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0"
 
-python eval_arc.py \
+python -u eval_arc.py \
     --train_ratio=1.0 \
     --valid_ratio=1.0 \
     --min_count=1 \
     --is_mask=True \
-    --is_binary=True \
+    --is_binary=False \
     --embedding_model="glove" \
     --child_lr_decay_scheme="cosine" \
     --data_path="data" \

From 5137dcd4d2400a8b16a7625483d9ac73e2ec50b0 Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Mon, 23 Mar 2020 06:12:17 +0000
Subject: [PATCH 04/12] Bug fix.

---
 examples/nas/textnas/eval_arc.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/nas/textnas/eval_arc.py b/examples/nas/textnas/eval_arc.py
index 63a79d1672..980151baff 100644
--- a/examples/nas/textnas/eval_arc.py
+++ b/examples/nas/textnas/eval_arc.py
@@ -443,7 +443,7 @@ def train(data_path, output_dir, num_layers):
     if FLAGS.child_optim_algo == "adam":
         optimizer = optim.Adam(child_model.parameters(), eps=1e-3, weight_decay=FLAGS.child_l2_reg) # with L2
     else:
-        raise ValueError("Unknown optim_algo {}".format(optim_algo))
+        raise ValueError("Unknown optim_algo {}".format(FLAGS.child_optim_algo))
 
     child_model.cuda()
     criterion.cuda()
@@ -506,7 +506,7 @@ def train(data_path, output_dir, num_layers):
         # compute the gradient norm value
         grad_norm = nn.utils.clip_grad_norm_(trainable_params, 99999999)
         for param in trainable_params:
-            nn.utils.clip_grad_norm_(param, grad_bound) #clip grad
+            nn.utils.clip_grad_norm_(param, FLAGS.child_grad_bound) # clip grad
 
         optimizer.step()
 
@@ -521,6 +521,7 @@ def train(data_path, output_dir, num_layers):
             log_string += " tr_acc={:<3d}/{:>3d}".format(acc, logits.size()[0])
             log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60)
             print(log_string)
+
     epoch += 1
     save_state = {

From d02bc9773b75cb490d46c45d3cb17675bbb85115 Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Wed, 25 Mar 2020 02:24:58 +0000
Subject: [PATCH 05/12] Adjust the running parameters.

--- examples/nas/textnas/eval_arc.py | 1 + examples/nas/textnas/train_sst2.sh | 6 +++--- examples/nas/textnas/train_sst5.sh | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/nas/textnas/eval_arc.py b/examples/nas/textnas/eval_arc.py index 980151baff..c61edc86f1 100644 --- a/examples/nas/textnas/eval_arc.py +++ b/examples/nas/textnas/eval_arc.py @@ -409,6 +409,7 @@ def _update(): params_group['lr'] = learning_rate return learning_rate + def train(data_path, output_dir, num_layers): print("Build dataloader") train_dataset, valid_dataset, test_dataset, embedding = \ diff --git a/examples/nas/textnas/train_sst2.sh b/examples/nas/textnas/train_sst2.sh index c3f24a2be7..40cc43aa16 100755 --- a/examples/nas/textnas/train_sst2.sh +++ b/examples/nas/textnas/train_sst2.sh @@ -43,14 +43,14 @@ python -u eval_arc.py \ --output_dir="output_sst2" \ --global_seed=1234 \ --max_input_length=64 \ - --batch_size=128 \ + --batch_size=256 \ --eval_batch_size=128 \ --num_epochs=10 \ --log_every=50 \ --eval_every_epochs=1 \ --child_num_layers=24 \ - --child_out_filters=256 \ - --child_l2_reg=1e-6 \ + --child_out_filters=128 \ + --child_l2_reg=2e-5 \ --cnn_keep_prob=0.8 \ --final_output_keep_prob=1.0 \ --embed_keep_prob=0.8 \ diff --git a/examples/nas/textnas/train_sst5.sh b/examples/nas/textnas/train_sst5.sh index 5a59476b6b..79da3518f6 100755 --- a/examples/nas/textnas/train_sst5.sh +++ b/examples/nas/textnas/train_sst5.sh @@ -58,7 +58,7 @@ python -u eval_arc.py \ --attention_keep_prob=0.8 \ --child_lr=0.02 \ --child_lr_max=0.002 \ - --child_lr_min=0.0002 \ + --child_lr_min=1e-6 \ --child_lr_T_0=10 \ --child_lr_T_mul=2 \ --multi_path=True \ From eefe0e05cb4cfb0b610fbe6fc117e998c7ec63f4 Mon Sep 17 00:00:00 2001 From: Yaming Yang Date: Sun, 29 Mar 2020 13:45:25 +0000 Subject: [PATCH 06/12] Adopted to nni interface. --- examples/nas/textnas/macro_child.py | 223 ------------------ .../nas/textnas/{eval_arc.py => retrain.py} | 153 +++++------- examples/nas/textnas/run_retrain.sh | 41 ++++ examples/nas/textnas/train_sst2.sh | 69 ------ examples/nas/textnas/train_sst5.sh | 69 ------ 5 files changed, 99 insertions(+), 456 deletions(-) delete mode 100644 examples/nas/textnas/macro_child.py rename examples/nas/textnas/{eval_arc.py => retrain.py} (80%) create mode 100755 examples/nas/textnas/run_retrain.sh delete mode 100755 examples/nas/textnas/train_sst2.sh delete mode 100755 examples/nas/textnas/train_sst5.sh diff --git a/examples/nas/textnas/macro_child.py b/examples/nas/textnas/macro_child.py deleted file mode 100644 index a74f24f5bc..0000000000 --- a/examples/nas/textnas/macro_child.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import os -import sys - -import numpy as np -import torch -from torch import nn -import torch.nn.functional as Func - -from ops import * -from utils import GlobalAvgPool, GlobalMaxPool - -class MacroChild(nn.Module): - def __init__(self, - embedding, - fixed_arc=None, - out_filters_scale=1, - num_layers=2, - out_filters=24, - cnn_keep_prob=1.0, - final_output_keep_prob=1.0, - lstm_out_keep_prob=1.0, - embed_keep_prob=1.0, - attention_keep_prob=1.0, - multi_path=False, - embedding_model="none", - all_layer_output=False, - output_linear_combine=False, - num_last_layer_output=0, - is_mask=False, - output_type="avg_pool", - class_num=5, - *args, - **kwargs): - super(MacroChild, self).__init__() - - self.fixed_arc = fixed_arc - self.all_layer_output = all_layer_output - self.output_linear_combine = output_linear_combine - self.num_last_layer_output = max(num_last_layer_output, 0) - self.is_mask = is_mask - self.output_type = output_type - self.multi_path = multi_path - self.embedding_model = embedding_model - self.out_filters = out_filters * out_filters_scale - self.num_layers = num_layers - self.class_num = class_num - self.cnn_keep_prob = cnn_keep_prob - self.final_output_keep_prob = final_output_keep_prob - self.lstm_out_keep_prob = lstm_out_keep_prob - self.embed_keep_prob = embed_keep_prob - self.attention_keep_prob = attention_keep_prob - - fixed_arc = np.array([int(x) for x in self.fixed_arc.split(" ") if x]) - self.sample_arc = fixed_arc - - layers = [] - - out_filters = self.out_filters - if self.embedding_model == "glove": - self.embedding = nn.Parameter(embedding) - else: - raise NotImplementedError("Unknown embedding_model '{}'".format(embedding_model)) - - self.init_conv = ConvBN(1, self.embedding.size()[1], out_filters, cnn_keep_prob, False, True) - - for layer_id in range(self.num_layers): - layers.append(self.make_fixed_layer(layer_id, out_filters)) - self.layers = nn.ModuleList(layers) - - if self.all_layer_output and self.output_linear_combine: # use linear_combine - self._linear_combine = LinearCombine(self.num_layers) - self.linear_out = nn.Linear(out_filters, self.class_num) - - self.embed_dropout= nn.Dropout(p=(1 - embed_keep_prob)) - self.output_dropout= nn.Dropout(p=(1 - final_output_keep_prob)) - - if self.output_type == "avg_pool": - self.output_pool = GlobalAvgPool() - elif self.output_type == "max_pool": - self.output_pool = GlobalMaxPool() - else: - raise ValueError("Unsupported output type.") - - def forward(self, sent_ids, mask): - seq = Func.embedding(sent_ids.long(), self.embedding) - seq = self.embed_dropout(seq) - - seq = torch.transpose(seq, 1, 2) # from (N, L, C) -> (N, C, L) - - x = self.init_conv(seq, mask) - - start_idx = 0 - prev_layers = [] - final_flags = [] - - for layer_id in range(self.num_layers): # run layers - layer = self.layers[layer_id] - x = self.run_fixed_layer(x, mask, prev_layers, layer, layer_id, start_idx, - final_flags=final_flags) # run needed branches - prev_layers.append(x) - final_flags.append(1) - - start_idx += 1 + layer_id - if self.multi_path: - start_idx += 1 - - final_layers = [] - final_layers_idx = [] - for i in range(0, len(prev_layers)): - if self.all_layer_output: - if self.num_last_layer_output == 0: - final_layers.append(prev_layers[i]) - final_layers_idx.append(i) - elif i >= max((len(prev_layers) - self.num_last_layer_output), 0): - final_layers.append(prev_layers[i]) - final_layers_idx.append(i) - else: - final_layers.append(final_flags[i] * prev_layers[i]) - - if self.all_layer_output and 
self.output_linear_combine: # all layer ooutput and use linear_combine - x = self._linear_combine(torch.stack(final_layers)) - else: - x = sum(final_layers) - if not self.all_layer_output: - x /= sum(final_flags) - else: - x /= len(final_layers) - - x = self.output_pool(x, mask) - x = self.output_dropout(x) - x = self.linear_out(x) - return x - - def make_fixed_layer(self, layer_id, out_filters): - size = [1, 3, 5, 7] - separables = [False, False, False, False] - - branches = [] - - if self.multi_path: - branch_id = (layer_id + 1) * (layer_id + 2) // 2 - else: - branch_id = (layer_id) * (layer_id + 1) // 2 - - bn_flag = False - for i in range(layer_id): - if self.sample_arc[branch_id + 1 + i] == 1: - bn_flag = True - branch_id = self.sample_arc[branch_id] - - for operation_id in [0, 1, 2, 3]: # conv_opt - if branch_id == operation_id: - filter_size = size[operation_id] - separable = separables[operation_id] - op = ConvBN(filter_size, out_filters, out_filters, self.cnn_keep_prob, False, True) - branches.append(op) - if branch_id == 4: - branches.append(AvgPool(3, False, True)) - elif branch_id == 5: - branches.append(MaxPool(3, False, True)) - elif branch_id == 6: - branches.append(RNN(out_filters, self.lstm_out_keep_prob)) - elif branch_id == 7: - branches.append(Attention(out_filters, 4, self.attention_keep_prob, self.is_mask)) - - branches = nn.ModuleList(branches) - bn = None - if bn_flag: - bn = BatchNorm(self.out_filters, False, True) - - return nn.ModuleList([branches, bn]) - - def run_fixed_layer(self, x, mask, prev_layers, layers, layer_id, start_idx, final_flags): - layer = layers[0] - bn = layers[1] - - if len(prev_layers) > 0: - if self.multi_path: - pre_layer_id = self.sample_arc[start_idx] - num_pre_layers = len(prev_layers) - if num_pre_layers > 5: - num_pre_layers = 5 - if pre_layer_id >= num_pre_layers: - final_flags[-1] = 0 - inputs = prev_layers[-1] - else: - layer_idx = len(prev_layers) - 1 - pre_layer_id - final_flags[layer_idx] = 0 - inputs = prev_layers[layer_idx] - else: - inputs = prev_layers[-1] - final_flags[-1] = 0 - else: - inputs = x - - if self.multi_path: - start_idx += 1 - - branches = [] - # run branch op - branch_id = 0 - branches.append(layer[branch_id](inputs, mask)) - - if layer_id == 0: - out = sum(branches) - else: - skip_start = start_idx + 1 - skip = self.sample_arc[skip_start:skip_start + layer_id] - - res_layers = [] - for i in range(layer_id): - if skip[i] == 1: - res_layers.append(prev_layers[i]) - final_flags[i] = 0 - prev = branches + res_layers - out = sum(prev) # tensor sum - if len(prev) > 1: - out = bn(out, mask) - - return out diff --git a/examples/nas/textnas/eval_arc.py b/examples/nas/textnas/retrain.py similarity index 80% rename from examples/nas/textnas/eval_arc.py rename to examples/nas/textnas/retrain.py index c61edc86f1..ab8f5c661c 100644 --- a/examples/nas/textnas/eval_arc.py +++ b/examples/nas/textnas/retrain.py @@ -1,10 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
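+# A note on the SGDR-style schedule carried over from eval_arc.py
+# (update_lr below): within a restart cycle of length T_i,
+#     lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * T_curr / T_i))
+# and every finished cycle restarts with T_i multiplied by lr_T_mul.
+# Worked example with the defaults T_0=10, T_mul=2: epoch 12 falls in the
+# second cycle (T_i=20, T_curr=2), giving
+#     lr ~= lr_min + 0.976 * (lr_max - lr_min).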
+import sys import os +import logging import pickle import shutil -import sys import random import math @@ -20,10 +21,14 @@ from torch.utils.data import DataLoader import torch.nn.functional as Func -from macro_child import MacroChild +from model import Model +from nni.nas.pytorch.fixed import apply_fixed_architecture from dataloader import read_data_sst +logger = logging.getLogger("nni.textnas") + + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( @@ -31,16 +36,11 @@ def parse_args(): type=distutils.util.strtobool, default=True, help="Whether to clean the output dir if existed. (default: %(default)s)") - parser.add_argument( - "--embedding_model", - type=str, - default="glove", - help="Embedding type. (default: %(default)s)") parser.add_argument( "--child_fixed_arc", type=str, required=True, - help="Architecture description. (default: %(default)s)") + help="Architecture json file. (default: %(default)s)") parser.add_argument( "--data_path", type=str, @@ -116,11 +116,6 @@ def parse_args(): type=int, default=1, help="The threshold to cut off low frequent words. (default: %(default)s)") - parser.add_argument( - "--num_last_layer_output", - type=int, - default=0, - help="The last n layers as output, 0 for all. (default: %(default)s)") parser.add_argument( "--train_ratio", type=float, @@ -194,7 +189,7 @@ def parse_args(): parser.add_argument( "--output_type", type=str, - default="avg_pool", + default="avg", help="Opertor type for the time steps reduction. (default: %(default)s)") parser.add_argument( "--multi_path", @@ -207,15 +202,10 @@ def parse_args(): default=False, help="Binary label for sst dataset. (default: %(default)s)") parser.add_argument( - "--all_layer_output", - type=distutils.util.strtobool, - default=True, - help="Use all layers as output. (default: %(default)s)") - parser.add_argument( - "--output_linear_combine", + "--is_cuda", type=distutils.util.strtobool, default=True, - help="Combine all the layers in linear way. (default: %(default)s)") + help="Specify the device type. (default: %(default)s)") parser.add_argument( "--is_mask", type=distutils.util.strtobool, @@ -248,58 +238,40 @@ def parse_args(): def set_random_seed(seed): - print("-" * 80) - print("set random seed for data reading: {}".format(seed)) + logger.info("set random seed for data reading: {}".format(seed)) random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.backends.cudnn.deterministic = True + if FLAGS.is_cuda: + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True def get_model(embedding, num_layers): - print("num layers: {0}".format(num_layers)) + logger.info("num layers: {0}".format(num_layers)) assert FLAGS.child_fixed_arc is not None, "Architecture should be provided." 
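+    # --child_fixed_arc now points at a JSON file exported by the NNI
+    # search phase (run_retrain.sh uses checkpoints/architecture_00.json);
+    # apply_fixed_architecture() below reads it and pins each mutable
+    # choice declared in Model to the recorded decision. A minimal sketch
+    # of the resulting call contract (note the single-tuple input used
+    # throughout this script):
+    #
+    #   model = get_model(embedding, num_layers)  # architecture already fixed
+    #   logits = model((sent_ids, mask))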
- child_model = MacroChild( + child_model = Model( embedding=embedding, - fixed_arc=FLAGS.child_fixed_arc, - out_filters_scale=FLAGS.child_out_filters_scale, + hidden_units=FLAGS.child_out_filters_scale * FLAGS.child_out_filters, num_layers=num_layers, - out_filters=FLAGS.child_out_filters, + num_classes=FLAGS.class_num, + choose_from_k=5 if FLAGS.multi_path else 1, + lstm_keep_prob=FLAGS.lstm_out_keep_prob, cnn_keep_prob=FLAGS.cnn_keep_prob, - final_output_keep_prob=FLAGS.final_output_keep_prob, - lstm_out_keep_prob=FLAGS.lstm_out_keep_prob, + att_keep_prob=FLAGS.attention_keep_prob, + att_mask=FLAGS.is_mask, embed_keep_prob=FLAGS.embed_keep_prob, - attention_keep_prob=FLAGS.attention_keep_prob, - multi_path=FLAGS.multi_path, - embedding_model=FLAGS.embedding_model, - all_layer_output=FLAGS.all_layer_output, - output_linear_combine=FLAGS.output_linear_combine, - num_last_layer_output=FLAGS.num_last_layer_output, - is_mask=FLAGS.is_mask, - output_type=FLAGS.output_type, - class_num=FLAGS.class_num) + final_output_keep_prob=FLAGS.final_output_keep_prob, + global_pool=FLAGS.output_type) + apply_fixed_architecture(child_model, FLAGS.child_fixed_arc) return child_model -def print_arc(arc, num_layers): - start = 0 - for i in range(0, num_layers): - end = start + i + 1 - if FLAGS.multi_path: - end += 1 - out_str = "fixed_arc=\"$fixed_arc {0}\"".format(np.reshape(arc[start: end], [-1])) - out_str = out_str.replace("[", "").replace("]", "") - print(out_str) - - start = end - - -def eval_once(child_model, eval_set, criterion, valid_dataloader=None, test_dataloader=None): +def eval_once(child_model, device, eval_set, criterion, valid_dataloader=None, test_dataloader=None): if eval_set == "test": assert test_dataloader is not None dataloader = test_dataloader @@ -317,11 +289,11 @@ def eval_once(child_model, eval_set, criterion, valid_dataloader=None, test_data for batch in dataloader: (sent_ids, mask), labels = batch - sent_ids = sent_ids.cuda() - mask = mask.cuda() - labels = labels.cuda() + sent_ids = sent_ids.to(device, non_blocking=True) + mask = mask.to(device, non_blocking=True) + labels = labels.to(device, non_blocking=True) - logits = child_model(sent_ids, mask) # run + logits = child_model((sent_ids, mask)) # run loss = criterion(logits, labels.long()) loss = loss.mean() @@ -338,14 +310,12 @@ def eval_once(child_model, eval_set, criterion, valid_dataloader=None, test_data final_acc = float(tot_acc) / tot else: final_acc = 0 - print("Error in calculating final_acc") + logger.info("Error in calculating final_acc") return final_acc, loss def print_user_flags(FLAGS, line_limit=80): - print("-" * 80) - - log_strings = "" + log_strings = "\n" + "-" * line_limit + "\n" for flag_name in sorted(vars(FLAGS)): value = "{}".format(getattr(FLAGS, flag_name)) log_string = flag_name @@ -353,7 +323,8 @@ def print_user_flags(FLAGS, line_limit=80): log_string += value log_strings = log_strings + log_string log_strings = log_strings + "\n" - print(log_strings) + log_strings += "-" * line_limit + logger.info(log_strings) def count_model_params(trainable_params): @@ -410,8 +381,8 @@ def _update(): return learning_rate -def train(data_path, output_dir, num_layers): - print("Build dataloader") +def train(device, data_path, output_dir, num_layers): + logger.info("Build dataloader") train_dataset, valid_dataset, test_dataset, embedding = \ read_data_sst(data_path, FLAGS.max_input_length, @@ -423,16 +394,15 @@ def train(data_path, output_dir, num_layers): test_dataloader = DataLoader(test_dataset, 
     test_dataloader = DataLoader(test_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True)
     valid_dataloader = DataLoader(valid_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True)
-    print("Build model")
-    print("-" * 80)
+    logger.info("Build model")
     child_model = get_model(embedding, num_layers)
-    print("Finish build model")
+    logger.info("Finish build model")
 
-    for name, var in child_model.named_parameters():
-        print(name, var.size(), var.requires_grad)  # output all params
+    #for name, var in child_model.named_parameters():
+    #    logger.info(name, var.size(), var.requires_grad)  # output all params
 
     num_vars = count_model_params(child_model.parameters())
-    print("Model has {} params".format(num_vars))
+    logger.info("Model has {} params".format(num_vars))
 
     for m in child_model.modules():  # initializer
         if isinstance(m, (nn.Conv1d, nn.Linear)):
@@ -446,14 +416,10 @@
     else:
         raise ValueError("Unknown optim_algo {}".format(FLAGS.child_optim_algo))
 
-    child_model.cuda()
-    criterion.cuda()
-
-    fixed_arc = np.array([int(x) for x in FLAGS.child_fixed_arc.split(" ") if x])
-    print_arc(fixed_arc, num_layers)
+    child_model.to(device)
+    criterion.to(device)
 
-    print("Start training")
-    print("-" * 80)
+    logger.info("Start training")
     start_time = time.time()
     step = 0
@@ -485,13 +451,13 @@
         for batch in train_dataloader:
             (sent_ids, mask), labels = batch
-            sent_ids = sent_ids.cuda()
-            mask = mask.cuda()
-            labels = labels.cuda()
+            sent_ids = sent_ids.to(device, non_blocking=True)
+            mask = mask.to(device, non_blocking=True)
+            labels = labels.to(device, non_blocking=True)
 
             step += 1
 
-            logits = child_model(sent_ids, mask)  # run
+            logits = child_model((sent_ids, mask))  # run
 
             loss = criterion(logits, labels.long())
             loss = loss.mean()
@@ -521,7 +487,7 @@
             log_string += " |g|={:<8.4f}".format(grad_norm)
             log_string += " tr_acc={:<3d}/{:>3d}".format(acc, logits.size()[0])
             log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60)
-            print(log_string)
+            logger.info(log_string)
 
         epoch += 1
         save_state = {
@@ -531,12 +497,12 @@
             'optimizer_state_dict' : optimizer.state_dict()}
         torch.save(save_state, model_save_path)
         child_model.eval()
-        print("Epoch {}: Eval".format(epoch))
-        eval_acc, eval_loss = eval_once(child_model, "test", criterion, test_dataloader=test_dataloader)
-        print("ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss))
+        logger.info("Epoch {}: Eval".format(epoch))
+        eval_acc, eval_loss = eval_once(child_model, device, "test", criterion, test_dataloader=test_dataloader)
+        logger.info("ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss))
         if eval_acc > best_acc:
             best_acc = eval_acc
-            print("Save best model")
+            logger.info("Save best model")
             save_state = {
                 'step' : step,
                 'epoch' : epoch,
@@ -549,24 +515,21 @@
 
 def main():
     parse_args()
-    print("-" * 80)
 
     if not os.path.isdir(FLAGS.output_dir):
-        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
+        logger.info("Path {} does not exist. Creating.".format(FLAGS.output_dir))
         os.makedirs(FLAGS.output_dir)
     elif FLAGS.reset_output_dir:
-        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
+        logger.info("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
         shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
         os.makedirs(FLAGS.output_dir)
 
-    print("-" * 80)
-    log_file = os.path.join(FLAGS.output_dir, "stdout")
-    print("Logging to {}".format(log_file))
 
     print_user_flags(FLAGS)
 
     if FLAGS.fixed_seed:
         set_random_seed(FLAGS.global_seed)
 
-    train(FLAGS.data_path, FLAGS.output_dir, FLAGS.child_num_layers)
+    device = torch.device("cuda" if FLAGS.is_cuda else "cpu")
+    train(device, FLAGS.data_path, FLAGS.output_dir, FLAGS.child_num_layers)
 
 
 if __name__ == "__main__":
diff --git a/examples/nas/textnas/run_retrain.sh b/examples/nas/textnas/run_retrain.sh
new file mode 100755
index 0000000000..5c8ea66ae9
--- /dev/null
+++ b/examples/nas/textnas/run_retrain.sh
@@ -0,0 +1,41 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+export PYTHONPATH="$(pwd)"
+export CUDA_VISIBLE_DEVICES=0
+
+python -u retrain.py \
+  --train_ratio=1.0 \
+  --valid_ratio=1.0 \
+  --min_count=1 \
+  --is_mask=True \
+  --is_binary=True \
+  --child_lr_decay_scheme="cosine" \
+  --data_path="data" \
+  --class_num=2 \
+  --child_optim_algo="adam" \
+  --output_dir="output_sst2" \
+  --global_seed=1234 \
+  --max_input_length=64 \
+  --batch_size=128 \
+  --eval_batch_size=128 \
+  --num_epochs=10 \
+  --log_every=50 \
+  --eval_every_epochs=1 \
+  --child_num_layers=24 \
+  --child_out_filters=256 \
+  --child_l2_reg=1e-6 \
+  --cnn_keep_prob=0.8 \
+  --final_output_keep_prob=1.0 \
+  --embed_keep_prob=0.8 \
+  --lstm_out_keep_prob=0.8 \
+  --attention_keep_prob=0.8 \
+  --child_lr=0.02 \
+  --child_lr_max=0.002 \
+  --child_lr_min=5e-6 \
+  --child_lr_T_0=10 \
+  --child_lr_T_mul=2 \
+  --multi_path=True \
+  --child_fixed_arc="./checkpoints/architecture_00.json" \
+  --fixed_seed=True \
+  "$@"
diff --git a/examples/nas/textnas/train_sst2.sh b/examples/nas/textnas/train_sst2.sh
deleted file mode 100755
index 40cc43aa16..0000000000
--- a/examples/nas/textnas/train_sst2.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-export PYTHONPATH="$(pwd)"
-export CUDA_VISIBLE_DEVICES=0
-
-fixed_arc="$fixed_arc 0 5"
-fixed_arc="$fixed_arc 1 7 0"
-fixed_arc="$fixed_arc 1 3 0 0"
-fixed_arc="$fixed_arc 3 6 0 1 1"
-fixed_arc="$fixed_arc 1 1 0 1 0 0"
-fixed_arc="$fixed_arc 0 1 0 0 1 0 1"
-fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1"
-fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0"
-fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1"
-fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1"
-fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0"
-fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0"
-fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0"
-fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1"
-fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0"
-fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1"
-fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0"
-fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0"
-fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1"
-fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0"
-fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0"
-fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0"
-fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1"
-fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0"
-
-python -u eval_arc.py \
-  --train_ratio=1.0 \
-  --valid_ratio=1.0 \
-  --min_count=1 \
-  --is_mask=True \
-  --is_binary=True \
-  --embedding_model="glove" \
-  --child_lr_decay_scheme="cosine" \
-  --data_path="data" \
-  --class_num=2 \
-  --child_optim_algo="adam" \
-  --output_dir="output_sst2" \
-  --global_seed=1234 \
-  --max_input_length=64 \
-  --batch_size=256 \
-  --eval_batch_size=128 \
-  --num_epochs=10 \
-  --log_every=50 \
-  --eval_every_epochs=1 \
-  --child_num_layers=24 \
-  --child_out_filters=128 \
-  --child_l2_reg=2e-5 \
-  --cnn_keep_prob=0.8 \
-  --final_output_keep_prob=1.0 \
-  --embed_keep_prob=0.8 \
-  --lstm_out_keep_prob=0.8 \
-  --attention_keep_prob=0.8 \
-  --child_lr=0.02 \
-  --child_lr_max=0.002 \
-  --child_lr_min=5e-6 \
-  --child_lr_T_0=10 \
-  --child_lr_T_mul=2 \
-  --multi_path=True \
-  --child_fixed_arc="${fixed_arc}" \
-  --fixed_seed=True \
-  --all_layer_output=True \
-  --output_linear_combine=True \
-  "$@"
diff --git a/examples/nas/textnas/train_sst5.sh b/examples/nas/textnas/train_sst5.sh
deleted file mode 100755
index 79da3518f6..0000000000
--- a/examples/nas/textnas/train_sst5.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-export PYTHONPATH="$(pwd)"
-export CUDA_VISIBLE_DEVICES=0
-
-fixed_arc="$fixed_arc 0 5"
-fixed_arc="$fixed_arc 1 7 0"
-fixed_arc="$fixed_arc 1 3 0 0"
-fixed_arc="$fixed_arc 3 6 0 1 1"
-fixed_arc="$fixed_arc 1 1 0 1 0 0"
-fixed_arc="$fixed_arc 0 1 0 0 1 0 1"
-fixed_arc="$fixed_arc 0 0 0 0 1 1 1 1"
-fixed_arc="$fixed_arc 4 6 0 1 1 0 0 0 0"
-fixed_arc="$fixed_arc 3 5 0 1 0 1 1 0 1 1"
-fixed_arc="$fixed_arc 2 5 0 0 0 0 0 1 0 1 1"
-fixed_arc="$fixed_arc 4 7 1 1 0 0 1 0 0 1 1 0"
-fixed_arc="$fixed_arc 4 4 1 0 0 1 1 1 1 0 0 0 0"
-fixed_arc="$fixed_arc 1 1 1 1 0 1 0 1 0 0 1 0 0 0"
-fixed_arc="$fixed_arc 1 2 1 1 0 0 0 0 1 0 0 1 1 0 1"
-fixed_arc="$fixed_arc 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0"
-fixed_arc="$fixed_arc 0 3 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1"
-fixed_arc="$fixed_arc 1 2 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0"
-fixed_arc="$fixed_arc 0 4 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0"
-fixed_arc="$fixed_arc 2 3 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1"
-fixed_arc="$fixed_arc 2 3 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0"
-fixed_arc="$fixed_arc 3 5 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0"
-fixed_arc="$fixed_arc 4 7 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0"
-fixed_arc="$fixed_arc 0 7 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1"
-fixed_arc="$fixed_arc 1 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0"
-
-python -u eval_arc.py \
-  --train_ratio=1.0 \
-  --valid_ratio=1.0 \
-  --min_count=1 \
-  --is_mask=True \
-  --is_binary=False \
-  --embedding_model="glove" \
-  --child_lr_decay_scheme="cosine" \
-  --data_path="data" \
-  --class_num=5 \
-  --child_optim_algo="adam" \
-  --output_dir="output_sst5" \
-  --global_seed=1234 \
-  --max_input_length=64 \
-  --batch_size=256 \
-  --eval_batch_size=128 \
-  --num_epochs=10 \
-  --log_every=50 \
-  --eval_every_epochs=1 \
-  --child_num_layers=24 \
-  --child_out_filters=256 \
-  --child_l2_reg=1e-6 \
-  --cnn_keep_prob=0.8 \
-  --final_output_keep_prob=1.0 \
-  --embed_keep_prob=0.8 \
-  --lstm_out_keep_prob=0.8 \
-  --attention_keep_prob=0.8 \
-  --child_lr=0.02 \
-  --child_lr_max=0.002 \
-  --child_lr_min=1e-6 \
-  --child_lr_T_0=10 \
-  --child_lr_T_mul=2 \
-  --multi_path=True \
-  --child_fixed_arc="${fixed_arc}" \
-  --fixed_seed=True \
-  --all_layer_output=True \
-  --output_linear_combine=True \
-  "$@"

From 0f9a7c6bb6bf54d8d44ce74edd9a740ebf9aa70a Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Thu, 2 Apr 2020 07:30:26 +0000
Subject: [PATCH 07/12] Add doc of TextNAS.

---
 docs/en_US/NAS/TextNAS.md               |  56 +++++++
 examples/nas/textnas/README.md          |   6 +-
 examples/nas/textnas/README_zh_CN.md    |   6 +-
 examples/nas/textnas/arc/final_arc.json | 212 ++++++++++++++++++++++++
 examples/nas/textnas/run_retrain.sh     |   2 +-
 5 files changed, 279 insertions(+), 3 deletions(-)
 create mode 100644 docs/en_US/NAS/TextNAS.md
 create mode 100644 examples/nas/textnas/arc/final_arc.json

diff --git a/docs/en_US/NAS/TextNAS.md b/docs/en_US/NAS/TextNAS.md
new file mode 100644
index 0000000000..3e6303c2ce
--- /dev/null
+++ b/docs/en_US/NAS/TextNAS.md
@@ -0,0 +1,56 @@
+# TextNAS
+
+## Introduction
+
+This is the implementation of the TextNAS algorithm proposed in the paper [TextNAS: A Neural Architecture Search Space tailored for Text Representation](https://arxiv.org/pdf/1912.10729.pdf). TextNAS is a neural architecture search algorithm tailored for text representation. More specifically, it is based on a novel search space consisting of operators widely adopted to solve various NLP tasks, and it supports multi-path ensemble within a single network to balance the width and depth of the architecture.
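+
+As an illustration of how one such searchable layer behaves, the sketch below chooses a single operator from a candidate set in plain PyTorch. It is a simplified sketch, not the actual NNI implementation: `CandidateLayer`, its reduced operator set, and the externally supplied `chosen` index are invented for this example, and the real candidate set (listed below) also includes a bi-directional GRU and self-attention.
+
+```python
+import torch
+from torch import nn
+
+class CandidateLayer(nn.Module):
+    """One searchable layer: exactly one candidate operator is active."""
+    def __init__(self, hidden_units=256):
+        super().__init__()
+        # Length-preserving convolutions with different filter sizes,
+        # plus two pooling operators.
+        self.candidates = nn.ModuleList(
+            [nn.Conv1d(hidden_units, hidden_units, kernel_size=k, padding=k // 2)
+             for k in (1, 3, 5, 7)]
+            + [nn.MaxPool1d(kernel_size=3, stride=1, padding=1),
+               nn.AvgPool1d(kernel_size=3, stride=1, padding=1)])
+
+    def forward(self, x, chosen):
+        # During search a controller samples `chosen`; after search it is fixed.
+        return self.candidates[chosen](x)
+
+layer = CandidateLayer()
+out = layer(torch.randn(2, 256, 64), chosen=1)  # (batch, hidden, seq_len)
+print(out.shape)  # torch.Size([2, 256, 64])
+```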
+
+The search space of TextNAS contains:
+
+* 1-D convolutional operator with filter size 1, 3, 5, 7
+* recurrent operator (bi-directional GRU)
+* self-attention operator
+* pooling operator (max/average)
+
+Following the ENAS algorithm, TextNAS also utilizes parameter sharing to accelerate the search and adopts a reinforcement-learning controller for architecture sampling and generation. Please refer to the paper for more details of TextNAS.
+
+## Examples
+
+### Search Space
+
+[Example code](https://github.com/microsoft/nni/tree/master/examples/nas/textnas)
+
+```bash
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
+git clone https://github.com/Microsoft/nni.git
+
+# search the best architecture
+cd examples/nas/textnas
+
+# view more options for search
+python3 search.py -h
+```
+
+### Retrain
+
+```bash
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
+git clone https://github.com/Microsoft/nni.git
+
+# enter the code folder
+cd examples/nas/textnas
+
+# default to retrain on sst-2
+sh run_retrain.sh
+```
+
+## Reference
+
+### PyTorch
+
+```eval_rst
+.. autoclass:: nni.nas.pytorch.enas.EnasTrainer
+    :members:
+
+.. autoclass:: nni.nas.pytorch.enas.EnasMutator
+    :members:
+```
diff --git a/examples/nas/textnas/README.md b/examples/nas/textnas/README.md
index fb261ad04d..f8ebe24afd 100644
--- a/examples/nas/textnas/README.md
+++ b/examples/nas/textnas/README.md
@@ -42,4 +42,8 @@ By default, 20 sampled architectures will be exported into `checkpoints` directo
 
 ## Retrain
 
-Not ready.
+```
+sh run_retrain.sh
+```
+
+By default, the script will retrain the architecture provided by the author on the SST-2 dataset.
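+
+To retrain a different architecture, point `--child_fixed_arc` at its exported JSON file. Conceptually, the script rebuilds the child network and then freezes its choices, roughly as in the sketch below. This is a simplified illustration assuming the default values from `run_retrain.sh`; the random embedding matrix is only a placeholder for the pretrained matrix that `dataloader.read_data_sst` actually returns.
+
+```python
+import numpy as np
+from nni.nas.pytorch.fixed import apply_fixed_architecture
+from model import Model  # the TextNAS child network in this folder
+
+embedding = np.random.random([10000, 300]).astype(np.float32)  # placeholder
+model = Model(embedding=embedding, hidden_units=256, num_layers=24,
+              num_classes=2, choose_from_k=5, lstm_keep_prob=0.8,
+              cnn_keep_prob=0.8, att_keep_prob=0.8, att_mask=True,
+              embed_keep_prob=0.8, final_output_keep_prob=1.0,
+              global_pool="avg")
+# Fix every LayerChoice/InputChoice according to the exported architecture.
+apply_fixed_architecture(model, "arc/final_arc.json")
+```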
diff --git a/examples/nas/textnas/README_zh_CN.md b/examples/nas/textnas/README_zh_CN.md
index 22bbbb4c9b..f9ad379939 100644
--- a/examples/nas/textnas/README_zh_CN.md
+++ b/examples/nas/textnas/README_zh_CN.md
@@ -42,4 +42,8 @@ python search.py
 
 ## 重新训练
 
-待完成
+```
+sh run_retrain.sh
+```
+
+默认情况下，会在SST-2数据集上重新训练作者提供的结构。
diff --git a/examples/nas/textnas/arc/final_arc.json b/examples/nas/textnas/arc/final_arc.json
new file mode 100644
index 0000000000..c1e12c2d4b
--- /dev/null
+++ b/examples/nas/textnas/arc/final_arc.json
@@ -0,0 +1,212 @@
+{
+  "LayerChoice1": [
+    false, false, false, false, false, true, false, false
+  ],
+  "InputChoice2": [
+    true
+  ],
+  "LayerChoice3": [
+    false, false, false, false, false, false, false, true
+  ],
+  "InputChoice4": [
+    false
+  ],
+  "InputChoice5": [
+    true, false
+  ],
+  "LayerChoice6": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice7": [
+    false, false
+  ],
+  "InputChoice8": [
+    false, false, true
+  ],
+  "LayerChoice9": [
+    false, false, false, false, false, false, true, false
+  ],
+  "InputChoice10": [
+    false, true, true
+  ],
+  "InputChoice11": [
+    false, false, true, false
+  ],
+  "LayerChoice12": [
+    false, true, false, false, false, false, false, false
+  ],
+  "InputChoice13": [
+    false, true, false, false
+  ],
+  "InputChoice14": [
+    false, false, false, false, true
+  ],
+  "LayerChoice15": [
+    false, true, false, false, false, false, false, false
+  ],
+  "InputChoice16": [
+    false, false, true, false, true
+  ],
+  "InputChoice17": [
+    false, false, false, false, true
+  ],
+  "LayerChoice18": [
+    true, false, false, false, false, false, false, false
+  ],
+  "InputChoice19": [
+    false, false, true, true, true, true
+  ],
+  "InputChoice20": [
+    true, false, false, false, false
+  ],
+  "LayerChoice21": [
+    false, false, false, false, false, false, true, false
+  ],
+  "InputChoice22": [
+    false, true, true, false, false, false, false
+  ],
+  "InputChoice23": [
+    false, true, false, false, false
+  ],
+  "LayerChoice24": [
+    false, false, false, false, false, true, false, false
+  ],
+  "InputChoice25": [
+    false, true, false, true, true, false, true, true
+  ],
+  "InputChoice26": [
+    false, false, true, false, false
+  ],
+  "LayerChoice27": [
+    false, false, false, false, false, true, false, false
+  ],
+  "InputChoice28": [
+    false, false, false, false, false, true, false, true, true
+  ],
+  "InputChoice29": [
+    true, false, false, false, false
+  ],
+  "LayerChoice30": [
+    false, false, false, false, false, false, false, true
+  ],
+  "InputChoice31": [
+    true, true, false, false, true, false, false, true, true, false
+  ],
+  "InputChoice32": [
+    true, false, false, false, false
+  ],
+  "LayerChoice33": [
+    false, false, false, false, true, false, false, false
+  ],
+  "InputChoice34": [
+    true, false, false, true, true, true, true, false, false, false, false
+  ],
+  "InputChoice35": [
+    false, false, false, true, false
+  ],
+  "LayerChoice36": [
+    false, true, false, false, false, false, false, false
+  ],
+  "InputChoice37": [
+    true, true, false, true, false, true, false, false, true, false, false, false
+  ],
+  "InputChoice38": [
+    false, false, false, true, false
+  ],
+  "LayerChoice39": [
+    false, false, true, false, false, false, false, false
+  ],
+  "InputChoice40": [
+    true, true, false, false, false, false, true, false, false, true, true, false, true
+  ],
+  "InputChoice41": [
+    false, false, false, true, false
+  ],
+  "LayerChoice42": [
+    true, false, false, false, false, false, false, false
+  ],
+  "InputChoice43": [
+    false, false, true, false, false, false, true, true, true, false, true, true, false, false
+  ],
+  "InputChoice44": [
+    false, false, false, false, true
+  ],
+  "LayerChoice45": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice46": [
+    true, false, false, false, false, false, true, false, false, false, true, true, false, false, true
+  ],
+  "InputChoice47": [
+    false, false, false, true, false
+  ],
+  "LayerChoice48": [
+    false, false, true, false, false, false, false, false
+  ],
+  "InputChoice49": [
+    false, false, false, false, false, false, false, false, false, true, true, false, true, false, true, false
+  ],
+  "InputChoice50": [
+    false, false, false, false, true
+  ],
+  "LayerChoice51": [
+    false, false, false, false, true, false, false, false
+  ],
+  "InputChoice52": [
+    false, true, true, true, true, false, false, true, false, true, false, false, false, false, true, false, false
+  ],
+  "InputChoice53": [
+    false, false, true, false, false
+  ],
+  "LayerChoice54": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice55": [
+    false, false, false, false, false, true, false, false, false, false, false, false, false, true, true, true, false, true
+  ],
+  "InputChoice56": [
+    false, false, true, false, false
+  ],
+  "LayerChoice57": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice58": [
+    false, false, false, true, false, false, false, false, false, false, true, false, false, false, true, false, false, false, false
+  ],
+  "InputChoice59": [
+    false, true, false, false, false
+  ],
+  "LayerChoice60": [
+    false, false, false, false, false, true, false, false
+  ],
+  "InputChoice61": [
+    true, true, false, false, false, false, false, false, false, false, true, true, false, false, true, true, true, true, false, false
+  ],
+  "InputChoice62": [
+    true, false, false, false, false
+  ],
+  "LayerChoice63": [
+    false, false, false, false, false, false, false, true
+  ],
+  "InputChoice64": [
+    false, true, true, true, false, false, false, true, false, true, true, true, true, false, true, false, false, false, false, false, false
+  ],
+  "InputChoice65": [
+    false, false, false, false, true
+  ],
+  "LayerChoice66": [
+    false, false, false, false, false, false, false, true
+  ],
+  "InputChoice67": [
+    false, false, true, true, true, true, false, true, false, true, true, false, false, false, false, true, false, false, false, false, false, true
+  ],
+  "InputChoice68": [
+    false, false, false, true, false
+  ],
+  "LayerChoice69": [
+    false, false, false, true, false, false, false, false
+  ],
+  "InputChoice70": [
+    true, false, false, true, false, false, false, true, false, false, false, false, true, false, false, false, true, false, false, false, false, false, false
+  ]
+}
diff --git a/examples/nas/textnas/run_retrain.sh b/examples/nas/textnas/run_retrain.sh
index 5c8ea66ae9..b9306dbed2 100755
--- a/examples/nas/textnas/run_retrain.sh
+++ b/examples/nas/textnas/run_retrain.sh
@@ -36,6 +36,6 @@ python -u retrain.py \
   --child_lr_T_0=10 \
   --child_lr_T_mul=2 \
   --multi_path=True \
-  --child_fixed_arc="./checkpoints/architecture_00.json" \
+  --child_fixed_arc="./arc/final_arc.json" \
   --fixed_seed=True \
   "$@"

From cc06ecdfc8fbc173ead4d2366ce10960821a0c1e Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Thu, 2 Apr 2020 07:50:24 +0000
Subject: [PATCH 08/12] Undo modification of the Chinese document.
---
 examples/nas/textnas/README_zh_CN.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/nas/textnas/README_zh_CN.md b/examples/nas/textnas/README_zh_CN.md
index f9ad379939..22bbbb4c9b 100644
--- a/examples/nas/textnas/README_zh_CN.md
+++ b/examples/nas/textnas/README_zh_CN.md
@@ -42,8 +42,4 @@ python search.py
 
 ## 重新训练
 
-```
-sh run_retrain.sh
-```
-
-默认情况下，会在SST-2数据集上重新训练作者提供的结构。
+待完成

From 4c1b0f28902294741eef3dd0053a62e98fd5e6d1 Mon Sep 17 00:00:00 2001
From: Yuge Zhang
Date: Thu, 2 Apr 2020 16:43:53 +0800
Subject: [PATCH 09/12] Incorporate previous doc

---
 docs/en_US/NAS/TextNAS.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/docs/en_US/NAS/TextNAS.md b/docs/en_US/NAS/TextNAS.md
index 3e6303c2ce..d67647b88c 100644
--- a/docs/en_US/NAS/TextNAS.md
+++ b/docs/en_US/NAS/TextNAS.md
@@ -13,6 +13,32 @@ The search space of TextNAS contains:
 
 Following the ENAS algorithm, TextNAS also utilizes parameter sharing to accelerate the search and adopts a reinforcement-learning controller for architecture sampling and generation. Please refer to the paper for more details of TextNAS.
 
+## Preparation
+
+Prepare the word vectors and SST dataset, and organize them in the data directory as shown below:
+
+```
+textnas
+├── data
+│   ├── sst
+│   │   └── trees
+│   │       ├── dev.txt
+│   │       ├── test.txt
+│   │       └── train.txt
+│   └── glove.840B.300d.txt
+├── dataloader.py
+├── model.py
+├── ops.py
+├── README.md
+├── search.py
+└── utils.py
+```
+
+The following links might be helpful for finding and downloading the corresponding datasets:
+
+* [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)
+* [Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank](https://nlp.stanford.edu/sentiment/)
+
 ## Examples
 
 ### Search Space
@@ -30,6 +56,10 @@ cd examples/nas/textnas
 python3 search.py -h
 ```
 
+After each search epoch, 10 sampled architectures will be tested directly. Their performance is expected to be 40% - 42% after 10 epochs.
+
+By default, 20 sampled architectures will be exported into the `checkpoints` directory for the next step.
+
 ### Retrain
 
 ```bash

From 7f8e7472c069d3d67f9581d614fdd583cf2bae2a Mon Sep 17 00:00:00 2001
From: Yuge Zhang
Date: Thu, 2 Apr 2020 17:03:00 +0800
Subject: [PATCH 10/12] Add direct link to dataset

---
 docs/en_US/NAS/TextNAS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/en_US/NAS/TextNAS.md b/docs/en_US/NAS/TextNAS.md
index d67647b88c..2bdc64c241 100644
--- a/docs/en_US/NAS/TextNAS.md
+++ b/docs/en_US/NAS/TextNAS.md
@@ -37,7 +37,9 @@ textnas
 The following links might be helpful for finding and downloading the corresponding datasets:
 
 * [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)
+  * [glove.840B.300d.txt](http://nlp.stanford.edu/data/glove.840B.300d.zip)
 * [Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank](https://nlp.stanford.edu/sentiment/)
+  * [trainDevTestTrees_PTB.zip](https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip)
 
 ## Examples

From bf90867955bfa87975882b84f676badf8031588b Mon Sep 17 00:00:00 2001
From: Yaming Yang
Date: Thu, 2 Apr 2020 13:32:05 +0000
Subject: [PATCH 11/12] Change to py3 and add doc to the index.
---
 docs/en_US/nas.rst                  | 1 +
 examples/nas/textnas/run_retrain.sh | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/en_US/nas.rst b/docs/en_US/nas.rst
index 0a56caa742..f5a06c5c9a 100644
--- a/docs/en_US/nas.rst
+++ b/docs/en_US/nas.rst
@@ -26,5 +26,6 @@ For details, please refer to the following tutorials:
     SPOS
     CDARTS
     ProxylessNAS
+    TextNAS
     Customize a NAS Algorithm
     API Reference
diff --git a/examples/nas/textnas/run_retrain.sh b/examples/nas/textnas/run_retrain.sh
index b9306dbed2..1f02121e31 100755
--- a/examples/nas/textnas/run_retrain.sh
+++ b/examples/nas/textnas/run_retrain.sh
@@ -4,7 +4,7 @@
 export PYTHONPATH="$(pwd)"
 export CUDA_VISIBLE_DEVICES=0
 
-python -u retrain.py \
+python3 -u retrain.py \
   --train_ratio=1.0 \
   --valid_ratio=1.0 \
   --min_count=1 \

From 56b804cd7d1d9b05fdad971f412fd99108f03ba7 Mon Sep 17 00:00:00 2001
From: QuanluZhang
Date: Fri, 3 Apr 2020 17:10:23 +0800
Subject: [PATCH 12/12] Update TextNAS.md

---
 docs/en_US/NAS/TextNAS.md | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/docs/en_US/NAS/TextNAS.md b/docs/en_US/NAS/TextNAS.md
index 2bdc64c241..7c455534ec 100644
--- a/docs/en_US/NAS/TextNAS.md
+++ b/docs/en_US/NAS/TextNAS.md
@@ -77,12 +77,4 @@ sh run_retrain.sh
 
 ## Reference
 
-### PyTorch
-
-```eval_rst
-.. autoclass:: nni.nas.pytorch.enas.EnasTrainer
-    :members:
-
-.. autoclass:: nni.nas.pytorch.enas.EnasMutator
-    :members:
-```
+TextNAS directly uses EnasTrainer; please refer to [ENAS](./ENAS.md) for the trainer APIs.
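+
+For orientation, the search entry point (`search.py` in the example folder) wires the child model into the ENAS trainer roughly as sketched below. This is an unverified sketch: the `Model(...)` and dataset constructions are elided, the keyword set is abbreviated, and the exact `EnasTrainer` signature should be taken from the ENAS reference above.
+
+```python
+import torch
+from torch import nn
+from nni.nas.pytorch import enas
+
+model = ...      # the TextNAS child network from model.py
+train_set = ...  # SST datasets from dataloader.read_data_sst
+valid_set = ...
+
+def accuracy(output, target):
+    # Metric reported during search.
+    return {"acc": (output.argmax(1) == target).float().mean().item()}
+
+def reward(output, target):
+    # Reward fed back to the reinforcement-learning controller.
+    return (output.argmax(1) == target).float().mean()
+
+trainer = enas.EnasTrainer(model,
+                           loss=nn.CrossEntropyLoss(),
+                           metrics=accuracy,
+                           reward_function=reward,
+                           optimizer=torch.optim.Adam(model.parameters()),
+                           num_epochs=10,
+                           dataset_train=train_set,
+                           dataset_valid=valid_set)
+trainer.train()
+```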