diff --git a/pytorchTUT/01_torch_numpy.py b/pytorchTUT/01_torch_numpy.py
new file mode 100644
index 0000000..fbd195b
--- /dev/null
+++ b/pytorchTUT/01_torch_numpy.py
@@ -0,0 +1,44 @@
+import torch
+import numpy as np
+
+# details about math operations in torch can be found at: http://pytorch.org/docs/torch.html#math-operations
+
+# abs
+data = [-1, -2, 1, 2]
+tensor = torch.FloatTensor(data)    # 32-bit floating point
+print(
+    '\nabs',
+    '\nnumpy: ', np.abs(data),          # [1 2 1 2]
+    '\ntorch: ', torch.abs(tensor)      # [1 2 1 2]
+)
+
+# sin
+print(
+    '\nsin',
+    '\nnumpy: ', np.sin(data),      # [-0.84147098 -0.90929743  0.84147098  0.90929743]
+    '\ntorch: ', torch.sin(tensor)  # [-0.8415 -0.9093  0.8415  0.9093]
+)
+
+# mean
+print(
+    '\nmean',
+    '\nnumpy: ', np.mean(data),         # 0.0
+    '\ntorch: ', torch.mean(tensor)     # 0.0
+)
+
+# matrix multiplication
+data = [[1,2], [3,4]]
+tensor = torch.FloatTensor(data)    # 32-bit floating point
+# correct method
+print(
+    '\nmatrix multiplication (matmul)',
+    '\nnumpy: ', np.matmul(data, data),     # [[7, 10], [15, 22]]
+    '\ntorch: ', torch.mm(tensor, tensor)   # [[7, 10], [15, 22]]
+)
+# incorrect method
+data = np.array(data)
+print(
+    '\nmatrix multiplication (dot)',
+    '\nnumpy: ', data.dot(data),        # [[7, 10], [15, 22]]
+    '\ntorch: ', tensor.dot(tensor)     # this flattens the tensor to [1, 2, 3, 4] and returns the scalar 30.0
+)
\ No newline at end of file
diff --git a/pytorchTUT/02_variable.py b/pytorchTUT/02_variable.py
new file mode 100644
index 0000000..1527104
--- /dev/null
+++ b/pytorchTUT/02_variable.py
@@ -0,0 +1,29 @@
+import torch
+from torch.autograd import Variable
+
+# A Variable in torch is used to build a computational graph,
+# but this graph is dynamic, unlike the static graphs in Tensorflow or Theano.
+# So torch has no placeholders; variables are simply passed into the graph as it is built.
+
+tensor = torch.FloatTensor([[1,2],[3,4]])        # build a tensor
+variable = Variable(tensor, requires_grad=True)  # build a variable, usually used to compute gradients
+
+print(tensor)       # [torch.FloatTensor of size 2x2]
+print(variable)     # [torch.FloatTensor of size 2x2]
+
+# so far the tensor and the variable look the same.
+# However, the variable is part of the graph and takes part in automatic gradient computation.
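Side note: in PyTorch 0.4 and later the Variable wrapper is merged into Tensor, so the same graph can be built from a plain tensor created with requires_grad=True. A minimal sketch of the equivalent, assuming a recent PyTorch version:

    import torch
    v = torch.tensor([[1., 2.], [3., 4.]], requires_grad=True)   # plays the role of the Variable below
    out = (v * v).mean()     # same mean(x^2) computed below
    out.backward()           # fills v.grad with d(out)/dv = v/2
    print(v.grad)            # tensor([[0.5000, 1.0000], [1.5000, 2.0000]])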
+
+t_out = torch.mean(tensor*tensor)       # x^2
+v_out = torch.mean(variable*variable)   # x^2
+print(t_out)
+print(v_out)    # 7.5
+
+v_out.backward()    # backpropagation from v_out
+# v_out = 1/4 * sum(variable*variable)
+# the gradients w.r.t. the variable: d(v_out)/d(variable) = 1/4*2*variable = variable/2
+print(variable.grad)
+'''
+ 0.5000  1.0000
+ 1.5000  2.0000
+'''
diff --git a/pytorchTUT/03_activation.py b/pytorchTUT/03_activation.py
new file mode 100644
index 0000000..45f3771
--- /dev/null
+++ b/pytorchTUT/03_activation.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+import matplotlib.pyplot as plt
+
+# fake data
+x = torch.linspace(-5, 5, 200)  # x data (tensor), shape=(200,)
+x = Variable(x)
+x_np = x.data.numpy()
+
+# the following are popular activation functions
+y_relu = F.relu(x).data.numpy()
+y_sigmoid = F.sigmoid(x).data.numpy()
+y_tanh = F.tanh(x).data.numpy()
+y_softplus = F.softplus(x).data.numpy()
+# y_softmax = F.softmax(x)  softmax is a special activation function; it outputs a probability distribution
+
+
+# use plt to visualize these activation functions
+plt.figure(1, figsize=(8, 6))
+plt.subplot(221)
+plt.plot(x_np, y_relu, c='red', label='relu')
+plt.ylim((-1, 5))
+plt.legend(loc='best')
+
+plt.subplot(222)
+plt.plot(x_np, y_sigmoid, c='red', label='sigmoid')
+plt.ylim((-0.2, 1.2))
+plt.legend(loc='best')
+
+plt.subplot(223)
+plt.plot(x_np, y_tanh, c='red', label='tanh')
+plt.ylim((-1.2, 1.2))
+plt.legend(loc='best')
+
+plt.subplot(224)
+plt.plot(x_np, y_softplus, c='red', label='softplus')
+plt.ylim((-0.2, 6))
+plt.legend(loc='best')
+
+plt.show()
\ No newline at end of file
diff --git a/pytorchTUT/04_regression.py b/pytorchTUT/04_regression.py
new file mode 100644
index 0000000..90f02b2
--- /dev/null
+++ b/pytorchTUT/04_regression.py
@@ -0,0 +1,53 @@
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+import matplotlib.pyplot as plt
+
+torch.manual_seed(1)    # reproducible
+
+
+class Net(torch.nn.Module):
+    def __init__(self, n_feature, n_hidden, n_output):
+        super(Net, self).__init__()
+        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
+        self.predict = torch.nn.Linear(n_hidden, n_output)   # output layer
+
+    def forward(self, x):
+        x = F.relu(self.hidden(x))      # activation function for the hidden layer
+        x = self.predict(x)             # linear output
+        return x
+
+x = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)  # x data (tensor), shape=(100, 1)
+y = x.pow(2) + 0.2*torch.rand(x.size())                 # noisy y data (tensor), shape=(100, 1)
+
+# torch can only train on Variables, so convert the tensors to Variables
+x, y = torch.autograd.Variable(x, requires_grad=False), Variable(y, requires_grad=False)
+
+net = Net(n_feature=1, n_hidden=10, n_output=1)     # define the network
+print(net)  # net architecture
+
+optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
+
+plt.ion()   # turn on interactive plotting
+plt.show()
+
+for t in range(100):
+    prediction = net(x)     # input x and predict based on x
+
+    loss_func = torch.nn.MSELoss()      # mean squared error loss, used for regression
+    loss = loss_func(prediction, y)     # arguments must be (1. nn output, 2. target)
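Side note on the zero_grad / backward / step pattern that follows: PyTorch accumulates gradients across backward() calls by default, which is why the gradients are cleared at every iteration. A minimal standalone sketch of the accumulation (not tied to this script):

    w = Variable(torch.ones(1), requires_grad=True)
    (w * 2).backward()
    (w * 2).backward()   # without zeroing in between, gradients add up
    print(w.grad)        # 4 rather than 2, because the two backward passes accumulated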
+
+    optimizer.zero_grad()   # clear gradients for the next training step
+    loss.backward()         # backpropagation, compute gradients
+    optimizer.step()        # apply gradients
+
+    if t % 5 == 0:
+        # plot and show the learning process
+        plt.cla()
+        plt.scatter(x.data.numpy(), y.data.numpy())
+        plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
+        plt.text(0.5, 0, 'Loss=%.4f' % loss.data[0], fontdict={'size': 20, 'color': 'red'})
+        plt.pause(0.1)
+
+plt.ioff()
+plt.show()
\ No newline at end of file
diff --git a/pytorchTUT/05_classification.py b/pytorchTUT/05_classification.py
new file mode 100644
index 0000000..64972a8
--- /dev/null
+++ b/pytorchTUT/05_classification.py
@@ -0,0 +1,60 @@
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+import matplotlib.pyplot as plt
+
+torch.manual_seed(1)    # reproducible
+
+
+class Net(torch.nn.Module):
+    def __init__(self, n_feature, n_hidden, n_output):
+        super(Net, self).__init__()
+        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
+        self.out = torch.nn.Linear(n_hidden, n_output)       # output layer
+
+    def forward(self, x):
+        x = F.relu(self.hidden(x))      # activation function for the hidden layer
+        x = self.out(x)
+        return x
+
+# make fake data
+n_data = torch.ones(100, 2)
+x0 = torch.normal(2*n_data, 1)      # class0 x data (tensor), shape=(100, 2)
+y0 = torch.zeros(100)               # class0 y data (tensor), shape=(100,)
+x1 = torch.normal(-2*n_data, 1)     # class1 x data (tensor), shape=(100, 2)
+y1 = torch.ones(100)                # class1 y data (tensor), shape=(100,)
+x = torch.cat((x0, x1), 0).type(torch.FloatTensor)  # FloatTensor = 32-bit floating point
+y = torch.cat((y0, y1), ).type(torch.LongTensor)    # LongTensor = 64-bit integer
+
+# torch can only train on Variables, so convert the tensors to Variables
+x, y = torch.autograd.Variable(x, requires_grad=False), Variable(y, requires_grad=False)
+
+net = Net(n_feature=2, n_hidden=10, n_output=2)     # define the network
+print(net)  # net architecture
+
+optimizer = torch.optim.SGD(net.parameters(), lr=0.02)
+loss_func = torch.nn.CrossEntropyLoss()  # the target labels are class indices, not one-hot vectors
+
+plt.ion()   # turn on interactive plotting
+plt.show()
+
+for t in range(100):
+    prediction = net(x)     # input x and predict based on x
+    loss = loss_func(prediction, y)     # arguments must be (1. nn output, 2. target); the target labels are class indices, not one-hot
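Side note on CrossEntropyLoss: it expects raw scores (logits) of shape (N, n_classes) and integer class labels of shape (N,), not one-hot vectors; the log-softmax is applied internally. A minimal standalone sketch:

    demo_scores = Variable(torch.FloatTensor([[2.0, 0.5]]))   # logits for one sample, 2 classes
    demo_label = Variable(torch.LongTensor([0]))              # the class index 0, not the one-hot [1, 0]
    print(torch.nn.CrossEntropyLoss()(demo_scores, demo_label))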
+
+    optimizer.zero_grad()   # clear gradients for the next training step
+    loss.backward()         # backpropagation, compute gradients
+    optimizer.step()        # apply gradients
+
+    if t % 2 == 0:
+        # plot and show the learning process
+        plt.cla()
+        pred_y = torch.max(F.softmax(prediction), 1)[1].data.numpy().squeeze()
+        target_y = y.data.numpy()
+        plt.scatter(x.data.numpy()[:, 0], x.data.numpy()[:, 1], c=pred_y, s=100, lw=0, cmap='RdYlGn')
+        accuracy = sum(pred_y == target_y)/200.     # float division so this also works under Python 2
+        plt.text(2, -4, 'Accuracy=%.2f' % accuracy, fontdict={'size': 20, 'color': 'red'})
+        plt.pause(0.1)
+
+plt.ioff()
+plt.show()
\ No newline at end of file
diff --git a/pytorchTUT/06_build_nn_quickly.py b/pytorchTUT/06_build_nn_quickly.py
new file mode 100644
index 0000000..ddfe17c
--- /dev/null
+++ b/pytorchTUT/06_build_nn_quickly.py
@@ -0,0 +1,59 @@
+import torch
+from torch.autograd import Variable
+import matplotlib.pyplot as plt
+
+torch.manual_seed(1)    # reproducible
+
+# fake data
+x = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)  # x data (tensor), shape=(100, 1)
+y = x.pow(2) + 0.2*torch.rand(x.size())                 # noisy y data (tensor), shape=(100, 1)
+# torch can only train on Variables, so convert the tensors to Variables
+x, y = torch.autograd.Variable(x, requires_grad=False), Variable(y, requires_grad=False)
+
+
+# the class-based network below can be replaced with an equivalent, quicker-to-write sequential network
+"""
+class Net(torch.nn.Module):
+    def __init__(self, n_feature, n_hidden, n_output):
+        super(Net, self).__init__()
+        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
+        self.predict = torch.nn.Linear(n_hidden, n_output)   # output layer
+
+    def forward(self, x):
+        x = F.relu(self.hidden(x))      # activation function for the hidden layer
+        x = self.predict(x)             # linear output
+        return x
+"""
+net = torch.nn.Sequential(
+    torch.nn.Linear(1, 10),
+    torch.nn.ReLU(),
+    torch.nn.Linear(10, 1)
+)
+print(net)  # net architecture
+
+
+optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
+
+plt.ion()   # turn on interactive plotting
+plt.show()
+
+for t in range(100):
+    prediction = net(x)     # input x and predict based on x
+
+    loss_func = torch.nn.MSELoss()      # mean squared error loss, used for regression
+    loss = loss_func(prediction, y)     # arguments must be (1. nn output, 2. target)
+
+    optimizer.zero_grad()   # clear gradients for the next training step
+    loss.backward()         # backpropagation, compute gradients
+    optimizer.step()        # apply gradients
+
+    if t % 5 == 0:
+        # plot and show the learning process
+        plt.cla()
+        plt.scatter(x.data.numpy(), y.data.numpy())
+        plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
+        plt.text(0.5, 0, 'Loss=%.4f' % loss.data[0], fontdict={'size': 20, 'color': 'red'})
+        plt.pause(0.1)
+
+plt.ioff()
+plt.show()
\ No newline at end of file
diff --git a/pytorchTUT/07_save_reload.py b/pytorchTUT/07_save_reload.py
new file mode 100644
index 0000000..a8ca393
--- /dev/null
+++ b/pytorchTUT/07_save_reload.py
@@ -0,0 +1,79 @@
+import torch
+from torch.autograd import Variable
+import matplotlib.pyplot as plt
+
+torch.manual_seed(1)    # reproducible
+
+# fake data
+x = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)  # x data (tensor), shape=(100, 1)
+y = x.pow(2) + 0.2*torch.rand(x.size())                 # noisy y data (tensor), shape=(100, 1)
+x, y = torch.autograd.Variable(x, requires_grad=False), Variable(y, requires_grad=False)
+
+
+def save():
+    # train and save net1
+    net1 = torch.nn.Sequential(
+        torch.nn.Linear(1, 10),
+        torch.nn.ReLU(),
+        torch.nn.Linear(10, 1)
+    )
+    optimizer = torch.optim.SGD(net1.parameters(), lr=0.5)
+    for t in range(100):
+        prediction = net1(x)
+        loss_func = torch.nn.MSELoss(size_average=True)
+        loss = loss_func(prediction, y)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+    # plot result
+    plt.figure(1, figsize=(10, 3))
+    plt.subplot(131)
+    plt.title('Net1')
+    plt.scatter(x.data.numpy(), y.data.numpy())
+    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
+
+    # 2 ways to save the net
+    torch.save(net1, 'net.pkl')                      # save the entire net
+    torch.save(net1.state_dict(), 'net_params.pkl')  # save only the parameters
+
+
+def restore_net():
+    # restore the entire net1 into net2
+    net2 = torch.load('net.pkl')
+    prediction = net2(x)
+
+    # plot result
+    plt.subplot(132)
+    plt.title('Net2')
+    plt.scatter(x.data.numpy(), y.data.numpy())
+    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
+
+
+def restore_params():
+    # restore only the parameters of net1 into net3
+    net3 = torch.nn.Sequential(
+        torch.nn.Linear(1, 10),
+        torch.nn.ReLU(),
+        torch.nn.Linear(10, 1)
+    )
+
+    # copy net1's parameters into net3
+    net3.load_state_dict(torch.load('net_params.pkl'))
+    prediction = net3(x)
+
+    # plot result
+    plt.subplot(133)
+    plt.title('Net3')
+    plt.scatter(x.data.numpy(), y.data.numpy())
+    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
+    plt.show()
+
+# save net1
+save()
+
+# restore the entire net (slow)
+restore_net()
+
+# restore only the net parameters
+restore_params()
diff --git a/pytorchTUT/08_batch_train.py b/pytorchTUT/08_batch_train.py
new file mode 100644
index 0000000..ef53598
--- /dev/null
+++ b/pytorchTUT/08_batch_train.py
@@ -0,0 +1,21 @@
+import torch
+import torch.utils.data as Data
+
+BATCH_SIZE = 8
+
+x = torch.linspace(1, 10, 10)   # this is x data (torch tensor)
+y = torch.linspace(10, 1, 10)   # this is y data (torch tensor)
+
+torch_dataset = Data.TensorDataset(data_tensor=x, target_tensor=y)
+loader = Data.DataLoader(
+    dataset=torch_dataset,      # torch TensorDataset format
+    batch_size=BATCH_SIZE,      # mini-batch size
+    shuffle=True,               # random shuffle for training
+    num_workers=2,              # subprocesses for loading the data
+)
+
+for epoch in range(3):   # train on the entire dataset 3 times
+    for step, (batch_x, batch_y) in enumerate(loader):  # for each training step
+        # train your data...
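A rough sketch of the training step this placeholder stands for, assuming some net, optimizer, and loss_func had been defined earlier (hypothetical names; none of them exist in this script):

    b_x, b_y = Variable(batch_x), Variable(batch_y)   # wrap the mini-batch in torch.autograd.Variable
    prediction = net(b_x)                 # hypothetical model: forward pass on this mini-batch
    loss = loss_func(prediction, b_y)     # hypothetical loss function: compare with the batch targets
    optimizer.zero_grad()                 # clear old gradients
    loss.backward()                       # backpropagation
    optimizer.step()                      # hypothetical optimizer: update the parameters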
+        print('Epoch: ', epoch, '| Step: ', step, '| batch x: ',
+              batch_x.numpy(), '| batch y: ', batch_y.numpy())
diff --git a/pytorchTUT/09_CNN.py b/pytorchTUT/09_CNN.py
new file mode 100644
index 0000000..525112e
--- /dev/null
+++ b/pytorchTUT/09_CNN.py
@@ -0,0 +1,93 @@
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+import torch.utils.data as Data
+import torchvision
+
+torch.manual_seed(1)    # reproducible
+
+# Hyper Parameters
+EPOCH = 1               # train the training data n times; to save time we only train for 1 epoch
+BATCH_SIZE = 50
+LR = 0.001              # learning rate
+DOWNLOAD_MNIST = False
+
+
+# Mnist digits dataset
+train_data = torchvision.datasets.MNIST(
+    root='./mnist/',
+    train=True,                                     # this is training data
+    transform=torchvision.transforms.ToTensor(),    # converts a PIL.Image or numpy.ndarray to a
+                                                    # torch.FloatTensor of shape (C x H x W) and normalizes it to the range [0.0, 1.0]
+    download=DOWNLOAD_MNIST,                        # download it if you don't have it
+)
+
+test_data = torchvision.datasets.MNIST(root='./mnist/', train=False)
+
+# Data Loader for easy mini-batch return in training; the image batch shape will be (50, 1, 28, 28)
+train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
+
+# convert test data into Variable, pick 2000 samples to speed up testing
+test_x = Variable(torch.unsqueeze(test_data.test_data, dim=1)).type(torch.FloatTensor)[:2000]/255.   # shape from (2000, 28, 28) to (2000, 1, 28, 28), values in range (0, 1)
+test_y = test_data.test_labels[:2000]
+
+
+class CNN(nn.Module):
+    def __init__(self):
+        super(CNN, self).__init__()
+        self.conv1 = nn.Sequential(         # input shape (1, 28, 28)
+            nn.Conv2d(
+                in_channels=1,              # input channels (grayscale image)
+                out_channels=16,            # n_filters
+                kernel_size=5,              # filter size
+                stride=1,                   # filter movement/step
+                padding=2,                  # to keep the same width and height after Conv2d, use padding=(kernel_size-1)/2 when stride=1
+            ),                              # output shape (16, 28, 28)
+            nn.ReLU(),                      # activation
+            nn.MaxPool2d(kernel_size=2),    # choose the max value in each 2x2 area, output shape (16, 14, 14)
+        )
+        self.conv2 = nn.Sequential(         # input shape (16, 14, 14)
+            nn.Conv2d(16, 32, 5, 1, 2),     # output shape (32, 14, 14)
+            nn.ReLU(),                      # activation
+            nn.MaxPool2d(2),                # output shape (32, 7, 7)
+        )
+        self.out = nn.Linear(32 * 7 * 7, 10)   # fully connected layer, output 10 classes
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = x.view(x.size(0), -1)           # flatten the output of conv2 to (batch_size, 32 * 7 * 7)
+        output = self.out(x)
+        return output
+
+
+cnn = CNN()
+print(cnn)  # net architecture
+
+optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)   # optimize all cnn parameters
+loss_func = nn.CrossEntropyLoss()                       # the target labels are class indices, not one-hot vectors
+
+# training and testing
+for epoch in range(EPOCH):
+    for step, (x, y) in enumerate(train_loader):   # gives batch data, normalizes x while iterating over train_loader
+        b_x = Variable(x)   # batch x
+        b_y = Variable(y)   # batch y
+
+        output = cnn(b_x)               # cnn output
+        loss = loss_func(output, b_y)   # cross entropy loss
+        optimizer.zero_grad()           # clear gradients for this training step
+        loss.backward()                 # backpropagation, compute gradients
+        optimizer.step()                # apply gradients
+
+        if step % 50 == 0:
+            test_output = cnn(test_x)
+            pred_y = torch.max(test_output, 1)[1].data.squeeze()
+            accuracy = sum(pred_y == test_y) / test_y.size(0)
+            print('Epoch: ', epoch, '| train loss: %.4f' % loss.data[0], '| test accuracy: %.2f' % accuracy)
+
+
+# print 10 predictions from test data
+test_output = cnn(test_x[:10])
+pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
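Side note on the torch.max call above: with a dimension argument it returns a (values, indices) pair, and the [1] picks the indices, i.e. the predicted class of each row. A tiny standalone check:

    scores = torch.FloatTensor([[0.1, 2.0, 0.3],
                                [1.5, 0.2, 0.1]])
    values, indices = torch.max(scores, 1)
    print(indices)    # 1 and 0, the argmax (predicted class) of each row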
+print(pred_y, 'prediction number')
+print(test_y[:10].numpy(), 'real number')
diff --git a/pytorchTUT/10_RNN_classifier.py b/pytorchTUT/10_RNN_classifier.py
new file mode 100644
index 0000000..9befc17
--- /dev/null
+++ b/pytorchTUT/10_RNN_classifier.py
@@ -0,0 +1,91 @@
+import torch
+from torch import nn
+from torch.autograd import Variable
+import torchvision.datasets as dsets
+import torchvision.transforms as transforms
+
+torch.manual_seed(1)    # reproducible
+
+# Hyper Parameters
+EPOCH = 1               # train the training data n times; to save time we only train for 1 epoch
+BATCH_SIZE = 64
+TIME_STEP = 28          # rnn time steps / image height
+INPUT_SIZE = 28         # rnn input size / image width
+LR = 0.01               # learning rate
+DOWNLOAD_MNIST = False  # set to True if you haven't downloaded the data yet
+
+
+# Mnist digits dataset
+train_data = dsets.MNIST(
+    root='./mnist/',
+    train=True,                         # this is training data
+    transform=transforms.ToTensor(),    # converts a PIL.Image or numpy.ndarray to a
+                                        # torch.FloatTensor of shape (C x H x W) and normalizes it to the range [0.0, 1.0]
+    download=DOWNLOAD_MNIST,            # download it if you don't have it
+)
+
+test_data = dsets.MNIST(root='./mnist/', train=False, transform=transforms.ToTensor())
+
+# Data Loader for easy mini-batch return in training
+train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
+
+# convert test data into Variable, pick 2000 samples to speed up testing
+test_x = Variable(test_data.test_data).type(torch.FloatTensor)[:2000]/255.   # shape (2000, 28, 28), values in range (0, 1)
+test_y = test_data.test_labels.numpy().squeeze()[:2000]    # convert to a numpy array
+
+
+class RNN(nn.Module):
+    def __init__(self):
+        super(RNN, self).__init__()
+
+        self.rnn = nn.LSTM(         # if nn.RNN() is used instead, it hardly learns
+            input_size=28,
+            hidden_size=64,         # rnn hidden units
+            num_layers=1,           # number of rnn layers
+            batch_first=True,       # input & output have batch size as the first dimension, e.g. (batch, time_step, input_size)
+        )
+
+        self.out = nn.Linear(64, 10)
+
+    def forward(self, x):
+        # x shape (batch, time_step, input_size)
+        # r_out shape (batch, time_step, output_size)
+        # h_n shape (n_layers, batch, hidden_size)
+        # h_c shape (n_layers, batch, hidden_size)
+        r_out, (h_n, h_c) = self.rnn(x, None)   # None represents a zero initial hidden state
+
+        # choose r_out at the last time step
+        out = self.out(r_out[:, -1, :])
+        return out
+
+
+rnn = RNN()
+print(rnn)
+
+optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
+loss_func = nn.CrossEntropyLoss()                       # the target labels are class indices, not one-hot vectors
+
+# training and testing
+for epoch in range(EPOCH):
+    for step, (x, y) in enumerate(train_loader):   # gives batch data
+        b_x = Variable(x.view(-1, 28, 28))   # reshape x to (batch, time_step, input_size)
+        b_y = Variable(y)                    # batch y
+
+        output = rnn(b_x)               # rnn output
+        loss = loss_func(output, b_y)   # cross entropy loss
+        optimizer.zero_grad()           # clear gradients for this training step
+        loss.backward()                 # backpropagation, compute gradients
+        optimizer.step()                # apply gradients
+
+        if step % 50 == 0:
+            test_output = rnn(test_x)   # (samples, time_step, input_size)
+            pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
+            accuracy = sum(pred_y == test_y) / test_y.size
+            print('Epoch: ', epoch, '| train loss: %.4f' % loss.data[0], '| test accuracy: %.2f' % accuracy)
+
+# print 10 predictions from test data
+test_output = rnn(test_x[:10].view(-1, 28, 28))
+pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
+print(pred_y, 'prediction number')
+print(test_y[:10], 'real number')
+
diff --git a/pytorchTUT/11_RNN_regressor.py b/pytorchTUT/11_RNN_regressor.py
new file mode 100644
index 0000000..04991c3
--- /dev/null
+++ b/pytorchTUT/11_RNN_regressor.py
@@ -0,0 +1,79 @@
+import torch
+from torch import nn
+from torch.autograd import Variable
+import numpy as np
+import matplotlib.pyplot as plt
+
+torch.manual_seed(1)    # reproducible
+
+# Hyper Parameters
+BATCH_SIZE = 64
+TIME_STEP = 5           # rnn time steps
+INPUT_SIZE = 1          # rnn input size
+LR = 0.02               # learning rate
+DOWNLOAD_MNIST = False  # unused here; left over from the MNIST examples
+
+
+class RNN(nn.Module):
+    def __init__(self):
+        super(RNN, self).__init__()
+
+        self.rnn = nn.RNN(
+            input_size=1,
+            hidden_size=32,         # rnn hidden units
+            num_layers=1,           # number of rnn layers
+            batch_first=True,       # input & output have batch size as the first dimension, e.g. (batch, time_step, input_size)
+        )
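Side note on batch_first=True: the input and output then carry the batch on the first dimension, i.e. (batch, time_step, input_size) in and (batch, time_step, hidden_size) out, while the hidden state stays (num_layers, batch, hidden_size). A small standalone shape check (the demo_* names are illustrative only):

    demo_rnn = nn.RNN(input_size=1, hidden_size=32, num_layers=1, batch_first=True)
    demo_in = Variable(torch.zeros(2, 5, 1))    # batch=2, time_step=5, input_size=1
    demo_out, demo_h = demo_rnn(demo_in)
    print(demo_out.size())    # (2, 5, 32)  -> (batch, time_step, hidden_size)
    print(demo_h.size())      # (1, 2, 32)  -> (num_layers, batch, hidden_size)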
+        self.out = nn.Linear(32, 1)
+
+    def forward(self, x, h_state):
+        # x (batch, time_step, input_size)
+        # h_state (n_layers, batch, hidden_size)
+        # r_out (batch, time_step, output_size)
+        r_out, h_state = self.rnn(x, h_state)
+
+        outs = []    # this is where you can see that torch is dynamic
+        for time_step in range(r_out.size(1)):    # calculate the output for each time step
+            outs.append(self.out(r_out[:, time_step, :]))
+        return torch.stack(outs, dim=1), h_state
+
+
+rnn = RNN()
+print(rnn)
+
+optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
+loss_func = nn.MSELoss()                                # mean squared error loss, used for regression
+
+h_state = None   # the initial hidden state
+
+plt.figure(1, figsize=(12, 5))
+plt.ion()   # continuously plot
+plt.show()
+
+for step in range(60):
+    start, end = step * np.pi, (step+1)*np.pi
+    # use sin to predict cos
+    steps = np.linspace(start, end, 10, dtype=np.float32)
+    x_np = np.sin(steps)    # float32 so it converts to a torch FloatTensor
+    y_np = np.cos(steps)
+
+    x = Variable(torch.from_numpy(x_np[np.newaxis, :, np.newaxis]))   # shape (batch, time_step, input_size)
+    y = Variable(torch.from_numpy(y_np[np.newaxis, :, np.newaxis]))
+
+    prediction, h_state = rnn(x, h_state)   # rnn output
+    # !! the next step is important !!
+    h_state = Variable(h_state.data)        # repack the hidden state, break the connection from the last iteration
+
+    loss = loss_func(prediction, y)         # mean squared error loss
+    optimizer.zero_grad()                   # clear gradients for this training step
+    loss.backward()                         # backpropagation, compute gradients
+    optimizer.step()                        # apply gradients
+
+    # plotting
+    plt.plot(steps, y_np.flatten(), 'r-')
+    plt.plot(steps, prediction.data.numpy().flatten(), 'b-')
+    plt.draw()
+    plt.pause(0.05)
+
+plt.ioff()
+plt.show()
diff --git a/tensorflowTUT/tf20_RNN2/full_code.py b/tensorflowTUT/tf20_RNN2/full_code.py
index 0cd4eed..b46f670 100644
--- a/tensorflowTUT/tf20_RNN2/full_code.py
+++ b/tensorflowTUT/tf20_RNN2/full_code.py
@@ -124,7 +124,7 @@ def RNN(X, weights, biases):
         print(sess.run(accuracy, feed_dict={
             x: batch_xs,
             y: batch_ys,
-        }))
+            }))
        step += 1