Commit d783be8 (parent 99c26f8): 3 changed files with 152 additions and 0 deletions.
@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # Make a sequence of length 5

# Initialize the hidden state.
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))

for i in inputs:
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# Alternatively, we can do the entire sequence all at once.
# The first value returned by LSTM is all of the hidden states throughout
# the sequence; the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same).
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence.
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument to the lstm at a later time.
# Add the extra 2nd dimension.
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out the hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)
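A quick sanity check of the comment above, reusing the `out` and `hidden` tensors from the batched call, confirms that the last time step of `out` equals the final hidden state h_n:

# Sketch of a sanity check (reusing `out` and `hidden` from the code above):
# the last time step of `out` matches the final hidden state in `hidden[0]`.
print(torch.allclose(out[-1], hidden[0][-1]))  # expected: True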
@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right.
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array.
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


# The model itself is left unimplemented here; see the sketch after this file.
class CBOW(nn.Module):
    def __init__(self):
        pass

    def forward(self, inputs):
        pass


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)
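The `CBOW` class above is only a stub. Below is a minimal sketch of one possible way to fill it in; the `CBOWSketch` name, the embedding dimension, and the single linear layer are illustrative assumptions, not part of the original commit.

# Sketch of one possible CBOW implementation (names and sizes are assumptions).
EMBEDDING_DIM = 10


class CBOWSketch(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        # Sum the context word embeddings into one vector, project to
        # vocabulary-sized scores, and return log-probabilities.
        embeds = self.embeddings(inputs).sum(dim=0).view(1, -1)
        return F.log_softmax(self.linear(embeds), dim=1)


# Example forward pass on the first training example:
context_vector = make_context_vector(data[0][0], word_to_ix)  # shape (4,)
sketch_model = CBOWSketch(vocab_size, EMBEDDING_DIM)
print(sketch_model(context_vector).shape)  # torch.Size([1, vocab_size])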
@@ -0,0 +1,86 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5-dimensional embeddings
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

# We will use Shakespeare Sonnet 2.
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# We should tokenize the input, but we will ignore that for now.
# Build a list of tuples. Each tuple is ([ word_i-2, word_i-1 ], target word).
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# Print the first 3, just so you can see what they look like.
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}


class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:
        # Step 1. Prepare the inputs to be passed to the model (i.e., turn
        # the words into integer indices and wrap them in tensors).
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing
        # in a new instance, you need to zero out the gradients from the old
        # instance.
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words.
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor.)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the parameters.
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item().
        total_loss += loss.item()

    losses.append(total_loss)
print(losses)  # The loss decreases with every iteration over the training data!