From 2d27a957d703fb3d017c6e7e0589c7f5a7232f9c Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 20:53:26 +0530
Subject: [PATCH 01/16] Port NER API

---
 src/Sequence Labelling/NER_DataDeps.jl       | 29 +++++++
 src/Sequence Labelling/ner.jl                | 38 +++++++++
 src/Sequence Labelling/sequence_labelling.jl | 83 ++++++++++++++++++++
 src/TextAnalysis.jl                          | 12 ++-
 4 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 src/Sequence Labelling/NER_DataDeps.jl
 create mode 100644 src/Sequence Labelling/ner.jl
 create mode 100644 src/Sequence Labelling/sequence_labelling.jl

diff --git a/src/Sequence Labelling/NER_DataDeps.jl b/src/Sequence Labelling/NER_DataDeps.jl
new file mode 100644
index 00000000..c42c1abc
--- /dev/null
+++ b/src/Sequence Labelling/NER_DataDeps.jl
@@ -0,0 +1,29 @@
+using DataDeps
+
+register(DataDep("NER Model Weights",
+    """
+    The weights for NER Sequence Labelling Model.
+    """,
+    "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz",
+    post_fetch_method = function(fn)
+        unpack(fn)
+        dir = "weights"
+        innerfiles = readdir(dir)
+        mv.(joinpath.(dir, innerfiles), innerfiles)
+        rm(dir)
+    end
+))
+
+register(DataDep("NER Model Dicts",
+    """
+    The character and words dict for NER Sequence Labelling Model.
+    """,
+    "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz",
+    post_fetch_method = function(fn)
+        unpack(fn)
+        dir = "model_dicts"
+        innerfiles = readdir(dir)
+        mv.(joinpath.(dir, innerfiles), innerfiles)
+        rm(dir)
+    end
+))

diff --git a/src/Sequence Labelling/ner.jl b/src/Sequence Labelling/ner.jl
new file mode 100644
index 00000000..02b8adc7
--- /dev/null
+++ b/src/Sequence Labelling/ner.jl
@@ -0,0 +1,38 @@
+using BSON, Tracker
+
+const NER_Char_UNK = '¿'
+const NER_Word_UNK = "<UNK>"
+
+struct NERmodel{M}
+    model::M
+end
+
+function load_model_dicts(filepath)
+    labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
+    chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
+    words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]
+
+    return remove_ner_label_prefix.([labels...]), chars_idx, words_idx
+end
+
+NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights")
+
+function NERTagger(dicts_path, weights_path)
+    labels, chars_idx, words_idx = load_model_dicts(dicts_path)
+    model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path)
+    NERmodel(model)
+end
+
+function (a::NERmodel)(sentence::String)
+    a(WordTokenizers.tokenize(sentence))
+end
+
+function (a::NERmodel)(tokens::Array{String,1})
+    input_oh = [onehotinput(a.model, token) for token in tokens]
+    return (a.model)(input_oh)
+end
+
+function remove_ner_label_prefix(str)
+    str == "O" && return str
+    str = str[3:end]
+end

diff --git a/src/Sequence Labelling/sequence_labelling.jl b/src/Sequence Labelling/sequence_labelling.jl
new file mode 100644
index 00000000..e804d87b
--- /dev/null
+++ b/src/Sequence Labelling/sequence_labelling.jl
@@ -0,0 +1,83 @@
+using BSON, Tracker
+
+mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A}
+    labels::Array{String, 1}       # List of Labels
+    chars_idx::Dict{Char, Int64}   # Dict that maps chars to indices in W_Char_Embed
+    words_idx::Dict{String, Int64} # Dict that maps words to indices in W_word_Embed
+    conv1::C          # Convolution Layer over W_Char_Embed to give character representation
+    W_Char_Embed::W   # Weights for character embeddings
+    W_word_Embed::W   # Further trained GloVe Embeddings
+    forward_lstm::L   # Forward LSTM
+    backward::L       # Backward LSTM
+    d_out::D          # Dense_out
+    c::O              # CRF
+    init_α::A
+    UNK_Word_idx::Integer
+    UNK_char_idx::Integer
+end
+
+# BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx) =
+    # BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, :cpu)
+
+function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx; CHAR_EMBED_DIMS=25, WORD_EMBED_DIMS=100,
+        CNN_OUTPUT_SIZE=30, CONV_PAD= (0,2), CONV_WINDOW_LENGTH = 3, LSTM_STATE_SIZE = 200)
+    n = length(labels)
+    init_α = fill(-10000, (n + 2, 1))
+    init_α[n + 1] = 0
+
+    BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, Conv((CHAR_EMBED_DIMS, CONV_WINDOW_LENGTH), 1=>CNN_OUTPUT_SIZE, pad=CONV_PAD),
+        rand(CHAR_EMBED_DIMS, length(chars_idx)), rand(WORD_EMBED_DIMS, length(words_idx)),
+        LSTM(CNN_OUTPUT_SIZE + WORD_EMBED_DIMS, LSTM_STATE_SIZE), LSTM(CNN_OUTPUT_SIZE + WORD_EMBED_DIMS, LSTM_STATE_SIZE),
+        Dense(LSTM_STATE_SIZE * 2, length(labels) + 2), CRF(n), init_α, UNK_Word_idx, UNK_char_idx)
+end
+
+function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx, weights_path)
+    n = length(labels)
+    init_α = fill(-10000, (n + 2, 1))
+    init_α[n + 1] = 0
+
+    W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
+    W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
+    forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
+    backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
+    d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
+    c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
+    conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]
+
+    BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed,
+        forward_lstm, backward, d_out, c, init_α, UNK_Word_idx, UNK_char_idx)
+end
+
+function (a::BiLSTM_CNN_CRF_Model)(x)
+    char_features = Chain(x -> reshape(x, size(x)..., 1,1),
+        a.conv1,
+        x -> maximum(x, dims=2),
+        x -> reshape(x, length(x),1))
+    input_embeddings((w, cs)) = vcat(a.W_word_Embed * w, char_features(a.W_Char_Embed * cs))
+    backward_lstm(x) = reverse((a.backward).(reverse(x)))
+    bilstm_layer(x) = vcat.((a.forward_lstm).(x), backward_lstm(x))
+    m = Chain(x -> input_embeddings.(x),
+        bilstm_layer,
+        x -> (a.d_out).(x))
+
+    oh_outs = viterbi_decode(a.c, m(x), a.init_α)
+    Flux.reset!(a.backward)
+    Flux.reset!(a.forward_lstm)
+    [a.labels[oh.ix] for oh in oh_outs]
+end
+
+# function load(m::BiLSTM_CNN_CRF_Model, weights_path)
+#     m.conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]
+#     println("ConvLoaded")
+#     m.W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
+#     println("W Word loaded")
+#     m.W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
+#     println("W char loaded")
+#     m.forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
+#     m.backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
+#     m.d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
+#     m.c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
+# end
+
+onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)),
+    onehotbatch([get(m.chars_idx, c, m.UNK_char_idx) for c in word], 1:length(m.chars_idx)))

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 73324d69..9c95a71f 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -7,7 +7,8 @@ module TextAnalysis
     using DataFrames
     using WordTokenizers
 
-    using Flux
+    using DataDeps
+    using Flux, Tracker
     using Flux: identity, onehot, onecold, @treelike
 
     import DataFrames.DataFrame
@@ -60,6 +61,8 @@ module TextAnalysis
 
     export CRF, viterbi_decode, crf_loss
 
+    export NERTagger, Tracker, Flux
+
    include("tokenizer.jl")
     include("ngramizer.jl")
     include("document.jl")
@@ -74,6 +77,10 @@ module TextAnalysis
     end
     include(depsjl_path)
 
+    function __init__()
+        include(joinpath(@__DIR__, "Sequence Labelling/NER_DataDeps.jl"))
+    end
+
     include("stemmer.jl")
     include("dtm.jl")
     include("tf_idf.jl")
@@ -95,4 +102,7 @@ module TextAnalysis
     include("CRF/crf_utils.jl")
     include("CRF/loss.jl")
 
+    # NER
+    include("Sequence Labelling/ner.jl")
+    include("Sequence Labelling/sequence_labelling.jl")
 end

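A note on `remove_ner_label_prefix` above: the model's label file stores CoNLL-style BIO tags, where a `B-`/`I-` prefix marks the beginning or inside of an entity span, and the tagger exposes only the entity class. A standalone sketch of that mapping (the helper name and sample labels are illustrative, not package API):

```julia
# Same logic as remove_ner_label_prefix in the diff above: keep "O"
# unchanged, otherwise drop the two-character "B-"/"I-" prefix.
strip_bio_prefix(label) = label == "O" ? label : label[3:end]

sample = ["O", "B-PER", "I-PER", "B-LOC", "I-MISC"]  # illustrative labels
@assert strip_bio_prefix.(sample) == ["O", "PER", "PER", "LOC", "MISC"]
```
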
From e18d0f970c1fca6f0b1574d9ed88516464df5013 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 22:07:50 +0530
Subject: [PATCH 02/16] Fix NER API

---
 src/Sequence Labelling/ner.jl                |  2 +-
 src/Sequence Labelling/sequence_labelling.jl | 18 +-----------------
 src/TextAnalysis.jl                          |  2 +-
 3 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/src/Sequence Labelling/ner.jl b/src/Sequence Labelling/ner.jl
index 02b8adc7..c52964ae 100644
--- a/src/Sequence Labelling/ner.jl
+++ b/src/Sequence Labelling/ner.jl
@@ -23,7 +23,7 @@ function NERTagger(dicts_path, weights_path)
     NERmodel(model)
 end
 
-function (a::NERmodel)(sentence::String)
+function (a::NERmodel)(sentence::AbstractString)
     a(WordTokenizers.tokenize(sentence))
 end
 
diff --git a/src/Sequence Labelling/sequence_labelling.jl b/src/Sequence Labelling/sequence_labelling.jl
index e804d87b..64d17a0a 100644
--- a/src/Sequence Labelling/sequence_labelling.jl
+++ b/src/Sequence Labelling/sequence_labelling.jl
@@ -11,14 +11,11 @@ mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A}
     backward::L       # Backward LSTM
     d_out::D          # Dense_out
     c::O              # CRF
-    init_α::A
+    init_α::A         # For CRF layer
     UNK_Word_idx::Integer
     UNK_char_idx::Integer
 end
 
-# BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx) =
-    # BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, :cpu)
-
 function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx; CHAR_EMBED_DIMS=25, WORD_EMBED_DIMS=100,
         CNN_OUTPUT_SIZE=30, CONV_PAD= (0,2), CONV_WINDOW_LENGTH = 3, LSTM_STATE_SIZE = 200)
     n = length(labels)
@@ -66,18 +63,5 @@ function (a::BiLSTM_CNN_CRF_Model)(x)
     [a.labels[oh.ix] for oh in oh_outs]
 end
 
-# function load(m::BiLSTM_CNN_CRF_Model, weights_path)
-#     m.conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]
-#     println("ConvLoaded")
-#     m.W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
-#     println("W Word loaded")
-#     m.W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
-#     println("W char loaded")
-#     m.forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
-#     m.backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
-#     m.d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
-#     m.c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
-# end
-
 onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)),
     onehotbatch([get(m.chars_idx, c, m.UNK_char_idx) for c in word], 1:length(m.chars_idx)))

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 9c95a71f..f62a7a78 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -9,7 +9,7 @@ module TextAnalysis
 
     using DataDeps
     using Flux, Tracker
-    using Flux: identity, onehot, onecold, @treelike
+    using Flux: identity, onehot, onecold, @treelike, onehotbatch
 
     import DataFrames.DataFrame
     import Base.depwarn

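The `String` to `AbstractString` widening in this commit matters because `split` and, typically, `WordTokenizers.split_sentences` return `SubString{String}` views rather than fresh `String`s, so the narrower method would not match them. A minimal, self-contained illustration of the dispatch difference (the functions below are stand-ins, not TextAnalysis code):

```julia
# Stand-ins demonstrating why the method signature was widened.
tag_narrow(s::String) = "ok"        # pre-patch signature
tag_wide(s::AbstractString) = "ok"  # post-patch signature

piece = split("First sentence. Second.", ". ")[1]  # a SubString{String}
@assert piece isa AbstractString && !(piece isa String)
# tag_narrow(piece) would throw a MethodError; the widened method works:
@assert tag_wide(piece) == "ok"
```
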
From 5730d5d4358d5bd66c8b65b2374a8e51eced06a9 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 22:08:13 +0530
Subject: [PATCH 03/16] Document NER API

---
 docs/make.jl    |   3 +-
 docs/src/ner.md | 112 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/ner.md

diff --git a/docs/make.jl b/docs/make.jl
index fb09b906..92f66925 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -16,7 +16,8 @@ makedocs(
         "Semantic Analysis" => "semantic.md",
         "Classifier" => "classify.md",
         "Extended Example" => "example.md",
-        "Conditional Random Fields" => "crf.md"
+        "Conditional Random Fields" => "crf.md",
+        "Named Entity Recognition" => "ner.md"
     ],
 )
 
diff --git a/docs/src/ner.md b/docs/src/ner.md
new file mode 100644
index 00000000..88620822
--- /dev/null
+++ b/docs/src/ner.md
@@ -0,0 +1,112 @@
+# Named Entity Recognition
+
+The API provided is a pretrained model for tagging Named Entities.
+The current model supports 4 types of Named Entities, plus a non-entity tag -
+
+- `PER`: Person
+- `LOC`: Location
+- `ORG`: Organisation
+- `MISC`: Miscellaneous
+- `O`: Not a Named Entity
+
+To use the API, we first load the model weights into an instance of the tagger.
+The constructor also accepts the paths of the model weights and the model dicts (for character and word embeddings).
+
+    NERTagger()
+    NERTagger(dicts_path, weights_path)
+
+```julia
+julia> ner = NERTagger()
+```
+!!! note
+    When you call `NERTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Upon downloading, these are stored locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again.
+
+Once we create an instance, we can call it to tag a sentence or a sequence of tokens.
+
+    (ner::NERTagger)(sentence::String)
+    (ner::NERTagger)(tokens::Array{String, 1})
+
+```julia
+julia> sentence = "This package is maintained by John Doe."
+"This package is maintained by John Doe."
+
+julia> tags = ner(sentence)
+8-element Array{String,1}:
+ "O"
+ "O"
+ "O"
+ "O"
+ "O"
+ "PER"
+ "PER"
+ "O"
+
+```
+
+The API tokenizes the input sentence via the default tokenizer provided by `WordTokenizers`, which is currently set to the multilingual TokTok tokenizer.
+
+```
+julia> using WordTokenizers
+
+julia> collect(zip(WordTokenizers.tokenize(sentence), tags))
+8-element Array{Tuple{String,String},1}:
+ ("This", "O")
+ ("package", "O")
+ ("is", "O")
+ ("maintained", "O")
+ ("by", "O")
+ ("John", "PER")
+ ("Doe", "PER")
+ (".", "O")
+
+```
+
+For tagging a multi-sentence text or document, one can use `split_sentences` from the `WordTokenizers.jl` package and run the NER model on each sentence.
+
+```julia
+julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politiian." # Sentence taken from CoNLL 2003 Dataset
+
+julia> splitted_sents = WordTokenizers.split_sentences(sentences)
+
+julia> tag_sequences = ner.(splitted_sents)
+2-element Array{Array{String,1},1}:
+ ["PER", "O", "O", "O", "O", "O", "O", "O", "O"]
+ ["O", "O", "O", "O", "O", "PER", "PER", "O", "O", "O", "MISC", "O", "O", "LOC", "O", "O", "ORG", "ORG", "O", "O"]
+
+julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)]
+
+julia> zipped[1]
+9-element Array{Tuple{String,String},1}:
+ ("PER", "Rabinov")
+ ("O", "is")
+ ("O", "winding")
+ ("O", "up")
+ ("O", "his")
+ ("O", "term")
+ ("O", "as")
+ ("O", "ambassador")
+ ("O", ".")
+
+julia> zipped[2]
+20-element Array{Tuple{String,String},1}:
+ ("O", "He")
+ ("O", "will")
+ ("O", "be")
+ ("O", "replaced")
+ ("O", "by")
+ ("PER", "Eliahu")
+ ("PER", "Ben-Elissar")
+ ("O", ",")
+ ("O", "a")
+ ("O", "former")
+ ("MISC", "Israeli")
+ ("O", "envoy")
+ ("O", "to")
+ ("LOC", "Egypt")
+ ("O", "and")
+ ("O", "right-wing")
+ ("ORG", "Likud")
+ ("ORG", "party")
+ ("O", "politiian")
+ ("O", ".")
+```

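The documented interface yields one tag per token. Callers who want entity strings instead can merge runs of identically tagged tokens; a small helper sketch follows (hypothetical, not part of the package; note that since the tagger strips the B-/I- prefixes, it cannot split two adjacent entities of the same type):

```julia
# Collect (entity_text, tag) pairs by merging consecutive tokens that
# share the same non-"O" tag. Illustrative helper, not TextAnalysis API.
function group_entities(tokens::Vector{String}, tags::Vector{String})
    spans = Tuple{String, String}[]
    i = 1
    while i <= length(tags)
        if tags[i] != "O"
            j = i
            while j < length(tags) && tags[j + 1] == tags[i]
                j += 1
            end
            push!(spans, (join(tokens[i:j], " "), tags[i]))
            i = j + 1
        else
            i += 1
        end
    end
    return spans
end

tokens = ["This", "package", "is", "maintained", "by", "John", "Doe", "."]
tags   = ["O", "O", "O", "O", "O", "PER", "PER", "O"]
@assert group_entities(tokens, tags) == [("John Doe", "PER")]
```
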
From 3d88d2648a998d7bfdac940375eac41b62434df0 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 22:30:10 +0530
Subject: [PATCH 04/16] Add tests for NER

---
 test/ner.jl      | 22 ++++++++++++++++++++++
 test/runtests.jl |  2 +-
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 test/ner.jl

diff --git a/test/ner.jl b/test/ner.jl
new file mode 100644
index 00000000..1578df9b
--- /dev/null
+++ b/test/ner.jl
@@ -0,0 +1,22 @@
+using WordTokenizers
+
+@testset "NER" begin
+    ner = NERTagger()
+
+    @testset "Basic" begin
+        str = "Mr. Foo Bar works in Google, California."
+        @test ner(str) == ["O", "PER", "PER", "O", "O", "ORG", "O", "LOC", "O"]
+
+        str = "If the Irish win the World Cup this year, it will be their 3rd time in a row."
+        @test ner(str) == [ "O", "O", "MISC", "O", "O", "MISC", "MISC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]
+    end
+
+    @testset "Unknown Unicode characters" begin
+        # Making sure that the NER model handles unknown unicode characters
+        str = "आ β⬰ 5¥ "
+        @test length(ner(str)) == length(WordTokenizers.tokenize(str))
+
+        str = "You owe John Doe 5¥."
+        @test ner(str) == [ "O", "O", "PER", "PER", "O", "O", "O"]
+    end
+end

diff --git a/test/runtests.jl b/test/runtests.jl
index 89664788..5035e67f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,8 +9,8 @@ using WordTokenizers
 
 println("Running tests:")
 
+include("ner.jl")
 include("crf.jl")
-
 include("tokenizer.jl")
 include("ngramizer.jl")
 include("document.jl")

From ebff0cf0dcd73bb20fbf105d9de71f9e84e6f721 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 22:32:06 +0530
Subject: [PATCH 05/16] Update travis, appveyor scripts to always accept datadeps

---
 .travis.yml  | 2 ++
 appveyor.yml | 1 +
 2 files changed, 3 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 19e1aa7b..4d73a9c5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,8 @@
 language: julia
 os:
   - linux
+env:
+  - DATADEPS_ALWAYS_ACCEPT=true
 julia:
   - 0.7
   - 1.0
diff --git a/appveyor.yml b/appveyor.yml
index ea42c124..d32e6d4a 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,4 +1,5 @@
 environment:
+  DATADEPS_ALWAYS_ACCEPT: True
   matrix:
     - julia_version: 0.7
     - julia_version: 1

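`DATADEPS_ALWAYS_ACCEPT` makes DataDeps.jl skip its interactive download prompt, which would otherwise block a non-interactive CI run. The same switch works from inside Julia, as long as it is set before the first download is triggered; a sketch:

```julia
# Equivalent of the CI settings above, set from within Julia. DataDeps.jl
# consults this environment variable before prompting for a download.
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

using TextAnalysis
ner = NERTagger()  # weights and dicts now download without a prompt
```
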
From 49aea7c9dd8f04fbf6ef33a83082b88e7d91a56d Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 10 Aug 2019 15:43:40 +0530
Subject: [PATCH 06/16] Call function instead of using include for datadeps

---
 src/Sequence Labelling/NER_DataDeps.jl | 29 --------------
 src/Sequence Labelling/ner.jl          | 38 -------------------
 src/TextAnalysis.jl                    | 13 ++++---
 src/sequence/ner_datadeps.jl           | 29 ++++++++++++++
 .../sequence_models.jl}                |  0
 5 files changed, 36 insertions(+), 73 deletions(-)
 delete mode 100644 src/Sequence Labelling/NER_DataDeps.jl
 delete mode 100644 src/Sequence Labelling/ner.jl
 create mode 100644 src/sequence/ner_datadeps.jl
 rename src/{Sequence Labelling/sequence_labelling.jl => sequence/sequence_models.jl} (100%)

diff --git a/src/Sequence Labelling/NER_DataDeps.jl b/src/Sequence Labelling/NER_DataDeps.jl
deleted file mode 100644
index c42c1abc..00000000
--- a/src/Sequence Labelling/NER_DataDeps.jl
+++ /dev/null
@@ -1,29 +0,0 @@
-using DataDeps
-
-register(DataDep("NER Model Weights",
-    """
-    The weights for NER Sequence Labelling Model.
-    """,
-    "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz",
-    post_fetch_method = function(fn)
-        unpack(fn)
-        dir = "weights"
-        innerfiles = readdir(dir)
-        mv.(joinpath.(dir, innerfiles), innerfiles)
-        rm(dir)
-    end
-))
-
-register(DataDep("NER Model Dicts",
-    """
-    The character and words dict for NER Sequence Labelling Model.
-    """,
-    "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz",
-    post_fetch_method = function(fn)
-        unpack(fn)
-        dir = "model_dicts"
-        innerfiles = readdir(dir)
-        mv.(joinpath.(dir, innerfiles), innerfiles)
-        rm(dir)
-    end
-))

diff --git a/src/Sequence Labelling/ner.jl b/src/Sequence Labelling/ner.jl
deleted file mode 100644
index c52964ae..00000000
--- a/src/Sequence Labelling/ner.jl
+++ /dev/null
@@ -1,38 +0,0 @@
-using BSON, Tracker
-
-const NER_Char_UNK = '¿'
-const NER_Word_UNK = "<UNK>"
-
-struct NERmodel{M}
-    model::M
-end
-
-function load_model_dicts(filepath)
-    labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
-    chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
-    words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]
-
-    return remove_ner_label_prefix.([labels...]), chars_idx, words_idx
-end
-
-NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights")
-
-function NERTagger(dicts_path, weights_path)
-    labels, chars_idx, words_idx = load_model_dicts(dicts_path)
-    model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path)
-    NERmodel(model)
-end
-
-function (a::NERmodel)(sentence::AbstractString)
-    a(WordTokenizers.tokenize(sentence))
-end
-
-function (a::NERmodel)(tokens::Array{String,1})
-    input_oh = [onehotinput(a.model, token) for token in tokens]
-    return (a.model)(input_oh)
-end
-
-function remove_ner_label_prefix(str)
-    str == "O" && return str
-    str = str[3:end]
-end

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 39a9bf52..50863ae3 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -80,10 +80,6 @@ module TextAnalysis
     end
     include(depsjl_path)
 
-    function __init__()
-        include(joinpath(@__DIR__, "Sequence Labelling/NER_DataDeps.jl"))
-    end
-
     include("stemmer.jl")
     include("dtm.jl")
     include("tf_idf.jl")
@@ -107,6 +103,11 @@ module TextAnalysis
     include("CRF/loss.jl")
 
     # NER
-    include("Sequence Labelling/ner.jl")
-    include("Sequence Labelling/sequence_labelling.jl")
+    include("sequence/ner_datadeps.jl")
+    include("sequence/ner.jl")
+    include("sequence/sequence_models.jl")
+
+    function __init__()
+        ner_datadep_register()
+    end
 end

diff --git a/src/sequence/ner_datadeps.jl b/src/sequence/ner_datadeps.jl
new file mode 100644
index 00000000..d5ce83d0
--- /dev/null
+++ b/src/sequence/ner_datadeps.jl
@@ -0,0 +1,29 @@
+function ner_datadep_register()
+    register(DataDep("NER Model Weights",
+        """
+        The weights for NER Sequence Labelling Model.
+        """,
+        "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz",
+        post_fetch_method = function(fn)
+            unpack(fn)
+            dir = "weights"
+            innerfiles = readdir(dir)
+            mv.(joinpath.(dir, innerfiles), innerfiles)
+            rm(dir)
+        end
+    ))
+
+    register(DataDep("NER Model Dicts",
+        """
+        The character and words dict for NER Sequence Labelling Model.
+        """,
+        "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz",
+        post_fetch_method = function(fn)
+            unpack(fn)
+            dir = "model_dicts"
+            innerfiles = readdir(dir)
+            mv.(joinpath.(dir, innerfiles), innerfiles)
+            rm(dir)
+        end
+    ))
+end

diff --git a/src/Sequence Labelling/sequence_labelling.jl b/src/sequence/sequence_models.jl
similarity index 100%
rename from src/Sequence Labelling/sequence_labelling.jl
rename to src/sequence/sequence_models.jl

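The motivation for this commit: code `include`d at the top level runs during precompilation, and `DataDep` registrations made then are not replayed when the cached module is loaded later. Wrapping the `register` calls in a function invoked from `__init__`, which runs on every `using`, is the pattern DataDeps.jl expects. A condensed sketch of the shape this gives the module (not the full module):

```julia
module MiniExample

using DataDeps

function ner_datadep_register()
    # register(DataDep(...)) calls go here; deferring them to __init__
    # ensures registration happens at load time, not precompile time.
end

function __init__()
    ner_datadep_register()
end

end
```
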
- """, - "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz", - post_fetch_method = function(fn) - unpack(fn) - dir = "model_dicts" - innerfiles = readdir(dir) - mv.(joinpath.(dir, innerfiles), innerfiles) - rm(dir) - end -)) diff --git a/src/Sequence Labelling/ner.jl b/src/Sequence Labelling/ner.jl deleted file mode 100644 index c52964ae..00000000 --- a/src/Sequence Labelling/ner.jl +++ /dev/null @@ -1,38 +0,0 @@ -using BSON, Tracker - -const NER_Char_UNK = '¿' -const NER_Word_UNK = "" - -struct NERmodel{M} - model::M -end - -function load_model_dicts(filepath) - labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels] - chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index] - words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index] - - return remove_ner_label_prefix.([labels...]), chars_idx, words_idx -end - -NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights") - -function NERTagger(dicts_path, weights_path) - labels, chars_idx, words_idx = load_model_dicts(dicts_path) - model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path) - NERmodel(model) -end - -function (a::NERmodel)(sentence::AbstractString) - a(WordTokenizers.tokenize(sentence)) -end - -function (a::NERmodel)(tokens::Array{String,1}) - input_oh = [onehotinput(a.model, token) for token in tokens] - return (a.model)(input_oh) -end - -function remove_ner_label_prefix(str) - str == "O" && return str - str = str[3:end] -end diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 39a9bf52..50863ae3 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -80,10 +80,6 @@ module TextAnalysis end include(depsjl_path) - function __init__() - include(joinpath(@__DIR__, "Sequence Labelling/NER_DataDeps.jl")) - end - include("stemmer.jl") include("dtm.jl") include("tf_idf.jl") @@ -107,6 +103,11 @@ module TextAnalysis include("CRF/loss.jl") # NER - include("Sequence Labelling/ner.jl") - include("Sequence Labelling/sequence_labelling.jl") + include("sequence/ner_datadeps.jl") + include("sequence/ner.jl") + include("sequence/sequence_models.jl") + + function __init__() + ner_datadep_register() + end end diff --git a/src/sequence/ner_datadeps.jl b/src/sequence/ner_datadeps.jl new file mode 100644 index 00000000..d5ce83d0 --- /dev/null +++ b/src/sequence/ner_datadeps.jl @@ -0,0 +1,29 @@ +function ner_datadep_register() + register(DataDep("NER Model Weights", + """ + The weights for NER Sequence Labelling Model. + """, + "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz", + post_fetch_method = function(fn) + unpack(fn) + dir = "weights" + innerfiles = readdir(dir) + mv.(joinpath.(dir, innerfiles), innerfiles) + rm(dir) + end + )) + + register(DataDep("NER Model Dicts", + """ + The character and words dict for NER Sequence Labelling Model. 
+ """, + "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz", + post_fetch_method = function(fn) + unpack(fn) + dir = "model_dicts" + innerfiles = readdir(dir) + mv.(joinpath.(dir, innerfiles), innerfiles) + rm(dir) + end + )) +end diff --git a/src/Sequence Labelling/sequence_labelling.jl b/src/sequence/sequence_models.jl similarity index 100% rename from src/Sequence Labelling/sequence_labelling.jl rename to src/sequence/sequence_models.jl From e29dc7dcc4de86e5d0cad1e9a6766957a7ddcea8 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 10 Aug 2019 15:45:12 +0530 Subject: [PATCH 07/16] Add APIs for NER Tagging over Documents and Corpus --- src/sequence/ner.jl | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/sequence/ner.jl diff --git a/src/sequence/ner.jl b/src/sequence/ner.jl new file mode 100644 index 00000000..ea93532f --- /dev/null +++ b/src/sequence/ner.jl @@ -0,0 +1,50 @@ +using BSON, Tracker + +const NER_Char_UNK = '¿' +const NER_Word_UNK = "" + +struct NERmodel{M} + model::M +end + +function load_model_dicts(filepath) + labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels] + chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index] + words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index] + + return remove_ner_label_prefix.([labels...]), chars_idx, words_idx +end + +NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights") + +function NERTagger(dicts_path, weights_path) + labels, chars_idx, words_idx = load_model_dicts(dicts_path) + model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path) + NERmodel(model) +end + +function (a::NERmodel)(tokens::Array{String,1}) + input_oh = [onehotinput(a.model, token) for token in tokens] + return (a.model)(input_oh) +end + +function (a::NERmodel)(sentence::AbstractString) + a(WordTokenizers.tokenize(sentence)) +end + +function (a::NERmodel)(doc::AbstractDocument) + return vcat(a.(WordTokenizers.split_sentences(text(doc)))) +end + +function (a::NERmodel)(ngd::NGramDocument) + throw("Sequence Labelling not possible for NGramsDocument") +end + +function (a::NERmodel)(crps::Corpus) + return a.(crps.documents) +end + +function remove_ner_label_prefix(str) + str == "O" && return str + str = str[3:end] +end From 53e8eecc01c72fe8a40b5e8348d03ba99dd523ef Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 10 Aug 2019 15:45:57 +0530 Subject: [PATCH 08/16] Update docs for NER APIs --- docs/src/ner.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/src/ner.md b/docs/src/ner.md index 88620822..ec38bbd4 100644 --- a/docs/src/ner.md +++ b/docs/src/ner.md @@ -21,10 +21,14 @@ julia> ner = NERTagger() !!! note When you call `NERTagger()` for the first time, the package will request permission for download the `Model_dicts` and `Model_weights`. Upon downloading, these are store locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again. -Once we create an instance, we can call it to tag a sentence or a sequence of tokens. +Once we create an instance, we can call it to tag a String (sentence), sequence of tokens, `AbstractDocument` or `Corpus`. 
From 24c8a0345e48b450011438c6a92ed96494cbda1f Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 10 Aug 2019 16:09:45 +0530
Subject: [PATCH 09/16] Tests for new NER API over Docs and Corpus.

---
 test/ner.jl | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/test/ner.jl b/test/ner.jl
index 1578df9b..9430958f 100644
--- a/test/ner.jl
+++ b/test/ner.jl
@@ -19,4 +19,30 @@ using WordTokenizers
         str = "You owe John Doe 5¥."
         @test ner(str) == [ "O", "O", "PER", "PER", "O", "O", "O"]
     end
+
+    @testset "Documents and Corpus" begin
+        text1 = "We aRE vErY ClOSE tO ThE HEaDQuarTeRS."
+        text2 = "The World Health Organization (WHO) is a specialized agency of the United Nations that is concerned with international public health."
+
+        sd = StringDocument(text1)
+        td = TokenDocument(text2)
+
+        tags = ner(sd)
+        @test length(tags) == length(WordTokenizers.split_sentences(text1))
+        @test length(tags[1]) == length(WordTokenizers.tokenize(text1))
+        @test unique(vcat(tags...)) == ["O"]
+
+        tags = ner(td)
+        @test length(tags) == length(WordTokenizers.split_sentences(text2))
+        @test length(tags[1]) == length(WordTokenizers.tokenize(text2))
+        u = unique(vcat(tags...))
+        @test "O" ∈ u && "ORG" ∈ u
+
+        crps = Corpus([sd, td])
+        tags = ner(crps)
+
+        @test length(tags) == length(crps.documents)
+        @test tags[1] == ner(crps.documents[1])
+        @test tags[2] == ner(crps.documents[2])
+    end
 end

""", - "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz", + "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_weights.tar.xz", + "6290353b66c9bdbb794ddcb6063ab52c30145d3918f2f115f19e21fa994282e6", post_fetch_method = function(fn) unpack(fn) dir = "weights" @@ -17,7 +18,8 @@ function ner_datadep_register() """ The character and words dict for NER Sequence Labelling Model. """, - "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz", + "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_dicts.tar.xz", + "40cfa37da216b990eb9c257aa7994e34d7a7a59d69b2506c6f39120f2688dc11", post_fetch_method = function(fn) unpack(fn) dir = "model_dicts" From c2567b4f06ffa4327e46fcd36c40167f2ab22629 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 15:54:42 +0530 Subject: [PATCH 11/16] Update REQUIRE --- REQUIRE | 1 + 1 file changed, 1 insertion(+) diff --git a/REQUIRE b/REQUIRE index a66b3bb5..fea98e0a 100644 --- a/REQUIRE +++ b/REQUIRE @@ -8,3 +8,4 @@ BSON JSON DataStructures DataDeps +Tracker From 1051ac8276cef0379fe756f045cbd34914aba991 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 15:58:12 +0530 Subject: [PATCH 12/16] Minor fix --- src/sequence/sequence_models.jl | 1 - test/ner.jl | 2 +- test/runtests.jl | 8 +++----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/sequence/sequence_models.jl b/src/sequence/sequence_models.jl index 64d17a0a..74288940 100644 --- a/src/sequence/sequence_models.jl +++ b/src/sequence/sequence_models.jl @@ -1,5 +1,4 @@ using BSON, Tracker - mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A} labels::Array{String, 1} # List of Labels chars_idx::Dict{Char, Int64} # Dict that maps chars to indices in W_Char_Embed diff --git a/test/ner.jl b/test/ner.jl index 9430958f..da974535 100644 --- a/test/ner.jl +++ b/test/ner.jl @@ -1,4 +1,4 @@ -using WordTokenizers +using WordTokenizers, TextAnalysis, Test @testset "NER" begin ner = NERTagger() diff --git a/test/runtests.jl b/test/runtests.jl index 3e41a61b..0b45c79b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,6 @@ +println("Running tests:") +include("ner.jl") + module TestTextAnalysis using SparseArrays using Test @@ -5,11 +8,6 @@ using Languages using TextAnalysis using WordTokenizers -# @testset "TextAnalysis" begin - -println("Running tests:") - -include("ner.jl") include("coom.jl") include("crf.jl") include("tokenizer.jl") From 90d334ac84a2f0871526e75095c83cee897574fd Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 15:58:29 +0530 Subject: [PATCH 13/16] Add a Project.toml file --- Project.toml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Project.toml diff --git a/Project.toml b/Project.toml new file mode 100644 index 00000000..aa12254a --- /dev/null +++ b/Project.toml @@ -0,0 +1,26 @@ +name = "TextAnalysis" +uuid = "a2db99b7-8b79-58f8-94bf-bbc811eef33d" +license = "MIT" +desc = "Julia package for text analysis" + +[deps] +WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +BSON 
= "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[compat] +julia = "1" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] From a3792ec510227cd2b6fb5644c011f01da001186a Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 18:07:38 +0530 Subject: [PATCH 14/16] Add Statistics to Project.toml --- Project.toml | 25 +++++++++++++------------ test/ner.jl | 2 +- test/runtests.jl | 9 +++------ 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/Project.toml b/Project.toml index aa12254a..9fb316ff 100644 --- a/Project.toml +++ b/Project.toml @@ -4,20 +4,21 @@ license = "MIT" desc = "Julia package for text analysis" [deps] -WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" -DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" -Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" [compat] -julia = "1" +julia = "1.0" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/ner.jl b/test/ner.jl index da974535..9430958f 100644 --- a/test/ner.jl +++ b/test/ner.jl @@ -1,4 +1,4 @@ -using WordTokenizers, TextAnalysis, Test +using WordTokenizers @testset "NER" begin ner = NERTagger() diff --git a/test/runtests.jl b/test/runtests.jl index 0b45c79b..dbfc5e04 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,13 +1,12 @@ -println("Running tests:") -include("ner.jl") - -module TestTextAnalysis using SparseArrays using Test using Languages using TextAnalysis using WordTokenizers +println("Running tests:") + +include("ner.jl") include("coom.jl") include("crf.jl") include("tokenizer.jl") @@ -27,5 +26,3 @@ include("bayes.jl") include("taggingschemes.jl") include("averagePerceptronTagger.jl") include("evaluation_metrics.jl") - -end From bfc738f47e73e7354de6e126f5f9538140ec9115 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 18:18:54 +0530 Subject: [PATCH 15/16] Add BinaryProvider.jl to Project.toml --- Project.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 9fb316ff..63f81696 100644 --- a/Project.toml +++ b/Project.toml @@ -16,9 +16,10 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" [compat] -julia = "1.0" +julia = "1" [extras] Test = 
"8dfed614-e22c-5e08-85e1-65c5234f0b40" From ab62d3b17054bfc2ee3ed8ce7cbde096b5fa0457 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 18:29:32 +0530 Subject: [PATCH 16/16] Add manifest.toml --- Manifest.toml | 424 ++++++++++++++++++++++++++++++++++++++++++++++++++ Project.toml | 27 ++-- 2 files changed, 438 insertions(+), 13 deletions(-) create mode 100644 Manifest.toml diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 00000000..446c1d5a --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,424 @@ +[[AbstractTrees]] +deps = ["Markdown", "Test"] +git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" +uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +version = "0.2.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "82dab828020b872fa9efd3abec1152b075bc7cbf" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "1.0.0" + +[[BSON]] +deps = ["Profile", "Test"] +git-tree-sha1 = "6453cef4f9cb8ded8e28e4d6d12e11e20eb692ea" +uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +version = "0.2.3" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinDeps]] +deps = ["Compat", "Libdl", "SHA", "URIParser"] +git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" +uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" +version = "0.8.10" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "c7361ce8a2129f20b0e05a89f7070820cfed6648" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.6" + +[[CSTParser]] +deps = ["Tokenize"] +git-tree-sha1 = "c69698c3d4a7255bc1b4bc2afc09f59db910243b" +uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" +version = "0.6.2" + +[[CategoricalArrays]] +deps = ["Compat", "DataAPI", "Future", "JSON", "Missings", "Printf", "Reexport"] +git-tree-sha1 = "13240cfcc884837fc1aa89b60d500a652bcc3f10" +uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" +version = "0.5.5" + +[[CodecZlib]] +deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] +git-tree-sha1 = "05916673a2627dd91b4969ff8ba6941bc85a960e" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.6.0" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.8.0" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"] +git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.9.5" + +[[CommonSubexpressions]] +deps = ["Test"] +git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.2.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "2.1.0" + +[[Crayons]] +deps = ["Test"] +git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.0.0" + +[[DataAPI]] +git-tree-sha1 = "8903f0219d3472543fc4b2f5ebaf675a07f817c0" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.0.1" + +[[DataDeps]] +deps = ["HTTP", "Reexport", "SHA"] +git-tree-sha1 = "3fecf920ad9702f015a5ab198b233b4c1c50992a" +uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +version = "0.6.4" + 
+[[DataFrames]]
+deps = ["CategoricalArrays", "Compat", "DataAPI", "InvertedIndices", "IteratorInterfaceExtensions", "Missings", "PooledArrays", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
+git-tree-sha1 = "48ef38bd7cf0e8fd598bda981409eb6ef4b96cbd"
+uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+version = "0.19.2"
+
+[[DataStructures]]
+deps = ["InteractiveUtils", "OrderedCollections"]
+git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a"
+uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+version = "0.17.0"
+
+[[DataValueInterfaces]]
+git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
+uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
+version = "1.0.0"
+
+[[Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[DelimitedFiles]]
+deps = ["Mmap"]
+uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+
+[[DiffResults]]
+deps = ["Compat", "StaticArrays"]
+git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c"
+uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
+version = "0.0.4"
+
+[[DiffRules]]
+deps = ["Random", "Test"]
+git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7"
+uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
+version = "0.0.10"
+
+[[Distributed]]
+deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
+uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+
+[[FixedPointNumbers]]
+git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b"
+uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
+version = "0.6.1"
+
+[[Flux]]
+deps = ["AbstractTrees", "Adapt", "CodecZlib", "Colors", "DelimitedFiles", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SHA", "Statistics", "StatsBase", "Tracker", "ZipFile"]
+git-tree-sha1 = "08212989c2856f95f90709ea5fd824bd27b34514"
+uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+version = "0.8.3"
+
+[[ForwardDiff]]
+deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"]
+git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b"
+uuid = "f6369f11-7733-5829-9624-2563aa707210"
+version = "0.10.3"
+
+[[Future]]
+deps = ["Random"]
+uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
+
+[[HTML_Entities]]
+deps = ["StrTables"]
+git-tree-sha1 = "aa19515d6ebe7f91a39cfc1dc6341f38fcac1282"
+uuid = "7693890a-d069-55fe-a829-b4a6d304f0ee"
+version = "1.0.0"
+
+[[HTTP]]
+deps = ["Base64", "Dates", "IniFile", "MbedTLS", "Sockets"]
+git-tree-sha1 = "03ddc88af7f2d963fac5aa9f3ac8e11914d68a78"
+uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+version = "0.8.4"
+
+[[IniFile]]
+deps = ["Test"]
+git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8"
+uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f"
+version = "0.5.0"
+
+[[InteractiveUtils]]
+deps = ["LinearAlgebra", "Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[InvertedIndices]]
+deps = ["Test"]
+git-tree-sha1 = "15732c475062348b0165684ffe28e85ea8396afc"
+uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
+version = "1.0.0"
+
+[[IteratorInterfaceExtensions]]
+git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
+uuid = "82899510-4779-5014-852e-03e436cf321d"
+version = "1.0.0"
+
+[[JSON]]
+deps = ["Dates", "Mmap", "Parsers", "Unicode"]
+git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
+uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+version = "0.21.0"
+
+[[Juno]]
+deps = ["Base64", "Logging", "Media", "Profile", "Test"]
+git-tree-sha1 = "8426e073b1676acba2aea7a4a81d7a3af97a14fe"
+uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
+version = "0.7.1"
+
+[[Languages]]
+deps = ["JSON", "Test"]
+git-tree-sha1 = "fc6ee05e35074a66dc12a716065a25d9deece6fb"
+uuid = "8ef0a80b-9436-5d2c-a485-80b904378c43"
+version = "0.4.2"
+
+[[LibGit2]]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
+[[Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[LinearAlgebra]]
+deps = ["Libdl"]
+uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
+[[Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[MacroTools]]
+deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"]
+git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76"
+uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
+version = "0.5.1"
+
+[[Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[MbedTLS]]
+deps = ["BinaryProvider", "Dates", "Distributed", "Libdl", "Random", "Sockets", "Test"]
+git-tree-sha1 = "2d94286a9c2f52c63a16146bb86fd6cdfbf677c6"
+uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
+version = "0.6.8"
+
+[[Media]]
+deps = ["MacroTools", "Test"]
+git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58"
+uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27"
+version = "0.5.0"
+
+[[Missings]]
+deps = ["SparseArrays", "Test"]
+git-tree-sha1 = "f0719736664b4358aa9ec173077d4285775f8007"
+uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
+version = "0.4.1"
+
+[[Mmap]]
+uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+
+[[NNlib]]
+deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"]
+git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8"
+uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+version = "0.6.0"
+
+[[NaNMath]]
+deps = ["Compat"]
+git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
+uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
+version = "0.3.2"
+
+[[OrderedCollections]]
+deps = ["Random", "Serialization", "Test"]
+git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
+uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+version = "1.1.0"
+
+[[Parsers]]
+deps = ["Dates", "Test"]
+git-tree-sha1 = "db2b35dedab3c0e46dc15996d170af07a5ab91c9"
+uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
+version = "0.3.6"
+
+[[Pkg]]
+deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+
+[[PooledArrays]]
+git-tree-sha1 = "6e8c38927cb6e9ae144f7277c753714861b27d14"
+uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
+version = "0.5.2"
+
+[[Printf]]
+deps = ["Unicode"]
+uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[[Profile]]
+deps = ["Printf"]
+uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
+
+[[REPL]]
+deps = ["InteractiveUtils", "Markdown", "Sockets"]
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+
+[[Random]]
+deps = ["Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[Reexport]]
+deps = ["Pkg"]
+git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0"
+uuid = "189a3867-3050-52da-a836-e630ba90ab69"
+version = "0.2.0"
+
+[[Requires]]
+deps = ["Test"]
+git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1"
+uuid = "ae029012-a4dd-5104-9daa-d747884805df"
+version = "0.5.2"
+
+[[SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+
+[[Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[SharedArrays]]
+deps = ["Distributed", "Mmap", "Random", "Serialization"]
+uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
+
+[[Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[SortingAlgorithms]]
+deps = ["DataStructures", "Random", "Test"]
"Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"] +git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.7.2" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.11.0" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] +git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.32.0" + +[[StrTables]] +deps = ["Dates"] +git-tree-sha1 = "5998faae8c6308acc25c25896562a1e66a3bb038" +uuid = "9700d1a9-a7c8-5760-9816-a99fda30bb8f" +version = "1.0.1" + +[[TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "b1ad568ba658d8cbb3b892ed5380a6f3e781a81e" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.0" + +[[Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "TableTraits", "Test"] +git-tree-sha1 = "951b5be359e92703f886881b175ecfe924d8bd91" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "0.2.10" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TimerOutputs]] +deps = ["Crayons", "Printf", "Test", "Unicode"] +git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.0" + +[[Tokenize]] +git-tree-sha1 = "c8a8b00ae44a94950814ff77850470711a360225" +uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" +version = "0.5.5" + +[[Tracker]] +deps = ["Adapt", "DiffRules", "ForwardDiff", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Printf", "Random", "Requires", "SpecialFunctions", "Statistics", "Test"] +git-tree-sha1 = "327342fec6e09f68ced0c2dc5731ed475e4b696b" +uuid = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +version = "0.2.2" + +[[TranscodingStreams]] +deps = ["Random", "Test"] +git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.9.5" + +[[URIParser]] +deps = ["Test", "Unicode"] +git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" +uuid = "30578b45-9adc-5946-b283-645ec420af67" +version = "0.4.0" + +[[UUIDs]] +deps = ["Random"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[WordTokenizers]] +deps = ["HTML_Entities", "StrTables", "Unicode"] +git-tree-sha1 = "983ca717c4ec786d0458ebcbe395a1a50b3a1897" +uuid = "796a5d58-b03d-544a-977e-18100b691f6e" +version = "0.5.3" + +[[ZipFile]] +deps = ["BinaryProvider", "Libdl", "Printf"] +git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" +uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" +version = "0.8.3" diff --git a/Project.toml b/Project.toml index 63f81696..9d5b4633 100644 --- a/Project.toml +++ b/Project.toml @@ -4,19 +4,20 @@ license = "MIT" desc = "Julia package for text analysis" [deps] -BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -Flux = 
"587475ba-b771-5e3f-ad9e-33799f191a9c" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" -DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" -Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" -BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" [compat] julia = "1"