From 2d27a957d703fb3d017c6e7e0589c7f5a7232f9c Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 20:53:26 +0530
Subject: [PATCH 01/16] Port NER API

---
 src/Sequence Labelling/NER_DataDeps.jl       | 29 +++++++
 src/Sequence Labelling/ner.jl                | 38 +++++++++
 src/Sequence Labelling/sequence_labelling.jl | 83 ++++++++++++++++++++
 src/TextAnalysis.jl                          | 12 ++-
 4 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 src/Sequence Labelling/NER_DataDeps.jl
 create mode 100644 src/Sequence Labelling/ner.jl
 create mode 100644 src/Sequence Labelling/sequence_labelling.jl

diff --git a/src/Sequence Labelling/NER_DataDeps.jl b/src/Sequence Labelling/NER_DataDeps.jl
new file mode 100644
index 00000000..c42c1abc
--- /dev/null
+++ b/src/Sequence Labelling/NER_DataDeps.jl
@@ -0,0 +1,29 @@
+using DataDeps
+
+register(DataDep("NER Model Weights",
+    """
+    The weights for NER Sequence Labelling Model.
+    """,
+    "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz",
+    post_fetch_method = function(fn)
+        unpack(fn)
+        dir = "weights"
+        innerfiles = readdir(dir)
+        mv.(joinpath.(dir, innerfiles), innerfiles)
+        rm(dir)
+    end
+))
+
+register(DataDep("NER Model Dicts",
+    """
+    The character and words dict for NER Sequence Labelling Model.
+    """,
+    "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz",
+    post_fetch_method = function(fn)
+        unpack(fn)
+        dir = "model_dicts"
+        innerfiles = readdir(dir)
+        mv.(joinpath.(dir, innerfiles), innerfiles)
+        rm(dir)
+    end
+))

diff --git a/src/Sequence Labelling/ner.jl b/src/Sequence Labelling/ner.jl
new file mode 100644
index 00000000..02b8adc7
--- /dev/null
+++ b/src/Sequence Labelling/ner.jl
@@ -0,0 +1,38 @@
+using BSON, Tracker
+
+const NER_Char_UNK = '¿'
+const NER_Word_UNK = "<UNK>"
+
+struct NERmodel{M}
+    model::M
+end
+
+function load_model_dicts(filepath)
+    labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
+    chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
+    words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]
+
+    return remove_ner_label_prefix.([labels...]), chars_idx, words_idx
+end
+
+NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights")
+
+function NERTagger(dicts_path, weights_path)
+    labels, chars_idx, words_idx = load_model_dicts(dicts_path)
+    model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path)
+    NERmodel(model)
+end
+
+function (a::NERmodel)(sentence::String)
+    a(WordTokenizers.tokenize(sentence))
+end
+
+function (a::NERmodel)(tokens::Array{String,1})
+    input_oh = [onehotinput(a.model, token) for token in tokens]
+    return (a.model)(input_oh)
+end
+
+function remove_ner_label_prefix(str)
+    str == "O" && return str
+    str = str[3:end]
+end

diff --git a/src/Sequence Labelling/sequence_labelling.jl b/src/Sequence Labelling/sequence_labelling.jl
new file mode 100644
index 00000000..e804d87b
--- /dev/null
+++ b/src/Sequence Labelling/sequence_labelling.jl
@@ -0,0 +1,83 @@
+using BSON, Tracker
+
+mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A}
+    labels::Array{String, 1}       # List of Labels
+    chars_idx::Dict{Char, Int64}   # Dict that maps chars to indices in W_Char_Embed
+    words_idx::Dict{String, Int64} # Dict that maps words to indices in W_word_Embed
+    conv1::C          # Convolution Layer over W_Char_Embed to give character representation
+    W_Char_Embed::W   # Weights for character embeddings
+    W_word_Embed::W   # Further trained GloVe Embeddings
+    forward_lstm::L   # Forward LSTM
+    backward::L       # Backward LSTM
+    d_out::D          # Dense_out
+    c::O              # CRF
+    init_α::A
+    UNK_Word_idx::Integer
+    UNK_char_idx::Integer
+end
+
+# BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx) =
+    # BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, :cpu)
+
+function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx; CHAR_EMBED_DIMS=25, WORD_EMBED_DIMS=100,
+        CNN_OUTPUT_SIZE=30, CONV_PAD= (0,2), CONV_WINDOW_LENGTH = 3, LSTM_STATE_SIZE = 200)
+    n = length(labels)
+    init_α = fill(-10000, (n + 2, 1))
+    init_α[n + 1] = 0
+
+    BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, Conv((CHAR_EMBED_DIMS, CONV_WINDOW_LENGTH), 1=>CNN_OUTPUT_SIZE, pad=CONV_PAD),
+        rand(CHAR_EMBED_DIMS, length(chars_idx)), rand(WORD_EMBED_DIMS, length(words_idx)),
+        LSTM(CNN_OUTPUT_SIZE + WORD_EMBED_DIMS, LSTM_STATE_SIZE), LSTM(CNN_OUTPUT_SIZE + WORD_EMBED_DIMS, LSTM_STATE_SIZE),
+        Dense(LSTM_STATE_SIZE * 2, length(labels) + 2), CRF(n), init_α, UNK_Word_idx, UNK_char_idx)
+end
+
+function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx, weights_path)
+    n = length(labels)
+    init_α = fill(-10000, (n + 2, 1))
+    init_α[n + 1] = 0
+
+    W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
+    W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
+    forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
+    backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
+    d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
+    c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
+    conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]
+
+    BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed,
+        forward_lstm, backward, d_out, c, init_α, UNK_Word_idx, UNK_char_idx)
+end
+
+function (a::BiLSTM_CNN_CRF_Model)(x)
+    char_features = Chain(x -> reshape(x, size(x)..., 1,1),
+        a.conv1,
+        x -> maximum(x, dims=2),
+        x -> reshape(x, length(x),1))
+    input_embeddings((w, cs)) = vcat(a.W_word_Embed * w, char_features(a.W_Char_Embed * cs))
+    backward_lstm(x) = reverse((a.backward).(reverse(x)))
+    bilstm_layer(x) = vcat.((a.forward_lstm).(x), backward_lstm(x))
+    m = Chain(x -> input_embeddings.(x),
+        bilstm_layer,
+        x -> (a.d_out).(x))
+
+    oh_outs = viterbi_decode(a.c, m(x), a.init_α)
+    Flux.reset!(a.backward)
+    Flux.reset!(a.forward_lstm)
+    [a.labels[oh.ix] for oh in oh_outs]
+end
+
+# function load(m::BiLSTM_CNN_CRF_Model, weights_path)
+#     m.conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]
+#     println("ConvLoaded")
+#     m.W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
+#     println("W Word loaded")
+#     m.W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
+#     println("W char loaded")
+#     m.forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
+#     m.backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
+#     m.d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
+#     m.c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
+# end
+
+onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)),
+    onehotbatch([get(m.chars_idx, c, m.UNK_char_idx) for c in word], 1:length(m.chars_idx)))

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 73324d69..9c95a71f 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -7,7 +7,8 @@ module TextAnalysis
     using DataFrames
     using WordTokenizers
 
-    using Flux
+    using DataDeps
+    using Flux, Tracker
     using Flux: identity, onehot, onecold, @treelike
 
     import DataFrames.DataFrame
@@ -60,6 +61,8 @@ module TextAnalysis
 
     export CRF, viterbi_decode, crf_loss
 
+    export NERTagger, Tracker, Flux
+
    include("tokenizer.jl")
     include("ngramizer.jl")
     include("document.jl")
@@ -74,6 +77,10 @@ module TextAnalysis
     end
     include(depsjl_path)
 
+    function __init__()
+        include(joinpath(@__DIR__, "Sequence Labelling/NER_DataDeps.jl"))
+    end
+
     include("stemmer.jl")
     include("dtm.jl")
     include("tf_idf.jl")
@@ -95,4 +102,7 @@ module TextAnalysis
     include("CRF/crf_utils.jl")
     include("CRF/loss.jl")
 
+    # NER
+    include("Sequence Labelling/ner.jl")
+    include("Sequence Labelling/sequence_labelling.jl")
 end

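A note on `remove_ner_label_prefix` above: the model's label file stores CoNLL-style BIO tags, where a `B-`/`I-` prefix marks the beginning or inside of an entity span, and the tagger exposes only the entity class. A standalone sketch of that mapping (the helper name and sample labels are illustrative, not package API):

```julia
# Same logic as remove_ner_label_prefix in the diff above: keep "O"
# unchanged, otherwise drop the two-character "B-"/"I-" prefix.
strip_bio_prefix(label) = label == "O" ? label : label[3:end]

sample = ["O", "B-PER", "I-PER", "B-LOC", "I-MISC"]  # illustrative labels
@assert strip_bio_prefix.(sample) == ["O", "PER", "PER", "LOC", "MISC"]
```
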
From e18d0f970c1fca6f0b1574d9ed88516464df5013 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 22:07:50 +0530
Subject: [PATCH 02/16] Fix NER API

---
 src/Sequence Labelling/ner.jl                |  2 +-
 src/Sequence Labelling/sequence_labelling.jl | 18 +-----------------
 src/TextAnalysis.jl                          |  2 +-
 3 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/src/Sequence Labelling/ner.jl b/src/Sequence Labelling/ner.jl
index 02b8adc7..c52964ae 100644
--- a/src/Sequence Labelling/ner.jl
+++ b/src/Sequence Labelling/ner.jl
@@ -23,7 +23,7 @@ function NERTagger(dicts_path, weights_path)
     NERmodel(model)
 end
 
-function (a::NERmodel)(sentence::String)
+function (a::NERmodel)(sentence::AbstractString)
     a(WordTokenizers.tokenize(sentence))
 end
 
diff --git a/src/Sequence Labelling/sequence_labelling.jl b/src/Sequence Labelling/sequence_labelling.jl
index e804d87b..64d17a0a 100644
--- a/src/Sequence Labelling/sequence_labelling.jl
+++ b/src/Sequence Labelling/sequence_labelling.jl
@@ -11,14 +11,11 @@ mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A}
     backward::L       # Backward LSTM
     d_out::D          # Dense_out
     c::O              # CRF
-    init_α::A
+    init_α::A         # For CRF layer
     UNK_Word_idx::Integer
     UNK_char_idx::Integer
 end
 
-# BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx) =
-    # BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, :cpu)
-
 function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx; CHAR_EMBED_DIMS=25, WORD_EMBED_DIMS=100,
         CNN_OUTPUT_SIZE=30, CONV_PAD= (0,2), CONV_WINDOW_LENGTH = 3, LSTM_STATE_SIZE = 200)
     n = length(labels)
@@ -66,18 +63,5 @@ function (a::BiLSTM_CNN_CRF_Model)(x)
     [a.labels[oh.ix] for oh in oh_outs]
 end
 
-# function load(m::BiLSTM_CNN_CRF_Model, weights_path)
-#     m.conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]
-#     println("ConvLoaded")
-#     m.W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
-#     println("W Word loaded")
-#     m.W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
-#     println("W char loaded")
-#     m.forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
-#     m.backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
-#     m.d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
-#     m.c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
-# end
-
 onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)),
     onehotbatch([get(m.chars_idx, c, m.UNK_char_idx) for c in word], 1:length(m.chars_idx)))

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 9c95a71f..f62a7a78 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -9,7 +9,7 @@ module TextAnalysis
 
     using DataDeps
     using Flux, Tracker
-    using Flux: identity, onehot, onecold, @treelike
+    using Flux: identity, onehot, onecold, @treelike, onehotbatch
 
     import DataFrames.DataFrame
     import Base.depwarn

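The `String` to `AbstractString` widening in this commit matters because `split` and, typically, `WordTokenizers.split_sentences` return `SubString{String}` views rather than fresh `String`s, so the narrower method would not match them. A minimal, self-contained illustration of the dispatch difference (the functions below are stand-ins, not TextAnalysis code):

```julia
# Stand-ins demonstrating why the method signature was widened.
tag_narrow(s::String) = "ok"        # pre-patch signature
tag_wide(s::AbstractString) = "ok"  # post-patch signature

piece = split("First sentence. Second.", ". ")[1]  # a SubString{String}
@assert piece isa AbstractString && !(piece isa String)
# tag_narrow(piece) would throw a MethodError; the widened method works:
@assert tag_wide(piece) == "ok"
```
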
From 5730d5d4358d5bd66c8b65b2374a8e51eced06a9 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 22:08:13 +0530
Subject: [PATCH 03/16] Document NER API

---
 docs/make.jl    |   3 +-
 docs/src/ner.md | 112 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/ner.md

diff --git a/docs/make.jl b/docs/make.jl
index fb09b906..92f66925 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -16,7 +16,8 @@ makedocs(
         "Semantic Analysis" => "semantic.md",
         "Classifier" => "classify.md",
         "Extended Example" => "example.md",
-        "Conditional Random Fields" => "crf.md"
+        "Conditional Random Fields" => "crf.md",
+        "Named Entity Recognition" => "ner.md"
     ],
 )
 
diff --git a/docs/src/ner.md b/docs/src/ner.md
new file mode 100644
index 00000000..88620822
--- /dev/null
+++ b/docs/src/ner.md
@@ -0,0 +1,112 @@
+# Named Entity Recognition
+
+The API provided is a pretrained model for tagging Named Entities.
+The current model supports 4 types of Named Entities, plus a non-entity tag -
+
+- `PER`: Person
+- `LOC`: Location
+- `ORG`: Organisation
+- `MISC`: Miscellaneous
+- `O`: Not a Named Entity
+
+To use the API, we first load the model weights into an instance of the tagger.
+The constructor also accepts the paths of the model weights and the model dicts (for character and word embeddings).
+
+    NERTagger()
+    NERTagger(dicts_path, weights_path)
+
+```julia
+julia> ner = NERTagger()
+```
+!!! note
+    When you call `NERTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Upon downloading, these are stored locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again.
+
+Once we create an instance, we can call it to tag a sentence or a sequence of tokens.
+
+    (ner::NERTagger)(sentence::String)
+    (ner::NERTagger)(tokens::Array{String, 1})
+
+```julia
+julia> sentence = "This package is maintained by John Doe."
+"This package is maintained by John Doe."
+
+julia> tags = ner(sentence)
+8-element Array{String,1}:
+ "O"
+ "O"
+ "O"
+ "O"
+ "O"
+ "PER"
+ "PER"
+ "O"
+
+```
+
+The API tokenizes the input sentence via the default tokenizer provided by `WordTokenizers`, which is currently set to the multilingual TokTok tokenizer.
+
+```
+julia> using WordTokenizers
+
+julia> collect(zip(WordTokenizers.tokenize(sentence), tags))
+8-element Array{Tuple{String,String},1}:
+ ("This", "O")
+ ("package", "O")
+ ("is", "O")
+ ("maintained", "O")
+ ("by", "O")
+ ("John", "PER")
+ ("Doe", "PER")
+ (".", "O")
+
+```
+
+For tagging a multi-sentence text or document, one can use `split_sentences` from the `WordTokenizers.jl` package and run the NER model on each sentence.
+
+```julia
+julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politiian." # Sentence taken from CoNLL 2003 Dataset
+
+julia> splitted_sents = WordTokenizers.split_sentences(sentences)
+
+julia> tag_sequences = ner.(splitted_sents)
+2-element Array{Array{String,1},1}:
+ ["PER", "O", "O", "O", "O", "O", "O", "O", "O"]
+ ["O", "O", "O", "O", "O", "PER", "PER", "O", "O", "O", "MISC", "O", "O", "LOC", "O", "O", "ORG", "ORG", "O", "O"]
+
+julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)]
+
+julia> zipped[1]
+9-element Array{Tuple{String,String},1}:
+ ("PER", "Rabinov")
+ ("O", "is")
+ ("O", "winding")
+ ("O", "up")
+ ("O", "his")
+ ("O", "term")
+ ("O", "as")
+ ("O", "ambassador")
+ ("O", ".")
+
+julia> zipped[2]
+20-element Array{Tuple{String,String},1}:
+ ("O", "He")
+ ("O", "will")
+ ("O", "be")
+ ("O", "replaced")
+ ("O", "by")
+ ("PER", "Eliahu")
+ ("PER", "Ben-Elissar")
+ ("O", ",")
+ ("O", "a")
+ ("O", "former")
+ ("MISC", "Israeli")
+ ("O", "envoy")
+ ("O", "to")
+ ("LOC", "Egypt")
+ ("O", "and")
+ ("O", "right-wing")
+ ("ORG", "Likud")
+ ("ORG", "party")
+ ("O", "politiian")
+ ("O", ".")
+```

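The documented interface yields one tag per token. Callers who want entity strings instead can merge runs of identically tagged tokens; a small helper sketch follows (hypothetical, not part of the package; note that since the tagger strips the B-/I- prefixes, it cannot split two adjacent entities of the same type):

```julia
# Collect (entity_text, tag) pairs by merging consecutive tokens that
# share the same non-"O" tag. Illustrative helper, not TextAnalysis API.
function group_entities(tokens::Vector{String}, tags::Vector{String})
    spans = Tuple{String, String}[]
    i = 1
    while i <= length(tags)
        if tags[i] != "O"
            j = i
            while j < length(tags) && tags[j + 1] == tags[i]
                j += 1
            end
            push!(spans, (join(tokens[i:j], " "), tags[i]))
            i = j + 1
        else
            i += 1
        end
    end
    return spans
end

tokens = ["This", "package", "is", "maintained", "by", "John", "Doe", "."]
tags   = ["O", "O", "O", "O", "O", "PER", "PER", "O"]
@assert group_entities(tokens, tags) == [("John Doe", "PER")]
```
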
From 3d88d2648a998d7bfdac940375eac41b62434df0 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 22:30:10 +0530
Subject: [PATCH 04/16] Add tests for NER

---
 test/ner.jl      | 22 ++++++++++++++++++++++
 test/runtests.jl |  2 +-
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 test/ner.jl

diff --git a/test/ner.jl b/test/ner.jl
new file mode 100644
index 00000000..1578df9b
--- /dev/null
+++ b/test/ner.jl
@@ -0,0 +1,22 @@
+using WordTokenizers
+
+@testset "NER" begin
+    ner = NERTagger()
+
+    @testset "Basic" begin
+        str = "Mr. Foo Bar works in Google, California."
+        @test ner(str) == ["O", "PER", "PER", "O", "O", "ORG", "O", "LOC", "O"]
+
+        str = "If the Irish win the World Cup this year, it will be their 3rd time in a row."
+        @test ner(str) == [ "O", "O", "MISC", "O", "O", "MISC", "MISC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]
+    end
+
+    @testset "Unknown Unicode characters" begin
+        # Making sure that the NER model handles unknown unicode characters
+        str = "आ β⬰ 5¥ "
+        @test length(ner(str)) == length(WordTokenizers.tokenize(str))
+
+        str = "You owe John Doe 5¥."
+        @test ner(str) == [ "O", "O", "PER", "PER", "O", "O", "O"]
+    end
+end

diff --git a/test/runtests.jl b/test/runtests.jl
index 89664788..5035e67f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,8 +9,8 @@ using WordTokenizers
 
 println("Running tests:")
 
+include("ner.jl")
 include("crf.jl")
-
 include("tokenizer.jl")
 include("ngramizer.jl")
 include("document.jl")

From ebff0cf0dcd73bb20fbf105d9de71f9e84e6f721 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 9 Aug 2019 22:32:06 +0530
Subject: [PATCH 05/16] Update travis, appveyor scripts to always accept datadeps

---
 .travis.yml  | 2 ++
 appveyor.yml | 1 +
 2 files changed, 3 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 19e1aa7b..4d73a9c5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,8 @@
 language: julia
 os:
   - linux
+env:
+  - DATADEPS_ALWAYS_ACCEPT=true
 julia:
   - 0.7
   - 1.0
diff --git a/appveyor.yml b/appveyor.yml
index ea42c124..d32e6d4a 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,4 +1,5 @@
 environment:
+  DATADEPS_ALWAYS_ACCEPT: True
   matrix:
     - julia_version: 0.7
     - julia_version: 1

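`DATADEPS_ALWAYS_ACCEPT` makes DataDeps.jl skip its interactive download prompt, which would otherwise block a non-interactive CI run. The same switch works from inside Julia, as long as it is set before the first download is triggered; a sketch:

```julia
# Equivalent of the CI settings above, set from within Julia. DataDeps.jl
# consults this environment variable before prompting for a download.
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

using TextAnalysis
ner = NERTagger()  # weights and dicts now download without a prompt
```
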
From 49aea7c9dd8f04fbf6ef33a83082b88e7d91a56d Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 10 Aug 2019 15:43:40 +0530
Subject: [PATCH 06/16] Call function instead of using include for datadeps

---
 src/Sequence Labelling/NER_DataDeps.jl | 29 --------------
 src/Sequence Labelling/ner.jl          | 38 -------------------
 src/TextAnalysis.jl                    | 13 ++++---
 src/sequence/ner_datadeps.jl           | 29 ++++++++++++++
 .../sequence_models.jl}                |  0
 5 files changed, 36 insertions(+), 73 deletions(-)
 delete mode 100644 src/Sequence Labelling/NER_DataDeps.jl
 delete mode 100644 src/Sequence Labelling/ner.jl
 create mode 100644 src/sequence/ner_datadeps.jl
 rename src/{Sequence Labelling/sequence_labelling.jl => sequence/sequence_models.jl} (100%)

diff --git a/src/Sequence Labelling/NER_DataDeps.jl b/src/Sequence Labelling/NER_DataDeps.jl
deleted file mode 100644
index c42c1abc..00000000
--- a/src/Sequence Labelling/NER_DataDeps.jl
+++ /dev/null
@@ -1,29 +0,0 @@
-using DataDeps
-
-register(DataDep("NER Model Weights",
-    """
-    The weights for NER Sequence Labelling Model.
-    """,
-    "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz",
-    post_fetch_method = function(fn)
-        unpack(fn)
-        dir = "weights"
-        innerfiles = readdir(dir)
-        mv.(joinpath.(dir, innerfiles), innerfiles)
-        rm(dir)
-    end
-))
-
-register(DataDep("NER Model Dicts",
-    """
-    The character and words dict for NER Sequence Labelling Model.
-    """,
-    "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz",
-    post_fetch_method = function(fn)
-        unpack(fn)
-        dir = "model_dicts"
-        innerfiles = readdir(dir)
-        mv.(joinpath.(dir, innerfiles), innerfiles)
-        rm(dir)
-    end
-))

diff --git a/src/Sequence Labelling/ner.jl b/src/Sequence Labelling/ner.jl
deleted file mode 100644
index c52964ae..00000000
--- a/src/Sequence Labelling/ner.jl
+++ /dev/null
@@ -1,38 +0,0 @@
-using BSON, Tracker
-
-const NER_Char_UNK = '¿'
-const NER_Word_UNK = "<UNK>"
-
-struct NERmodel{M}
-    model::M
-end
-
-function load_model_dicts(filepath)
-    labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
-    chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
-    words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]
-
-    return remove_ner_label_prefix.([labels...]), chars_idx, words_idx
-end
-
-NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights")
-
-function NERTagger(dicts_path, weights_path)
-    labels, chars_idx, words_idx = load_model_dicts(dicts_path)
-    model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path)
-    NERmodel(model)
-end
-
-function (a::NERmodel)(sentence::AbstractString)
-    a(WordTokenizers.tokenize(sentence))
-end
-
-function (a::NERmodel)(tokens::Array{String,1})
-    input_oh = [onehotinput(a.model, token) for token in tokens]
-    return (a.model)(input_oh)
-end
-
-function remove_ner_label_prefix(str)
-    str == "O" && return str
-    str = str[3:end]
-end

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 39a9bf52..50863ae3 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -80,10 +80,6 @@ module TextAnalysis
     end
     include(depsjl_path)
 
-    function __init__()
-        include(joinpath(@__DIR__, "Sequence Labelling/NER_DataDeps.jl"))
-    end
-
     include("stemmer.jl")
     include("dtm.jl")
     include("tf_idf.jl")
@@ -107,6 +103,11 @@ module TextAnalysis
     include("CRF/loss.jl")
 
     # NER
-    include("Sequence Labelling/ner.jl")
-    include("Sequence Labelling/sequence_labelling.jl")
+    include("sequence/ner_datadeps.jl")
+    include("sequence/ner.jl")
+    include("sequence/sequence_models.jl")
+
+    function __init__()
+        ner_datadep_register()
+    end
 end

diff --git a/src/sequence/ner_datadeps.jl b/src/sequence/ner_datadeps.jl
new file mode 100644
index 00000000..d5ce83d0
--- /dev/null
+++ b/src/sequence/ner_datadeps.jl
@@ -0,0 +1,29 @@
+function ner_datadep_register()
+    register(DataDep("NER Model Weights",
+        """
+        The weights for NER Sequence Labelling Model.
+        """,
+        "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz",
+        post_fetch_method = function(fn)
+            unpack(fn)
+            dir = "weights"
+            innerfiles = readdir(dir)
+            mv.(joinpath.(dir, innerfiles), innerfiles)
+            rm(dir)
+        end
+    ))
+
+    register(DataDep("NER Model Dicts",
+        """
+        The character and words dict for NER Sequence Labelling Model.
+        """,
+        "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz",
+        post_fetch_method = function(fn)
+            unpack(fn)
+            dir = "model_dicts"
+            innerfiles = readdir(dir)
+            mv.(joinpath.(dir, innerfiles), innerfiles)
+            rm(dir)
+        end
+    ))
+end

diff --git a/src/Sequence Labelling/sequence_labelling.jl b/src/sequence/sequence_models.jl
similarity index 100%
rename from src/Sequence Labelling/sequence_labelling.jl
rename to src/sequence/sequence_models.jl

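The motivation for this commit: code `include`d at the top level runs during precompilation, and `DataDep` registrations made then are not replayed when the cached module is loaded later. Wrapping the `register` calls in a function invoked from `__init__`, which runs on every `using`, is the pattern DataDeps.jl expects. A condensed sketch of the shape this gives the module (not the full module):

```julia
module MiniExample

using DataDeps

function ner_datadep_register()
    # register(DataDep(...)) calls go here; deferring them to __init__
    # ensures registration happens at load time, not precompile time.
end

function __init__()
    ner_datadep_register()
end

end
```
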
- """, - "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz", - post_fetch_method = function(fn) - unpack(fn) - dir = "model_dicts" - innerfiles = readdir(dir) - mv.(joinpath.(dir, innerfiles), innerfiles) - rm(dir) - end -)) diff --git a/src/Sequence Labelling/ner.jl b/src/Sequence Labelling/ner.jl deleted file mode 100644 index c52964ae..00000000 --- a/src/Sequence Labelling/ner.jl +++ /dev/null @@ -1,38 +0,0 @@ -using BSON, Tracker - -const NER_Char_UNK = '¿' -const NER_Word_UNK = "" - -struct NERmodel{M} - model::M -end - -function load_model_dicts(filepath) - labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels] - chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index] - words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index] - - return remove_ner_label_prefix.([labels...]), chars_idx, words_idx -end - -NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights") - -function NERTagger(dicts_path, weights_path) - labels, chars_idx, words_idx = load_model_dicts(dicts_path) - model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path) - NERmodel(model) -end - -function (a::NERmodel)(sentence::AbstractString) - a(WordTokenizers.tokenize(sentence)) -end - -function (a::NERmodel)(tokens::Array{String,1}) - input_oh = [onehotinput(a.model, token) for token in tokens] - return (a.model)(input_oh) -end - -function remove_ner_label_prefix(str) - str == "O" && return str - str = str[3:end] -end diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 39a9bf52..50863ae3 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -80,10 +80,6 @@ module TextAnalysis end include(depsjl_path) - function __init__() - include(joinpath(@__DIR__, "Sequence Labelling/NER_DataDeps.jl")) - end - include("stemmer.jl") include("dtm.jl") include("tf_idf.jl") @@ -107,6 +103,11 @@ module TextAnalysis include("CRF/loss.jl") # NER - include("Sequence Labelling/ner.jl") - include("Sequence Labelling/sequence_labelling.jl") + include("sequence/ner_datadeps.jl") + include("sequence/ner.jl") + include("sequence/sequence_models.jl") + + function __init__() + ner_datadep_register() + end end diff --git a/src/sequence/ner_datadeps.jl b/src/sequence/ner_datadeps.jl new file mode 100644 index 00000000..d5ce83d0 --- /dev/null +++ b/src/sequence/ner_datadeps.jl @@ -0,0 +1,29 @@ +function ner_datadep_register() + register(DataDep("NER Model Weights", + """ + The weights for NER Sequence Labelling Model. + """, + "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz", + post_fetch_method = function(fn) + unpack(fn) + dir = "weights" + innerfiles = readdir(dir) + mv.(joinpath.(dir, innerfiles), innerfiles) + rm(dir) + end + )) + + register(DataDep("NER Model Dicts", + """ + The character and words dict for NER Sequence Labelling Model. 
+ """, + "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz", + post_fetch_method = function(fn) + unpack(fn) + dir = "model_dicts" + innerfiles = readdir(dir) + mv.(joinpath.(dir, innerfiles), innerfiles) + rm(dir) + end + )) +end diff --git a/src/Sequence Labelling/sequence_labelling.jl b/src/sequence/sequence_models.jl similarity index 100% rename from src/Sequence Labelling/sequence_labelling.jl rename to src/sequence/sequence_models.jl From e29dc7dcc4de86e5d0cad1e9a6766957a7ddcea8 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 10 Aug 2019 15:45:12 +0530 Subject: [PATCH 07/16] Add APIs for NER Tagging over Documents and Corpus --- src/sequence/ner.jl | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/sequence/ner.jl diff --git a/src/sequence/ner.jl b/src/sequence/ner.jl new file mode 100644 index 00000000..ea93532f --- /dev/null +++ b/src/sequence/ner.jl @@ -0,0 +1,50 @@ +using BSON, Tracker + +const NER_Char_UNK = '¿' +const NER_Word_UNK = "" + +struct NERmodel{M} + model::M +end + +function load_model_dicts(filepath) + labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels] + chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index] + words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index] + + return remove_ner_label_prefix.([labels...]), chars_idx, words_idx +end + +NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights") + +function NERTagger(dicts_path, weights_path) + labels, chars_idx, words_idx = load_model_dicts(dicts_path) + model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path) + NERmodel(model) +end + +function (a::NERmodel)(tokens::Array{String,1}) + input_oh = [onehotinput(a.model, token) for token in tokens] + return (a.model)(input_oh) +end + +function (a::NERmodel)(sentence::AbstractString) + a(WordTokenizers.tokenize(sentence)) +end + +function (a::NERmodel)(doc::AbstractDocument) + return vcat(a.(WordTokenizers.split_sentences(text(doc)))) +end + +function (a::NERmodel)(ngd::NGramDocument) + throw("Sequence Labelling not possible for NGramsDocument") +end + +function (a::NERmodel)(crps::Corpus) + return a.(crps.documents) +end + +function remove_ner_label_prefix(str) + str == "O" && return str + str = str[3:end] +end From 53e8eecc01c72fe8a40b5e8348d03ba99dd523ef Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 10 Aug 2019 15:45:57 +0530 Subject: [PATCH 08/16] Update docs for NER APIs --- docs/src/ner.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/src/ner.md b/docs/src/ner.md index 88620822..ec38bbd4 100644 --- a/docs/src/ner.md +++ b/docs/src/ner.md @@ -21,10 +21,14 @@ julia> ner = NERTagger() !!! note When you call `NERTagger()` for the first time, the package will request permission for download the `Model_dicts` and `Model_weights`. Upon downloading, these are store locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again. -Once we create an instance, we can call it to tag a sentence or a sequence of tokens. +Once we create an instance, we can call it to tag a String (sentence), sequence of tokens, `AbstractDocument` or `Corpus`. 
From 24c8a0345e48b450011438c6a92ed96494cbda1f Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 10 Aug 2019 16:09:45 +0530
Subject: [PATCH 09/16] Tests for new NER API over Docs and Corpus.

---
 test/ner.jl | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/test/ner.jl b/test/ner.jl
index 1578df9b..9430958f 100644
--- a/test/ner.jl
+++ b/test/ner.jl
@@ -19,4 +19,30 @@ using WordTokenizers
         str = "You owe John Doe 5¥."
         @test ner(str) == [ "O", "O", "PER", "PER", "O", "O", "O"]
     end
+
+    @testset "Documents and Corpus" begin
+        text1 = "We aRE vErY ClOSE tO ThE HEaDQuarTeRS."
+        text2 = "The World Health Organization (WHO) is a specialized agency of the United Nations that is concerned with international public health."
+
+        sd = StringDocument(text1)
+        td = TokenDocument(text2)
+
+        tags = ner(sd)
+        @test length(tags) == length(WordTokenizers.split_sentences(text1))
+        @test length(tags[1]) == length(WordTokenizers.tokenize(text1))
+        @test unique(vcat(tags...)) == ["O"]
+
+        tags = ner(td)
+        @test length(tags) == length(WordTokenizers.split_sentences(text2))
+        @test length(tags[1]) == length(WordTokenizers.tokenize(text2))
+        u = unique(vcat(tags...))
+        @test "O" ∈ u && "ORG" ∈ u
+
+        crps = Corpus([sd, td])
+        tags = ner(crps)
+
+        @test length(tags) == length(crps.documents)
+        @test tags[1] == ner(crps.documents[1])
+        @test tags[2] == ner(crps.documents[2])
+    end
 end

""", - "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/weights.tar.xz", + "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_weights.tar.xz", + "6290353b66c9bdbb794ddcb6063ab52c30145d3918f2f115f19e21fa994282e6", post_fetch_method = function(fn) unpack(fn) dir = "weights" @@ -17,7 +18,8 @@ function ner_datadep_register() """ The character and words dict for NER Sequence Labelling Model. """, - "https://raw.githubusercontent.com/Ayushk4/Random_set_of_codes/weights/model_dicts.tar.xz", + "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_dicts.tar.xz", + "40cfa37da216b990eb9c257aa7994e34d7a7a59d69b2506c6f39120f2688dc11", post_fetch_method = function(fn) unpack(fn) dir = "model_dicts" From c2567b4f06ffa4327e46fcd36c40167f2ab22629 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 15:54:42 +0530 Subject: [PATCH 11/16] Update REQUIRE --- REQUIRE | 1 + 1 file changed, 1 insertion(+) diff --git a/REQUIRE b/REQUIRE index a66b3bb5..fea98e0a 100644 --- a/REQUIRE +++ b/REQUIRE @@ -8,3 +8,4 @@ BSON JSON DataStructures DataDeps +Tracker From 1051ac8276cef0379fe756f045cbd34914aba991 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 15:58:12 +0530 Subject: [PATCH 12/16] Minor fix --- src/sequence/sequence_models.jl | 1 - test/ner.jl | 2 +- test/runtests.jl | 8 +++----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/sequence/sequence_models.jl b/src/sequence/sequence_models.jl index 64d17a0a..74288940 100644 --- a/src/sequence/sequence_models.jl +++ b/src/sequence/sequence_models.jl @@ -1,5 +1,4 @@ using BSON, Tracker - mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A} labels::Array{String, 1} # List of Labels chars_idx::Dict{Char, Int64} # Dict that maps chars to indices in W_Char_Embed diff --git a/test/ner.jl b/test/ner.jl index 9430958f..da974535 100644 --- a/test/ner.jl +++ b/test/ner.jl @@ -1,4 +1,4 @@ -using WordTokenizers +using WordTokenizers, TextAnalysis, Test @testset "NER" begin ner = NERTagger() diff --git a/test/runtests.jl b/test/runtests.jl index 3e41a61b..0b45c79b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,6 @@ +println("Running tests:") +include("ner.jl") + module TestTextAnalysis using SparseArrays using Test @@ -5,11 +8,6 @@ using Languages using TextAnalysis using WordTokenizers -# @testset "TextAnalysis" begin - -println("Running tests:") - -include("ner.jl") include("coom.jl") include("crf.jl") include("tokenizer.jl") From 90d334ac84a2f0871526e75095c83cee897574fd Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 15:58:29 +0530 Subject: [PATCH 13/16] Add a Project.toml file --- Project.toml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Project.toml diff --git a/Project.toml b/Project.toml new file mode 100644 index 00000000..aa12254a --- /dev/null +++ b/Project.toml @@ -0,0 +1,26 @@ +name = "TextAnalysis" +uuid = "a2db99b7-8b79-58f8-94bf-bbc811eef33d" +license = "MIT" +desc = "Julia package for text analysis" + +[deps] +WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +BSON 
= "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[compat] +julia = "1" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] From a3792ec510227cd2b6fb5644c011f01da001186a Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 18:07:38 +0530 Subject: [PATCH 14/16] Add Statistics to Project.toml --- Project.toml | 25 +++++++++++++------------ test/ner.jl | 2 +- test/runtests.jl | 9 +++------ 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/Project.toml b/Project.toml index aa12254a..9fb316ff 100644 --- a/Project.toml +++ b/Project.toml @@ -4,20 +4,21 @@ license = "MIT" desc = "Julia package for text analysis" [deps] -WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" -DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" -Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" [compat] -julia = "1" +julia = "1.0" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/ner.jl b/test/ner.jl index da974535..9430958f 100644 --- a/test/ner.jl +++ b/test/ner.jl @@ -1,4 +1,4 @@ -using WordTokenizers, TextAnalysis, Test +using WordTokenizers @testset "NER" begin ner = NERTagger() diff --git a/test/runtests.jl b/test/runtests.jl index 0b45c79b..dbfc5e04 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,13 +1,12 @@ -println("Running tests:") -include("ner.jl") - -module TestTextAnalysis using SparseArrays using Test using Languages using TextAnalysis using WordTokenizers +println("Running tests:") + +include("ner.jl") include("coom.jl") include("crf.jl") include("tokenizer.jl") @@ -27,5 +26,3 @@ include("bayes.jl") include("taggingschemes.jl") include("averagePerceptronTagger.jl") include("evaluation_metrics.jl") - -end From bfc738f47e73e7354de6e126f5f9538140ec9115 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 18:18:54 +0530 Subject: [PATCH 15/16] Add BinaryProvider.jl to Project.toml --- Project.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 9fb316ff..63f81696 100644 --- a/Project.toml +++ b/Project.toml @@ -16,9 +16,10 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" [compat] -julia = "1.0" +julia = "1" [extras] Test = 
"8dfed614-e22c-5e08-85e1-65c5234f0b40" From ab62d3b17054bfc2ee3ed8ce7cbde096b5fa0457 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 12 Aug 2019 18:29:32 +0530 Subject: [PATCH 16/16] Add manifest.toml --- Manifest.toml | 424 ++++++++++++++++++++++++++++++++++++++++++++++++++ Project.toml | 27 ++-- 2 files changed, 438 insertions(+), 13 deletions(-) create mode 100644 Manifest.toml diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 00000000..446c1d5a --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,424 @@ +[[AbstractTrees]] +deps = ["Markdown", "Test"] +git-tree-sha1 = "6621d9645702c1c4e6970cc6a3eae440c768000b" +uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +version = "0.2.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "82dab828020b872fa9efd3abec1152b075bc7cbf" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "1.0.0" + +[[BSON]] +deps = ["Profile", "Test"] +git-tree-sha1 = "6453cef4f9cb8ded8e28e4d6d12e11e20eb692ea" +uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +version = "0.2.3" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinDeps]] +deps = ["Compat", "Libdl", "SHA", "URIParser"] +git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" +uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" +version = "0.8.10" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "c7361ce8a2129f20b0e05a89f7070820cfed6648" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.6" + +[[CSTParser]] +deps = ["Tokenize"] +git-tree-sha1 = "c69698c3d4a7255bc1b4bc2afc09f59db910243b" +uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" +version = "0.6.2" + +[[CategoricalArrays]] +deps = ["Compat", "DataAPI", "Future", "JSON", "Missings", "Printf", "Reexport"] +git-tree-sha1 = "13240cfcc884837fc1aa89b60d500a652bcc3f10" +uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" +version = "0.5.5" + +[[CodecZlib]] +deps = ["BinaryProvider", "Libdl", "TranscodingStreams"] +git-tree-sha1 = "05916673a2627dd91b4969ff8ba6941bc85a960e" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.6.0" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.8.0" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"] +git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.9.5" + +[[CommonSubexpressions]] +deps = ["Test"] +git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.2.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "2.1.0" + +[[Crayons]] +deps = ["Test"] +git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.0.0" + +[[DataAPI]] +git-tree-sha1 = "8903f0219d3472543fc4b2f5ebaf675a07f817c0" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.0.1" + +[[DataDeps]] +deps = ["HTTP", "Reexport", "SHA"] +git-tree-sha1 = "3fecf920ad9702f015a5ab198b233b4c1c50992a" +uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +version = "0.6.4" + 
+[[DataFrames]]
+deps = ["CategoricalArrays", "Compat", "DataAPI", "InvertedIndices", "IteratorInterfaceExtensions", "Missings", "PooledArrays", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
+git-tree-sha1 = "48ef38bd7cf0e8fd598bda981409eb6ef4b96cbd"
+uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+version = "0.19.2"
+
+[[DataStructures]]
+deps = ["InteractiveUtils", "OrderedCollections"]
+git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a"
+uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+version = "0.17.0"
+
+[[DataValueInterfaces]]
+git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
+uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
+version = "1.0.0"
+
+[[Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[DelimitedFiles]]
+deps = ["Mmap"]
+uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+
+[[DiffResults]]
+deps = ["Compat", "StaticArrays"]
+git-tree-sha1 = "34a4a1e8be7bc99bc9c611b895b5baf37a80584c"
+uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
+version = "0.0.4"
+
+[[DiffRules]]
+deps = ["Random", "Test"]
+git-tree-sha1 = "dc0869fb2f5b23466b32ea799bd82c76480167f7"
+uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
+version = "0.0.10"
+
+[[Distributed]]
+deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
+uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+
+[[FixedPointNumbers]]
+git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b"
+uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
+version = "0.6.1"
+
+[[Flux]]
+deps = ["AbstractTrees", "Adapt", "CodecZlib", "Colors", "DelimitedFiles", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "Reexport", "Requires", "SHA", "Statistics", "StatsBase", "Tracker", "ZipFile"]
+git-tree-sha1 = "08212989c2856f95f90709ea5fd824bd27b34514"
+uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+version = "0.8.3"
+
+[[ForwardDiff]]
+deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "InteractiveUtils", "LinearAlgebra", "NaNMath", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Test"]
+git-tree-sha1 = "4c4d727f1b7e0092134fabfab6396b8945c1ea5b"
+uuid = "f6369f11-7733-5829-9624-2563aa707210"
+version = "0.10.3"
+
+[[Future]]
+deps = ["Random"]
+uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
+
+[[HTML_Entities]]
+deps = ["StrTables"]
+git-tree-sha1 = "aa19515d6ebe7f91a39cfc1dc6341f38fcac1282"
+uuid = "7693890a-d069-55fe-a829-b4a6d304f0ee"
+version = "1.0.0"
+
+[[HTTP]]
+deps = ["Base64", "Dates", "IniFile", "MbedTLS", "Sockets"]
+git-tree-sha1 = "03ddc88af7f2d963fac5aa9f3ac8e11914d68a78"
+uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+version = "0.8.4"
+
+[[IniFile]]
+deps = ["Test"]
+git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8"
+uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f"
+version = "0.5.0"
+
+[[InteractiveUtils]]
+deps = ["LinearAlgebra", "Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[InvertedIndices]]
+deps = ["Test"]
+git-tree-sha1 = "15732c475062348b0165684ffe28e85ea8396afc"
+uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
+version = "1.0.0"
+
+[[IteratorInterfaceExtensions]]
+git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
+uuid = "82899510-4779-5014-852e-03e436cf321d"
+version = "1.0.0"
+
+[[JSON]]
+deps = ["Dates", "Mmap", "Parsers", "Unicode"]
+git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
+uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+version = "0.21.0"
+
+[[Juno]]
+deps = ["Base64", "Logging", "Media", "Profile", "Test"]
+git-tree-sha1 = "8426e073b1676acba2aea7a4a81d7a3af97a14fe"
+uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
+version = "0.7.1"
+
+[[Languages]]
+deps = ["JSON", "Test"]
+git-tree-sha1 = "fc6ee05e35074a66dc12a716065a25d9deece6fb"
+uuid = "8ef0a80b-9436-5d2c-a485-80b904378c43"
+version = "0.4.2"
+
+[[LibGit2]]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
+[[Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[LinearAlgebra]]
+deps = ["Libdl"]
+uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
+[[Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[MacroTools]]
+deps = ["CSTParser", "Compat", "DataStructures", "Test", "Tokenize"]
+git-tree-sha1 = "d6e9dedb8c92c3465575442da456aec15a89ff76"
+uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
+version = "0.5.1"
+
+[[Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[MbedTLS]]
+deps = ["BinaryProvider", "Dates", "Distributed", "Libdl", "Random", "Sockets", "Test"]
+git-tree-sha1 = "2d94286a9c2f52c63a16146bb86fd6cdfbf677c6"
+uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
+version = "0.6.8"
+
+[[Media]]
+deps = ["MacroTools", "Test"]
+git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58"
+uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27"
+version = "0.5.0"
+
+[[Missings]]
+deps = ["SparseArrays", "Test"]
+git-tree-sha1 = "f0719736664b4358aa9ec173077d4285775f8007"
+uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
+version = "0.4.1"
+
+[[Mmap]]
+uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+
+[[NNlib]]
+deps = ["Libdl", "LinearAlgebra", "Requires", "Statistics", "TimerOutputs"]
+git-tree-sha1 = "0c667371391fc6bb31f7f12f96a56a17098b3de8"
+uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+version = "0.6.0"
+
+[[NaNMath]]
+deps = ["Compat"]
+git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
+uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
+version = "0.3.2"
+
+[[OrderedCollections]]
+deps = ["Random", "Serialization", "Test"]
+git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
+uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+version = "1.1.0"
+
+[[Parsers]]
+deps = ["Dates", "Test"]
+git-tree-sha1 = "db2b35dedab3c0e46dc15996d170af07a5ab91c9"
+uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
+version = "0.3.6"
+
+[[Pkg]]
+deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+
+[[PooledArrays]]
+git-tree-sha1 = "6e8c38927cb6e9ae144f7277c753714861b27d14"
+uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
+version = "0.5.2"
+
+[[Printf]]
+deps = ["Unicode"]
+uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[[Profile]]
+deps = ["Printf"]
+uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
+
+[[REPL]]
+deps = ["InteractiveUtils", "Markdown", "Sockets"]
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+
+[[Random]]
+deps = ["Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[Reexport]]
+deps = ["Pkg"]
+git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0"
+uuid = "189a3867-3050-52da-a836-e630ba90ab69"
+version = "0.2.0"
+
+[[Requires]]
+deps = ["Test"]
+git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1"
+uuid = "ae029012-a4dd-5104-9daa-d747884805df"
+version = "0.5.2"
+
+[[SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+
+[[Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[SharedArrays]]
+deps = ["Distributed", "Mmap", "Random", "Serialization"]
+uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
+
+[[Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[SortingAlgorithms]]
+deps = ["DataStructures", "Random", "Test"]
"Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"] +git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.7.2" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.11.0" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] +git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.32.0" + +[[StrTables]] +deps = ["Dates"] +git-tree-sha1 = "5998faae8c6308acc25c25896562a1e66a3bb038" +uuid = "9700d1a9-a7c8-5760-9816-a99fda30bb8f" +version = "1.0.1" + +[[TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "b1ad568ba658d8cbb3b892ed5380a6f3e781a81e" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.0" + +[[Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "TableTraits", "Test"] +git-tree-sha1 = "951b5be359e92703f886881b175ecfe924d8bd91" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "0.2.10" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TimerOutputs]] +deps = ["Crayons", "Printf", "Test", "Unicode"] +git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.0" + +[[Tokenize]] +git-tree-sha1 = "c8a8b00ae44a94950814ff77850470711a360225" +uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" +version = "0.5.5" + +[[Tracker]] +deps = ["Adapt", "DiffRules", "ForwardDiff", "LinearAlgebra", "MacroTools", "NNlib", "NaNMath", "Printf", "Random", "Requires", "SpecialFunctions", "Statistics", "Test"] +git-tree-sha1 = "327342fec6e09f68ced0c2dc5731ed475e4b696b" +uuid = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +version = "0.2.2" + +[[TranscodingStreams]] +deps = ["Random", "Test"] +git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.9.5" + +[[URIParser]] +deps = ["Test", "Unicode"] +git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" +uuid = "30578b45-9adc-5946-b283-645ec420af67" +version = "0.4.0" + +[[UUIDs]] +deps = ["Random"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[WordTokenizers]] +deps = ["HTML_Entities", "StrTables", "Unicode"] +git-tree-sha1 = "983ca717c4ec786d0458ebcbe395a1a50b3a1897" +uuid = "796a5d58-b03d-544a-977e-18100b691f6e" +version = "0.5.3" + +[[ZipFile]] +deps = ["BinaryProvider", "Libdl", "Printf"] +git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" +uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" +version = "0.8.3" diff --git a/Project.toml b/Project.toml index 63f81696..9d5b4633 100644 --- a/Project.toml +++ b/Project.toml @@ -4,19 +4,20 @@ license = "MIT" desc = "Julia package for text analysis" [deps] -BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -Flux = 
"587475ba-b771-5e3f-ad9e-33799f191a9c" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" -DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" -Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" -BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" [compat] julia = "1"