Commit

Merge pull request #167 from Ayushk4/NER_API

Named Entity Recognition

aviks authored Aug 18, 2019
2 parents 284f11a + ab62d3b commit 04d1b60
Showing 11 changed files with 803 additions and 9 deletions.
424 changes: 424 additions & 0 deletions Manifest.toml

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions Project.toml
@@ -0,0 +1,29 @@
name = "TextAnalysis"
uuid = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
license = "MIT"
desc = "Julia package for text analysis"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"

[compat]
julia = "1"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
1 change: 1 addition & 0 deletions REQUIRE
@@ -8,3 +8,4 @@ BSON
JSON
DataStructures
DataDeps
Tracker
3 changes: 2 additions & 1 deletion docs/make.jl
@@ -17,7 +17,8 @@ makedocs(
"Classifier" => "classify.md",
"Extended Example" => "example.md",
"Evaluation Metrics" => "evaluation_metrics.md",
"Conditional Random Fields" => "crf.md"
"Conditional Random Fields" => "crf.md",
"Named Entity Recognition" => "ner.md"
],
)

140 changes: 140 additions & 0 deletions docs/src/ner.md
@@ -0,0 +1,140 @@
# Named Entity Recognition

The API provides a pretrained model for tagging Named Entities.
The current model supports four types of Named Entities, along with a tag for tokens that are not Named Entities -

- `PER`: Person
- `LOC`: Location
- `ORG`: Organisation
- `MISC`: Miscellaneous
- `O`: Not a Named Entity

To use the API, we first load the model weights into an instance of the tagger.
The constructor also accepts the paths of the model dicts (for character and word embeddings) and the model weights:

NERTagger()
NERTagger(dicts_path, weights_path)

```julia
julia> ner = NERTagger()
```
!!! note
    When you call `NERTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Once downloaded, these are stored locally and managed by `DataDeps`, so on subsequent uses the weights will not need to be downloaded again.
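
For example, the tagger can also be constructed from explicit paths; this is exactly what the zero-argument constructor does internally (see `src/sequence/ner.jl` below), resolving the registered `DataDeps`:

```julia
julia> using TextAnalysis, DataDeps

julia> ner = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights")
```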

Once we have created an instance, we can call it to tag a `String` (sentence), a sequence of tokens, an `AbstractDocument`, or a `Corpus`.

(ner::NERTagger)(sentence::String)
(ner::NERTagger)(tokens::Array{String, 1})
(ner::NERTagger)(sd::StringDocument)
(ner::NERTagger)(fd::FileDocument)
(ner::NERTagger)(td::TokenDocument)
(ner::NERTagger)(crps::Corpus)

```julia
julia> sentence = "This package is maintained by John Doe."
"This package is maintained by John Doe."

julia> tags = ner(sentence)
8-element Array{String,1}:
"O"
"O"
"O"
"O"
"O"
"PER"
"PER"
"O"

```

The API tokenizes the input sentence via the default tokenizer provided by `WordTokenizers`, which is currently set to the multilingual TokTok tokenizer.

```julia
julia> using WordTokenizers
julia> collect(zip(WordTokenizers.tokenize(sentence), tags))
8-element Array{Tuple{String,String},1}:
("This", "O")
("package", "O")
("is", "O")
("maintained", "O")
("by", "O")
("John", "PER")
("Doe", "PER")
(".", "O")
```

To tag a multi-sentence text or document, one can use `split_sentences` from the `WordTokenizers.jl` package and run the NER model on each sentence.

```julia
julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politiian." # Sentence taken from CoNLL 2003 Dataset

julia> splitted_sents = WordTokenizers.split_sentences(sentences)

julia> tag_sequences = ner.(splitted_sents)
2-element Array{Array{String,1},1}:
["PER", "O", "O", "O", "O", "O", "O", "O", "O"]
["O", "O", "O", "O", "O", "PER", "PER", "O", "O", "O", "MISC", "O", "O", "LOC", "O", "O", "ORG", "ORG", "O", "O"]

julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)]

julia> zipped[1]
9-element Array{Tuple{String,String},1}:
("PER", "Rabinov")
("O", "is")
("O", "winding")
("O", "up")
("O", "his")
("O", "term")
("O", "as")
("O", "ambassador")
("O", ".")

julia> zipped[2]
20-element Array{Tuple{String,String},1}:
("O", "He")
("O", "will")
("O", "be")
("O", "replaced")
("O", "by")
("PER", "Eliahu")
("PER", "Ben-Elissar")
("O", ",")
("O", "a")
("O", "former")
("MISC", "Israeli")
("O", "envoy")
("O", "to")
("LOC", "Egypt")
("O", "and")
("O", "right-wing")
("ORG", "Likud")
("ORG", "party")
("O", "politiian")
("O", ".")
```

Since the tagging of Named Entities is done at the sentence level,
the text of an `AbstractDocument` is first split into sentences, and each sentence is then labelled.
However, this is not possible for an `NGramDocument`, as its text cannot be recreated.
For a `TokenDocument`, the text can only be approximated before splitting into sentences, hence the following throws a warning when tagging the `Corpus`:

```julia
julia> crps = Corpus([StringDocument("We aRE vErY ClOSE tO ThE HEaDQuarTeRS."), TokenDocument("this is Bangalore.")])
A Corpus with 2 documents:
* 1 StringDocument's
* 0 FileDocument's
* 1 TokenDocument's
* 0 NGramDocument's

Corpus's lexicon contains 0 tokens
Corpus's index contains 0 tokens

julia> ner(crps)
┌ Warning: TokenDocument's can only approximate the original text
└ @ TextAnalysis ~/.julia/dev/TextAnalysis/src/document.jl:220
2-element Array{Array{Array{String,1},1},1}:
[["O", "O", "O", "O", "O", "O", "O", "O"]]
[["O", "O", "LOC", "O"]]
```
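
A single document can also be tagged directly; the result contains one tag sequence per sentence. A minimal sketch (the tags shown in the comment are illustrative, based on the examples above):

```julia
julia> doc = StringDocument("John Doe lives in Egypt.")

julia> ner(doc)  # one Array{String,1} of tags per sentence, e.g. [["PER", "PER", "O", "O", "LOC", "O"]]
```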
13 changes: 11 additions & 2 deletions src/TextAnalysis.jl
@@ -11,8 +11,9 @@ module TextAnalysis
using DataStructures
using Statistics

using Flux
using Flux: identity, onehot, onecold, @treelike
using DataDeps
using Flux, Tracker
using Flux: identity, onehot, onecold, @treelike, onehotbatch

import DataFrames.DataFrame
import Base.depwarn
@@ -65,6 +66,8 @@

export CRF, viterbi_decode, crf_loss

export NERTagger, Tracker, Flux

include("tokenizer.jl")
include("ngramizer.jl")
include("document.jl")
@@ -102,7 +105,13 @@ module TextAnalysis
include("CRF/crf_utils.jl")
include("CRF/loss.jl")

# NER
include("sequence/ner_datadeps.jl")
include("sequence/ner.jl")
include("sequence/sequence_models.jl")

function __init__()
pos_tagger_datadep_register()
ner_datadep_register()
end
end
50 changes: 50 additions & 0 deletions src/sequence/ner.jl
@@ -0,0 +1,50 @@
using BSON, Tracker

const NER_Char_UNK = '¿'
const NER_Word_UNK = "<UNK>"

struct NERmodel{M}
model::M
end

function load_model_dicts(filepath)
labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]

return remove_ner_label_prefix.([labels...]), chars_idx, words_idx
end

NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights")

function NERTagger(dicts_path, weights_path)
labels, chars_idx, words_idx = load_model_dicts(dicts_path)
model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path)
NERmodel(model)
end

function (a::NERmodel)(tokens::Array{String,1})
input_oh = [onehotinput(a.model, token) for token in tokens]
return (a.model)(input_oh)
end

function (a::NERmodel)(sentence::AbstractString)
a(WordTokenizers.tokenize(sentence))
end

function (a::NERmodel)(doc::AbstractDocument)
    return vcat(a.(WordTokenizers.split_sentences(text(doc)))) # one tag sequence per sentence
end

function (a::NERmodel)(ngd::NGramDocument)
    throw(ArgumentError("Sequence Labelling is not possible for an NGramDocument"))
end

function (a::NERmodel)(crps::Corpus)
return a.(crps.documents)
end

function remove_ner_label_prefix(str)
    str == "O" && return str
    return str[3:end] # strip the two-character label prefix
end
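
For reference, `remove_ner_label_prefix` normalises the stored labels at load time. A sketch of its behaviour, assuming the stored labels carry two-character prefixes such as `B-`/`I-`:

```julia
remove_ner_label_prefix("B-PER")  # "PER"
remove_ner_label_prefix("I-LOC")  # "LOC"
remove_ner_label_prefix("O")      # "O" (returned unchanged)
```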
31 changes: 31 additions & 0 deletions src/sequence/ner_datadeps.jl
@@ -0,0 +1,31 @@
function ner_datadep_register()
register(DataDep("NER Model Weights",
"""
The weights for NER Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_weights.tar.xz",
"6290353b66c9bdbb794ddcb6063ab52c30145d3918f2f115f19e21fa994282e6",
post_fetch_method = function(fn)
unpack(fn)
dir = "weights"
innerfiles = readdir(dir)
mv.(joinpath.(dir, innerfiles), innerfiles)
rm(dir)
end
))

register(DataDep("NER Model Dicts",
"""
The character and words dict for NER Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_dicts.tar.xz",
"40cfa37da216b990eb9c257aa7994e34d7a7a59d69b2506c6f39120f2688dc11",
post_fetch_method = function(fn)
unpack(fn)
dir = "model_dicts"
innerfiles = readdir(dir)
mv.(joinpath.(dir, innerfiles), innerfiles)
rm(dir)
end
))
end
66 changes: 66 additions & 0 deletions src/sequence/sequence_models.jl
@@ -0,0 +1,66 @@
using BSON, Tracker
mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A}
labels::Array{String, 1} # List of Labels
chars_idx::Dict{Char, Int64} # Dict that maps chars to indices in W_Char_Embed
words_idx::Dict{String, Int64} # Dict that maps words to indices in W_word_Embed
conv1::C # Convolution Layer over W_Char_Embed to give character representation
W_Char_Embed::W # Weights for character embeddings
W_word_Embed::W # Further trained GloVe Embeddings
forward_lstm::L # Forward LSTM
backward::L # Backward LSTM
d_out::D # Dense_out
c::O # CRF
init_α::A # For CRF layer
UNK_Word_idx::Integer
UNK_char_idx::Integer
end

function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx; CHAR_EMBED_DIMS=25, WORD_EMBED_DIMS=100,
CNN_OUTPUT_SIZE=30, CONV_PAD= (0,2), CONV_WINDOW_LENGTH = 3, LSTM_STATE_SIZE = 200)
n = length(labels)
init_α = fill(-10000, (n + 2, 1))
init_α[n + 1] = 0

BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, Conv((CHAR_EMBED_DIMS, CONV_WINDOW_LENGTH), 1=>CNN_OUTPUT_SIZE, pad=CONV_PAD),
rand(CHAR_EMBED_DIMS, length(chars_idx)), rand(WORD_EMBED_DIMS, length(words_idx)),
LSTM(CNN_OUTPUT_SIZE + WORD_EMBED_DIMS, LSTM_STATE_SIZE), LSTM(CNN_OUTPUT_SIZE + WORD_EMBED_DIMS, LSTM_STATE_SIZE),
Dense(LSTM_STATE_SIZE * 2, length(labels) + 2), CRF(n), init_α, UNK_Word_idx, UNK_char_idx)
end

function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx, weights_path)
n = length(labels)
init_α = fill(-10000, (n + 2, 1))
init_α[n + 1] = 0

W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]

BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed,
forward_lstm, backward, d_out, c, init_α, UNK_Word_idx, UNK_char_idx)
end

function (a::BiLSTM_CNN_CRF_Model)(x)
char_features = Chain(x -> reshape(x, size(x)..., 1,1),
a.conv1,
x -> maximum(x, dims=2),
x -> reshape(x, length(x),1))
input_embeddings((w, cs)) = vcat(a.W_word_Embed * w, char_features(a.W_Char_Embed * cs))
backward_lstm(x) = reverse((a.backward).(reverse(x)))
bilstm_layer(x) = vcat.((a.forward_lstm).(x), backward_lstm(x))
m = Chain(x -> input_embeddings.(x),
bilstm_layer,
x -> (a.d_out).(x))

oh_outs = viterbi_decode(a.c, m(x), a.init_α)
Flux.reset!(a.backward)
Flux.reset!(a.forward_lstm)
[a.labels[oh.ix] for oh in oh_outs]
end

onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)),
onehotbatch([get(m.chars_idx, c, m.UNK_char_idx) for c in word], 1:length(m.chars_idx)))
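
For readers tracing the architecture, a rough sketch of the per-token feature flow under the default hyper-parameters above (`CHAR_EMBED_DIMS=25`, `CNN_OUTPUT_SIZE=30`, `WORD_EMBED_DIMS=100`, `LSTM_STATE_SIZE=200`):

```julia
# Sketch of the per-token data flow in BiLSTM_CNN_CRF_Model:
#   chars → W_Char_Embed (25 × n_chars) → conv1 → max over positions → 30-dim char feature
#   word  → W_word_Embed (100 × n_words)                              → 100-dim word embedding
#   vcat  → 130-dim input to the forward and backward LSTMs (200 states each)
#   vcat(forward, backward) → 400-dim → Dense(400, n_labels + 2) → CRF Viterbi decode → labels
```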