Merge pull request #167 from Ayushk4/NER_API
Named Entity Recognition
Showing 11 changed files with 803 additions and 9 deletions.
@@ -0,0 +1,29 @@
name = "TextAnalysis"
uuid = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
license = "MIT"
desc = "Julia package for text analysis"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"

[compat]
julia = "1"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
@@ -8,3 +8,4 @@ BSON
JSON
DataStructures
DataDeps
Tracker
@@ -0,0 +1,140 @@
# Named Entity Recognition

The API provided is a pretrained model for tagging Named Entities.
The current model supports four types of Named Entities:

- `PER`: Person
- `LOC`: Location
- `ORG`: Organisation
- `MISC`: Miscellaneous
- `O`: Not a Named Entity

To use the API, we first load the model weights into an instance of the tagger.
The function also accepts paths to the model dicts (for character and word embeddings) and the model weights.

    NERTagger()
    NERTagger(dicts_path, weights_path)

```julia
julia> ner = NERTagger()
```
!!! note
    When you call `NERTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Once downloaded, these are stored locally and managed by `DataDeps`, so on subsequent uses the weights do not need to be downloaded again.
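If the interactive prompt is a problem (e.g. on CI), DataDeps.jl honours the `DATADEPS_ALWAYS_ACCEPT` environment variable, which pre-accepts downloads; a minimal sketch:

```julia
# Pre-accept DataDeps downloads (a standard DataDeps.jl setting, not TextAnalysis-specific).
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

using TextAnalysis
ner = NERTagger()  # downloads and caches Model_dicts and Model_weights on first use
```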
Once we create an instance, we can call it to tag a `String` (sentence), a sequence of tokens, an `AbstractDocument`, or a `Corpus`.

    (ner::NERTagger)(sentence::String)
    (ner::NERTagger)(tokens::Array{String, 1})
    (ner::NERTagger)(sd::StringDocument)
    (ner::NERTagger)(fd::FileDocument)
    (ner::NERTagger)(td::TokenDocument)
    (ner::NERTagger)(crps::Corpus)
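For instance, the document methods run the model per sentence and return one tag array for each; a minimal sketch (the example text is made up, so the tags themselves are not shown):

```julia
julia> doc = StringDocument("John works in London. He visits Paris.")  # hypothetical text

julia> tag_arrays = ner(doc);  # one Array{String,1} of tags per sentence

julia> length(tag_arrays)
2
```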
```julia
julia> sentence = "This package is maintained by John Doe."
"This package is maintained by John Doe."

julia> tags = ner(sentence)
8-element Array{String,1}:
 "O"
 "O"
 "O"
 "O"
 "O"
 "PER"
 "PER"
 "O"
```
The API tokenizes input sentences via the default tokenizer provided by `WordTokenizers`, which is currently set to the multilingual TokTok tokenizer.

```julia
julia> using WordTokenizers

julia> collect(zip(WordTokenizers.tokenize(sentence), tags))
8-element Array{Tuple{String,String},1}:
 ("This", "O")
 ("package", "O")
 ("is", "O")
 ("maintained", "O")
 ("by", "O")
 ("John", "PER")
 ("Doe", "PER")
 (".", "O")
```
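If only the entity tokens are needed, the pairs can be filtered on the tag; a small sketch reusing `sentence` and `tags` from above:

```julia
julia> [token for (token, tag) in zip(WordTokenizers.tokenize(sentence), tags) if tag != "O"]
2-element Array{String,1}:
 "John"
 "Doe"
```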
For tagging multi-sentence text or a document, one can use `split_sentences` from the `WordTokenizers.jl` package and run the NER model on each sentence.
```julia
julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politician." # Sentences taken from the CoNLL 2003 dataset

julia> splitted_sents = WordTokenizers.split_sentences(sentences)

julia> tag_sequences = ner.(splitted_sents)
2-element Array{Array{String,1},1}:
 ["PER", "O", "O", "O", "O", "O", "O", "O", "O"]
 ["O", "O", "O", "O", "O", "PER", "PER", "O", "O", "O", "MISC", "O", "O", "LOC", "O", "O", "ORG", "ORG", "O", "O"]

julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)]

julia> zipped[1]
9-element Array{Tuple{String,String},1}:
 ("PER", "Rabinov")
 ("O", "is")
 ("O", "winding")
 ("O", "up")
 ("O", "his")
 ("O", "term")
 ("O", "as")
 ("O", "ambassador")
 ("O", ".")

julia> zipped[2]
20-element Array{Tuple{String,String},1}:
 ("O", "He")
 ("O", "will")
 ("O", "be")
 ("O", "replaced")
 ("O", "by")
 ("PER", "Eliahu")
 ("PER", "Ben-Elissar")
 ("O", ",")
 ("O", "a")
 ("O", "former")
 ("MISC", "Israeli")
 ("O", "envoy")
 ("O", "to")
 ("LOC", "Egypt")
 ("O", "and")
 ("O", "right-wing")
 ("ORG", "Likud")
 ("ORG", "party")
 ("O", "politician")
 ("O", ".")
```
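To recover multi-token entities such as "Eliahu Ben-Elissar", contiguous tokens carrying the same non-`O` tag can be merged. A minimal sketch; the helper `group_entities` is ours for illustration, not part of the API:

```julia
# Hypothetical helper: merge runs of identically tagged, non-"O" tokens
# into (entity, tag) pairs.
function group_entities(tokens, tags)
    entities = Tuple{String,String}[]
    i = 1
    while i <= length(tags)
        if tags[i] == "O"
            i += 1
            continue
        end
        j = i
        while j < length(tags) && tags[j + 1] == tags[i]
            j += 1
        end
        push!(entities, (join(tokens[i:j], " "), tags[i]))
        i = j + 1
    end
    return entities
end

group_entities(WordTokenizers.tokenize(splitted_sents[2]), tag_sequences[2])
# => [("Eliahu Ben-Elissar", "PER"), ("Israeli", "MISC"), ("Egypt", "LOC"), ("Likud party", "ORG")]
```

Note that since the API strips the BIO prefixes from its labels (see `remove_ner_label_prefix` in the source below), two distinct adjacent entities of the same type would be merged by this heuristic.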
Since Named Entity tagging is done at the sentence level, the text of an `AbstractDocument` is split into sentences and then labelled sentence by sentence. This is, however, not possible for an `NGramDocument`, as its text cannot be recreated. For a `TokenDocument`, the text can only be approximated before splitting into sentences, hence the following throws a warning when tagging the `Corpus`.
```julia
julia> crps = Corpus([StringDocument("We aRE vErY ClOSE tO ThE HEaDQuarTeRS."), TokenDocument("this is Bangalore.")])
A Corpus with 2 documents:
 * 1 StringDocument's
 * 0 FileDocument's
 * 1 TokenDocument's
 * 0 NGramDocument's

Corpus's lexicon contains 0 tokens
Corpus's index contains 0 tokens

julia> ner(crps)
┌ Warning: TokenDocument's can only approximate the original text
└ @ TextAnalysis ~/.julia/dev/TextAnalysis/src/document.jl:220
2-element Array{Array{Array{String,1},1},1}:
 [["O", "O", "O", "O", "O", "O", "O", "O"]]
 [["O", "O", "LOC", "O"]]
```
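An `NGramDocument` cannot be tagged at all, since its text cannot be recreated, and the model throws an error for it. A hedged sketch for skipping such documents when tagging a mixed corpus:

```julia
# Illustrative filtering, not part of the API: drop NGramDocument's before tagging.
taggable = filter(doc -> !(doc isa NGramDocument), crps.documents)
tags = ner.(taggable)
```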
@@ -0,0 +1,50 @@
using BSON, Tracker

const NER_Char_UNK = '¿'
const NER_Word_UNK = "<UNK>"

struct NERmodel{M}
    model::M
end

function load_model_dicts(filepath)
    labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
    chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
    words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]

    return remove_ner_label_prefix.([labels...]), chars_idx, words_idx
end

NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights")

function NERTagger(dicts_path, weights_path)
    labels, chars_idx, words_idx = load_model_dicts(dicts_path)
    model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[NER_Char_UNK], words_idx[NER_Word_UNK], weights_path)
    NERmodel(model)
end

function (a::NERmodel)(tokens::Array{String,1})
    input_oh = [onehotinput(a.model, token) for token in tokens]
    return (a.model)(input_oh)
end

function (a::NERmodel)(sentence::AbstractString)
    a(WordTokenizers.tokenize(sentence))
end

function (a::NERmodel)(doc::AbstractDocument)
    return vcat(a.(WordTokenizers.split_sentences(text(doc))))  # one tag array per sentence
end

function (a::NERmodel)(ngd::NGramDocument)
    error("Sequence labelling is not possible for an NGramDocument")
end

function (a::NERmodel)(crps::Corpus)
    return a.(crps.documents)
end

# Strip the label prefix (e.g. "B-" / "I-"), leaving "O" untouched.
function remove_ner_label_prefix(str)
    str == "O" && return str
    str = str[3:end]
end
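For illustration, `remove_ner_label_prefix` collapses BIO-style labels (assuming two-character `B-`/`I-` prefixes, as the indexing suggests) to bare entity types:

```julia
remove_ner_label_prefix("B-PER")  # => "PER"
remove_ner_label_prefix("I-ORG")  # => "ORG"
remove_ner_label_prefix("O")      # => "O" (left untouched)
```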
@@ -0,0 +1,31 @@
function ner_datadep_register()
    register(DataDep("NER Model Weights",
        """
        The weights for the NER Sequence Labelling Model.
        """,
        "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_weights.tar.xz",
        "6290353b66c9bdbb794ddcb6063ab52c30145d3918f2f115f19e21fa994282e6",
        post_fetch_method = function(fn)
            unpack(fn)  # unpack (from DataDeps) extracts the tarball
            dir = "weights"
            innerfiles = readdir(dir)
            mv.(joinpath.(dir, innerfiles), innerfiles)  # move files up one level
            rm(dir)
        end
    ))

    register(DataDep("NER Model Dicts",
        """
        The character and word dicts for the NER Sequence Labelling Model.
        """,
        "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_dicts.tar.xz",
        "40cfa37da216b990eb9c257aa7994e34d7a7a59d69b2506c6f39120f2688dc11",
        post_fetch_method = function(fn)
            unpack(fn)
            dir = "model_dicts"
            innerfiles = readdir(dir)
            mv.(joinpath.(dir, innerfiles), innerfiles)
            rm(dir)
        end
    ))
end
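DataDeps registrations typically run at package load time, so `ner_datadep_register()` is presumably called from the package's `__init__`; an illustrative sketch, not necessarily the package's actual wiring:

```julia
function __init__()
    ner_datadep_register()  # makes datadep"NER Model Weights" / datadep"NER Model Dicts" resolvable
end
```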
@@ -0,0 +1,66 @@
using BSON, Tracker

mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A}
    labels::Array{String, 1}       # List of labels
    chars_idx::Dict{Char, Int64}   # Dict that maps chars to indices in W_Char_Embed
    words_idx::Dict{String, Int64} # Dict that maps words to indices in W_word_Embed
    conv1::C          # Convolution layer over W_Char_Embed to give character representation
    W_Char_Embed::W   # Weights for character embeddings
    W_word_Embed::W   # Further trained GloVe embeddings
    forward_lstm::L   # Forward LSTM
    backward::L       # Backward LSTM
    d_out::D          # Dense_out
    c::O              # CRF
    init_α::A         # For CRF layer
    UNK_Word_idx::Integer
    UNK_char_idx::Integer
end

function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx; CHAR_EMBED_DIMS=25, WORD_EMBED_DIMS=100,
        CNN_OUTPUT_SIZE=30, CONV_PAD=(0, 2), CONV_WINDOW_LENGTH=3, LSTM_STATE_SIZE=200)
    n = length(labels)
    init_α = fill(-10000, (n + 2, 1))
    init_α[n + 1] = 0

    BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, Conv((CHAR_EMBED_DIMS, CONV_WINDOW_LENGTH), 1=>CNN_OUTPUT_SIZE, pad=CONV_PAD),
        rand(CHAR_EMBED_DIMS, length(chars_idx)), rand(WORD_EMBED_DIMS, length(words_idx)),
        LSTM(CNN_OUTPUT_SIZE + WORD_EMBED_DIMS, LSTM_STATE_SIZE), LSTM(CNN_OUTPUT_SIZE + WORD_EMBED_DIMS, LSTM_STATE_SIZE),
        Dense(LSTM_STATE_SIZE * 2, length(labels) + 2), CRF(n), init_α, UNK_Word_idx, UNK_char_idx)
end

function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx, UNK_Word_idx, weights_path)
    n = length(labels)
    init_α = fill(-10000, (n + 2, 1))
    init_α[n + 1] = 0

    W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
    W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
    forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
    backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
    d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
    c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
    conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]

    BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed,
        forward_lstm, backward, d_out, c, init_α, UNK_Word_idx, UNK_char_idx)
end

function (a::BiLSTM_CNN_CRF_Model)(x)
    char_features = Chain(x -> reshape(x, size(x)..., 1, 1),
                          a.conv1,
                          x -> maximum(x, dims=2),
                          x -> reshape(x, length(x), 1))
    input_embeddings((w, cs)) = vcat(a.W_word_Embed * w, char_features(a.W_Char_Embed * cs))
    backward_lstm(x) = reverse((a.backward).(reverse(x)))
    bilstm_layer(x) = vcat.((a.forward_lstm).(x), backward_lstm(x))
    m = Chain(x -> input_embeddings.(x),
              bilstm_layer,
              x -> (a.d_out).(x))

    oh_outs = viterbi_decode(a.c, m(x), a.init_α)
    Flux.reset!(a.backward)
    Flux.reset!(a.forward_lstm)
    [a.labels[oh.ix] for oh in oh_outs]
end

onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)),
                                              onehotbatch([get(m.chars_idx, c, m.UNK_char_idx) for c in word], 1:length(m.chars_idx)))