Improving Scan_Vocab speed, build_vocab_from_freq function. Iteration 2 #1695

Merged: 21 commits, merged on Nov 8, 2017 (changes shown from all commits).
17 changes: 12 additions & 5 deletions gensim/models/word2vec.py
@@ -647,13 +647,19 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No

Examples
--------
>>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
Contributor: Model is undefined; please create the model first. The docstring should be executable, i.e. I should be able to copy-paste this code into a console and expect it to run successfully. We plan to add doctests to our CI soon.

Contributor Author: 👍
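As a side note (not part of this PR), one way such a docstring example could be checked locally is with Python's standard doctest module, assuming the example is fully self-contained:

    import doctest

    from gensim.models import word2vec

    # Runs every ">>>" example found in the module's docstrings and reports failures.
    results = doctest.testmod(word2vec)
    print("attempted=%d, failed=%d" % (results.attempted, results.failed))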

>>> from gensim.models.word2vec import Word2Vec
>>> model= Word2Vec()
>>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
"""
logger.info("Processing provided word frequencies")
vocab = defaultdict(int, word_freq)
raw_vocab = word_freq  # Instead of scanning text, use the provided word-frequency dictionary (word_freq) directly as the raw vocab
logger.info(
"collected %i different raw word, with total frequency of %i",
len(raw_vocab), sum(itervalues(raw_vocab))
)

self.corpus_count = corpus_count if corpus_count else 0
self.raw_vocab = vocab
self.corpus_count = corpus_count if corpus_count else 0  # Since no sentences are provided, this is to control the corpus_count
Contributor: PEP8 - two spaces before #

Contributor Author: These are 2 spaces, aren't they?

Contributor: Oh, really, sorry.
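For reference, PEP 8 asks for at least two spaces between the end of a statement and an inline comment:

    x = x + 1  # inline comment, separated from the code by at least two spaces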

self.raw_vocab = raw_vocab

self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling
self.finalize_vocab(update=update) # build tables & arrays
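For illustration, a minimal usage sketch of this code path (not part of the diff; the frequency values are made up, mirroring the unit tests below):

    from gensim.models.word2vec import Word2Vec

    freq_dict = {"graph": 3, "system": 4, "eps": 1}  # hypothetical pre-computed counts

    model = Word2Vec(size=10, min_count=0, seed=42)
    # No corpus scan happens here: the dict is used directly as the raw vocabulary.
    model.build_vocab_from_freq(freq_dict)

    print(len(model.wv.vocab))             # 3
    print(model.wv.vocab['system'].count)  # 4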
@@ -682,7 +688,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
)
for word in sentence:
vocab[word] += 1
total_words += 1
total_words += len(sentence)

if self.max_vocab_size and len(vocab) > self.max_vocab_size:
utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
@@ -694,6 +700,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
)
self.corpus_count = sentence_no + 1
self.raw_vocab = vocab
return total_words

def scale_vocab(self, min_count=None, sample=None, dry_run=False,
keep_raw_vocab=False, trim_rule=None, update=False):
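The scan_vocab change adds len(sentence) to total_words once per sentence instead of incrementing it inside the inner word loop, and returns the total so callers no longer have to re-derive it. A simplified standalone sketch of that counting pattern (no pruning or progress logging):

    from collections import defaultdict

    def count_vocab(sentences):
        """Simplified version of the counting loop: per-word vocab counts
        plus a running word total accumulated once per sentence."""
        vocab = defaultdict(int)
        total_words = 0
        for sentence in sentences:
            for word in sentence:
                vocab[word] += 1
            total_words += len(sentence)  # one addition per sentence, not per word
        return vocab, total_words

    vocab, total = count_vocab([["graph", "system"], ["system", "eps"]])
    print(total)  # 4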
39 changes: 35 additions & 4 deletions gensim/test/test_word2vec.py
@@ -97,8 +97,8 @@ def testBuildVocabFromFreq(self):
model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
model_hs.build_vocab_from_freq(freq_dict)
model_neg.build_vocab_from_freq(freq_dict)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(len(model_neg.wv.vocab), 12)
self.assertEqual(len(model_hs.wv.vocab), 12)
self.assertEqual(len(model_neg.wv.vocab), 12)
self.assertEqual(model_hs.wv.vocab['minors'].count, 2)
self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
self.assertEqual(model_hs.wv.vocab['system'].count, 4)
@@ -126,11 +126,42 @@ def testBuildVocabFromFreq(self):
new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
model_hs.build_vocab_from_freq(new_freq_dict, update=True)
model_neg.build_vocab_from_freq(new_freq_dict, update=True)
self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
self.assertEqual(len(model_hs.wv.vocab), 14)
self.assertEqual(len(model_neg.wv.vocab), 14)

def testPruneVocab(self):
"""Test Prune vocab while scanning sentences"""
sentences = [
["graph", "system"],
["graph", "system"],
["system", "eps"],
["graph", "system"]
]
model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
self.assertEqual(len(model.wv.vocab), 2)
self.assertEqual(model.wv.vocab['graph'].count, 3)
self.assertEqual(model.wv.vocab['system'].count, 4)

sentences = [
["graph", "system"],
["graph", "system"],
["system", "eps"],
["graph", "system"],
["minors", "survey", "minors", "survey", "minors"]
]
model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
self.assertEqual(len(model.wv.vocab), 3)
self.assertEqual(model.wv.vocab['graph'].count, 3)
self.assertEqual(model.wv.vocab['minors'].count, 3)
self.assertEqual(model.wv.vocab['system'].count, 4)

def testTotalWordCount(self):
model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
total_words = model.scan_vocab(sentences)
self.assertEqual(total_words, 29)

def testOnlineLearning(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""