-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improving Scan_Vocab speed, build_vocab_from_freq function. Iteration 2 #1695
Changes from all commits
3f30e1e
c4f387e
8abd58b
8ec0433
b9f3a5f
0a5e8d6
644fcad
c91b4cb
1e4ef3e
9ae7a84
1e82811
aa9227d
e156b95
2066a2a
62ed129
473d7e6
a65e36b
7f46a05
f744c4f
6471164
9bc6b78
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -647,13 +647,19 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No | |
|
||
Examples | ||
-------- | ||
>>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True) | ||
>>> from gensim.models.word2vec import Word2Vec | ||
>>> model= Word2Vec() | ||
>>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}) | ||
""" | ||
logger.info("Processing provided word frequencies") | ||
vocab = defaultdict(int, word_freq) | ||
raw_vocab = word_freq # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab | ||
logger.info( | ||
"collected %i different raw word, with total frequency of %i", | ||
len(raw_vocab), sum(itervalues(raw_vocab)) | ||
) | ||
|
||
self.corpus_count = corpus_count if corpus_count else 0 | ||
self.raw_vocab = vocab | ||
self.corpus_count = corpus_count if corpus_count else 0 # Since no sentences are provided, this is to control the corpus_count | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. PEP8 - two spaces before There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These are 2 spaces, aren't they? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, really, sorry |
||
self.raw_vocab = raw_vocab | ||
|
||
self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling | ||
self.finalize_vocab(update=update) # build tables & arrays | ||
|
@@ -682,7 +688,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): | |
) | ||
for word in sentence: | ||
vocab[word] += 1 | ||
total_words += 1 | ||
total_words += len(sentence) | ||
|
||
if self.max_vocab_size and len(vocab) > self.max_vocab_size: | ||
utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) | ||
|
@@ -694,6 +700,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): | |
) | ||
self.corpus_count = sentence_no + 1 | ||
self.raw_vocab = vocab | ||
return total_words | ||
|
||
def scale_vocab(self, min_count=None, sample=None, dry_run=False, | ||
keep_raw_vocab=False, trim_rule=None, update=False): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Model is undefined; please create the model first (the docstring should be executable, i.e. I can copy-paste this code to a console and expect that the code runs successfully). We plan to add doctests to our CI soon.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍