
Commit

fixing fixable FIXMEs, in preparation for 4.0.0beta
piskvorky committed Oct 27, 2020
1 parent 742ed1d commit 52acca5
Showing 5 changed files with 43 additions and 33 deletions.
10 changes: 5 additions & 5 deletions gensim/models/fasttext.py
@@ -30,7 +30,7 @@
.. sourcecode:: pycon
>>> # from gensim.models import FastText # FIXME: why does Sphinx dislike this import?
>>> from gensim.models import FastText
>>> from gensim.test.utils import common_texts # some example sentences
>>>
>>> print(common_texts[0])
@@ -488,9 +488,9 @@ def estimate_memory(self, vocab_size=None, report=None):
hashes = ft_ngram_hashes(word, self.wv.min_n, self.wv.max_n, self.wv.bucket)
num_ngrams += len(hashes)
# A list (64 bytes) with one np.array (100 bytes) per key, with a total of
# num_ngrams uint32s (4 bytes) amongst them
# Only used during training, not stored with the model
report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # FIXME: caching & calc sensible?
# num_ngrams uint32s (4 bytes) amongst them.
# Only used during training, not stored with the model.
report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # TODO: caching & calc sensible?
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
@@ -1193,7 +1193,7 @@ def recalc_char_ngram_buckets(self):
Scan the vocabulary, calculate ngrams and their hashes, and cache the list of ngrams for each known word.
"""
# FIXME: evaluate if precaching even necessary, compared to recalculating as needed
# TODO: evaluate if precaching even necessary, compared to recalculating as needed.
if self.bucket == 0:
self.buckets_word = [np.array([], dtype=np.uint32)] * len(self.index_to_key)
return
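
As a rough illustration of what this method caches: each vocabulary word is expanded into its char-ngrams and every ngram is hashed into one of `bucket` slots. A sketch using helpers referenced elsewhere in this diff (`ft_ngram_hashes` in `estimate_memory`, `compute_ngrams` in the test imports); the ngram list in the comment is indicative, not verified output:

    from gensim.models.fasttext import ft_ngram_hashes
    from gensim.models.fasttext_inner import compute_ngrams

    # Char-ngrams of length 3-5 for one word, with fastText's '<' and '>' word boundaries.
    print(compute_ngrams('human', 3, 5))            # ['<hu', 'hum', 'uma', ..., 'uman>']
    # One bucket id per ngram, as cached in buckets_word (assuming 2,000,000 buckets).
    print(ft_ngram_hashes('human', 3, 5, 2_000_000))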
47 changes: 28 additions & 19 deletions gensim/models/keyedvectors.py
@@ -215,7 +215,7 @@ def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None):
Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
another type is provided here.
mapfile_path : string, optional
FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975
FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975.
"""
self.vector_size = vector_size
# pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos`
@@ -974,7 +974,7 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
one-dimensional numpy array with the size of the vocabulary.
"""
# FIXME: Update to better match & share code with most_similar()
# TODO: Update to better match & share code with most_similar()
if isinstance(topn, Integral) and topn < 1:
return []

@@ -1520,45 +1520,52 @@ def save_word2vec_format(
(in case word vectors are appended with document vectors afterwards).
write_header : bool, optional
If False, don't write the 1st line declaring the count of vectors and dimensions.
FIXME: doc prefix, append, sort_attr
prefix : str, optional
String to prepend in front of each stored word. Default = no prefix.
append : bool, optional
If set, open `fname` in `ab` mode instead of the default `wb` mode.
sort_attr : str, optional
Sort the output vectors in descending order of this attribute. Default: most frequent keys first.
"""
if total_vec is None:
total_vec = len(self.index_to_key)
mode = 'wb' if not append else 'ab'
if 'count' in self.expandos:
# if frequency-info available, store in most-to-least-frequent order
store_order_vocab_keys = sorted(self.key_to_index.keys(), key=lambda k: -self.get_vecattr(k, sort_attr))
else:
store_order_vocab_keys = self.index_to_key
if sort_attr not in self.expandos:
raise ValueError(f"attribute {sort_attr} not present in {self}")
store_order_vocab_keys = sorted(self.key_to_index.keys(), key=lambda k: -self.get_vecattr(k, sort_attr))

if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
with utils.open(fvocab, mode) as vout:
for word in store_order_vocab_keys:
vout.write(utils.to_utf8("%s%s %s\n" % (prefix, word, self.get_vecattr(word, sort_attr))))
vout.write(f"{prefix}{word} {self.get_vecattr(word, sort_attr)}\n")

logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
assert (len(self.index_to_key), self.vector_size) == self.vectors.shape

# after (possibly-empty) initial range of int-only keys,
# store in sorted order: most frequent keys at the top
# After (possibly-empty) initial range of int-only keys,
# store in sorted order: most frequent keys at the top.
# TODO: is this still relevant in 4.0? What is this about?
# Needs clear comments at the very least.
index_id_count = 0
for i, val in enumerate(self.index_to_key):
if not (i == val):
if i != val:
break
index_id_count += 1
keys_to_write = itertools.chain(range(0, index_id_count), store_order_vocab_keys)

# Store the actual vectors to the output file, in the order defined by sort_attr.
with utils.open(fname, mode) as fout:
if write_header:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
fout.write(f"{total_vec} {self.vector_size}".encode('utf8'))
for key in keys_to_write:
row = self[key]
key_vector = self[key]
if binary:
row = row.astype(REAL)
fout.write(utils.to_utf8(prefix + str(key)) + b" " + row.tobytes())
fout.write(f"{prefix}{key} ".encode('utf8') + key_vector.tobytes())
else:
fout.write(utils.to_utf8("%s%s %s\n" % (prefix, str(key), ' '.join(repr(val) for val in row))))
fout.write(f"{prefix}{key} {' '.join(repr(val) for val in key_vector)}\n".encode('utf8'))

@classmethod
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
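
The new `save_word2vec_format` parameters documented in the hunk above (`prefix`, `append`, `sort_attr`, `write_header`) compose as in this usage sketch. It assumes the 4.0 development API names (`vector_size`, `epochs`) and a throwaway temp file name; an illustration, not a snippet from the codebase:

    from gensim.models import Word2Vec
    from gensim.test.utils import common_texts, get_tmpfile

    model = Word2Vec(sentences=common_texts, vector_size=24, min_count=1, epochs=2)
    path = get_tmpfile("demo_vectors.txt")

    # Plain-text word2vec format: every key prefixed, most frequent keys first (sort_attr='count').
    model.wv.save_word2vec_format(path, binary=False, prefix='en_', sort_attr='count')

    # Append a second batch to the same file, without repeating the '<count> <dimensions>' header line.
    model.wv.save_word2vec_format(path, binary=False, prefix='extra_', write_header=False, append=True)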
@@ -1915,9 +1922,11 @@ def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):


def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
"""FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK
Return a numpy array of the given shape. Reuse prior_vectors object or values
to extent possible. Initialize new values randomly if requested."""
"""Return a numpy array of the given shape. Reuse prior_vectors object or values
to the extent possible. Initialize new values randomly if requested.
FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK.
"""
if prior_vectors is None:
prior_vectors = np.zeros((0, 0))
if prior_vectors.shape == target_shape:
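A minimal usage sketch of what `prep_vectors` is for, per the docstring above (the FIXME warns the name and docs may still change before 4.0.0, so treat the import path as provisional):

    import numpy as np
    from gensim.models.keyedvectors import prep_vectors

    old = np.ones((3, 4), dtype=np.float32)            # 3 existing vectors of dimensionality 4
    grown = prep_vectors((5, 4), prior_vectors=old, seed=1)

    assert grown.shape == (5, 4)
    # Per the docstring: prior values are reused where possible, and the extra rows
    # are filled with seeded pseudorandom values.
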
11 changes: 6 additions & 5 deletions gensim/test/test_fasttext.py
@@ -18,8 +18,9 @@
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText as FT_gensim, FastTextKeyedVectors, _unpack
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences, \
lee_corpus_list as list_corpus
from gensim.test.utils import (
datapath, get_tmpfile, temporary_file, common_texts as sentences, lee_corpus_list as list_corpus,
)
from gensim.test.test_word2vec import TestWord2VecModel
import gensim.models._fasttext_bin
from gensim.models.fasttext_inner import compute_ngrams, compute_ngrams_bytes, ft_hash_bytes
@@ -813,7 +814,7 @@ def test_estimate_memory(self):
self.assertEqual(report['syn0_vocab'], 192)
self.assertEqual(report['syn1'], 192)
self.assertEqual(report['syn1neg'], 192)
# FIXME: these fixed numbers for particular implementation generations encumber changes without real QA
# TODO: these fixed numbers for particular implementation generations encumber changes without real QA
# perhaps instead verify reports' total is within some close factor of a deep-audit of actual memory used?
self.assertEqual(report['syn0_ngrams'], model.vector_size * np.dtype(np.float32).itemsize * BUCKET)
self.assertEqual(report['buckets_word'], 688)
@@ -996,10 +997,10 @@ def test_continuation_native(self):
self.model_structural_sanity(native)

#
# Pick a word that's is in both corpuses.
# Pick a word that is in both corpuses.
# Its vectors should be different between training runs.
#
word = 'human' # FIXME: this isn't actually in model, except via OOV ngrams
word = 'society'
old_vector = native.wv.get_vector(word).tolist()

native.train(list_corpus, total_examples=len(list_corpus), epochs=native.epochs)
2 changes: 1 addition & 1 deletion gensim/test/test_ldamodel.py
@@ -292,7 +292,7 @@ def testGetDocumentTopics(self):
self.assertTrue(isinstance(phi_values, list))

# word_topics looks like this: ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
# we check one case in word_topics, i.e of the first word in the doc, and it's likely topics.
# we check one case in word_topics, i.e. of the first word in the doc, and its likely topics.

# FIXME: Fails on osx and win
# expected_word = 0
6 changes: 3 additions & 3 deletions gensim/utils.py
@@ -1679,9 +1679,9 @@ def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
import warnings
warnings.warn("The light flag is no longer supported by pattern.")

# tokenization in `pattern` is weird; it gets thrown off by non-letters,
# producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
# FIXME this throws away all fancy parsing cues, including sentence structure,
# Tokenization in `pattern` is weird; it gets thrown off by non-letters,
# producing '==relate/VBN' or '**/NN'... try to preprocess the text a little.
# XXX: this throws away all fancy parsing cues, including sentence structure,
# abbreviations etc.
content = ' '.join(tokenize(content, lower=True, errors='ignore'))
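
The preprocessing in the `content = ...` line above is just gensim's own tokenizer re-joined into a plain string; a small sketch of its effect on markup-heavy input (illustrative text, not from the test suite):

    from gensim.utils import tokenize

    raw = "The **subsidiary** was ==related== to FooCorp."
    cleaned = ' '.join(tokenize(raw, lower=True, errors='ignore'))
    print(cleaned)   # expected: 'the subsidiary was related to foocorp'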

