From 52acca5f85a5779ada3bb641fb67eb0e5fbf0f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Tue, 27 Oct 2020 18:58:03 +0100 Subject: [PATCH] fixing fixable FIXMEs, in preparation for 4.0.0beta --- gensim/models/fasttext.py | 10 ++++---- gensim/models/keyedvectors.py | 47 +++++++++++++++++++++-------------- gensim/test/test_fasttext.py | 11 ++++---- gensim/test/test_ldamodel.py | 2 +- gensim/utils.py | 6 ++--- 5 files changed, 43 insertions(+), 33 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 433df55fcd..5832330611 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -30,7 +30,7 @@ .. sourcecode:: pycon - >>> # from gensim.models import FastText # FIXME: why does Sphinx dislike this import? + >>> from gensim.models import FastText >>> from gensim.test.utils import common_texts # some example sentences >>> >>> print(common_texts[0]) @@ -488,9 +488,9 @@ def estimate_memory(self, vocab_size=None, report=None): hashes = ft_ngram_hashes(word, self.wv.min_n, self.wv.max_n, self.wv.bucket) num_ngrams += len(hashes) # A list (64 bytes) with one np.array (100 bytes) per key, with a total of - # num_ngrams uint32s (4 bytes) amongst them - # Only used during training, not stored with the model - report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # FIXME: caching & calc sensible? + # num_ngrams uint32s (4 bytes) amongst them. + # Only used during training, not stored with the model. + report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # TODO: caching & calc sensible? report['total'] = sum(report.values()) logger.info( "estimated required memory for %i words, %i buckets and %i dimensions: %i bytes", @@ -1193,7 +1193,7 @@ def recalc_char_ngram_buckets(self): Scan the vocabulary, calculate ngrams and their hashes, and cache the list of ngrams for each known word. """ - # FIXME: evaluate if precaching even necessary, compared to recalculating as needed + # TODO: evaluate if precaching even necessary, compared to recalculating as needed. if self.bucket == 0: self.buckets_word = [np.array([], dtype=np.uint32)] * len(self.index_to_key) return diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index a27aea287b..f39aa181b4 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -215,7 +215,7 @@ def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless another type is provided here. mapfile_path : string, optional - FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975 + FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975. """ self.vector_size = vector_size # pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos` @@ -974,7 +974,7 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): one-dimensional numpy array with the size of the vocabulary. """ - # FIXME: Update to better match & share code with most_similar() + # TODO: Update to better match & share code with most_similar() if isinstance(topn, Integral) and topn < 1: return [] @@ -1520,45 +1520,52 @@ def save_word2vec_format( (in case word vectors are appended with document vectors afterwards). write_header : bool, optional If False, don't write the 1st line declaring the count of vectors and dimensions. 
- FIXME: doc prefix, append, sort_attr
+ prefix : str, optional
+ String to prepend in front of each stored word. Default = no prefix.
+ append : bool, optional
+ If set, open `fname` in `ab` mode instead of the default `wb` mode.
+ sort_attr : str, optional
+ Sort the output vectors in descending order of this attribute. Default: most frequent keys first.
+
"""
if total_vec is None:
total_vec = len(self.index_to_key)
mode = 'wb' if not append else 'ab'
- if 'count' in self.expandos:
- # if frequency-info available, store in most-to-least-frequent order
- store_order_vocab_keys = sorted(self.key_to_index.keys(), key=lambda k: -self.get_vecattr(k, sort_attr))
- else:
- store_order_vocab_keys = self.index_to_key
+ if sort_attr not in self.expandos:
+ raise ValueError(f"attribute {sort_attr} not present in {self}")
+ store_order_vocab_keys = sorted(self.key_to_index.keys(), key=lambda k: -self.get_vecattr(k, sort_attr))
if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
with utils.open(fvocab, mode) as vout:
for word in store_order_vocab_keys:
- vout.write(utils.to_utf8("%s%s %s\n" % (prefix, word, self.get_vecattr(word, sort_attr))))
+ vout.write(f"{prefix}{word} {self.get_vecattr(word, sort_attr)}\n".encode('utf8'))
logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
assert (len(self.index_to_key), self.vector_size) == self.vectors.shape
- # after (possibly-empty) initial range of int-only keys,
- # store in sorted order: most frequent keys at the top
+ # After (possibly-empty) initial range of int-only keys,
+ # store in sorted order: most frequent keys at the top.
+ # TODO: is this still relevant in 4.0? What is this about?
+ # Needs clear comments at the very least.
index_id_count = 0
for i, val in enumerate(self.index_to_key):
- if not (i == val):
+ if i != val:
break
index_id_count += 1
keys_to_write = itertools.chain(range(0, index_id_count), store_order_vocab_keys)
+ # Store the actual vectors to the output file, in the order defined by sort_attr.
with utils.open(fname, mode) as fout:
if write_header:
- fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
+ fout.write(f"{total_vec} {self.vector_size}\n".encode('utf8'))
for key in keys_to_write:
- row = self[key]
+ key_vector = self[key]
if binary:
- row = row.astype(REAL)
- fout.write(utils.to_utf8(prefix + str(key)) + b" " + row.tobytes())
+ key_vector = key_vector.astype(REAL)
+ fout.write(f"{prefix}{key} ".encode('utf8') + key_vector.tobytes())
else:
- fout.write(utils.to_utf8("%s%s %s\n" % (prefix, str(key), ' '.join(repr(val) for val in row))))
+ fout.write(f"{prefix}{key} {' '.join(repr(val) for val in key_vector)}\n".encode('utf8'))
@classmethod
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
@@ -1915,9 +1922,11 @@ def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):
def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
- """FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK
- Return a numpy array of the given shape. Reuse prior_vectors object or values
- to extent possible. Initialize new values randomly if requested."""
+ """Return a numpy array of the given shape. Reuse prior_vectors object or values
+ to the extent possible. Initialize new values randomly if requested.
+
+ FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK.
+ """ if prior_vectors is None: prior_vectors = np.zeros((0, 0)) if prior_vectors.shape == target_shape: diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 3d3537d03e..4864802b5c 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -18,8 +18,9 @@ from gensim.models.word2vec import LineSentence from gensim.models.fasttext import FastText as FT_gensim, FastTextKeyedVectors, _unpack from gensim.models.keyedvectors import KeyedVectors -from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences, \ - lee_corpus_list as list_corpus +from gensim.test.utils import ( + datapath, get_tmpfile, temporary_file, common_texts as sentences, lee_corpus_list as list_corpus, +) from gensim.test.test_word2vec import TestWord2VecModel import gensim.models._fasttext_bin from gensim.models.fasttext_inner import compute_ngrams, compute_ngrams_bytes, ft_hash_bytes @@ -813,7 +814,7 @@ def test_estimate_memory(self): self.assertEqual(report['syn0_vocab'], 192) self.assertEqual(report['syn1'], 192) self.assertEqual(report['syn1neg'], 192) - # FIXME: these fixed numbers for particular implementation generations encumber changes without real QA + # TODO: these fixed numbers for particular implementation generations encumber changes without real QA # perhaps instead verify reports' total is within some close factor of a deep-audit of actual memory used? self.assertEqual(report['syn0_ngrams'], model.vector_size * np.dtype(np.float32).itemsize * BUCKET) self.assertEqual(report['buckets_word'], 688) @@ -996,10 +997,10 @@ def test_continuation_native(self): self.model_structural_sanity(native) # - # Pick a word that's is in both corpuses. + # Pick a word that is in both corpuses. # Its vectors should be different between training runs. # - word = 'human' # FIXME: this isn't actually in model, except via OOV ngrams + word = 'society' old_vector = native.wv.get_vector(word).tolist() native.train(list_corpus, total_examples=len(list_corpus), epochs=native.epochs) diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index f1c17ac0c9..2b92e3887b 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -292,7 +292,7 @@ def testGetDocumentTopics(self): self.assertTrue(isinstance(phi_values, list)) # word_topics looks like this: ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]). - # we check one case in word_topics, i.e of the first word in the doc, and it's likely topics. + # we check one case in word_topics, i.e of the first word in the doc, and its likely topics. # FIXME: Fails on osx and win # expected_word = 0 diff --git a/gensim/utils.py b/gensim/utils.py index 49877249f1..ba6171f109 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1679,9 +1679,9 @@ def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False, import warnings warnings.warn("The light flag is no longer supported by pattern.") - # tokenization in `pattern` is weird; it gets thrown off by non-letters, - # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little - # FIXME this throws away all fancy parsing cues, including sentence structure, + # Tokenization in `pattern` is weird; it gets thrown off by non-letters, + # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little. + # XXX: this throws away all fancy parsing cues, including sentence structure, # abbreviations etc. content = ' '.join(tokenize(content, lower=True, errors='ignore'))