
Commit

fixing fixable FIXMEs, in preparation for 4.0.0beta
piskvorky committed Oct 27, 2020
1 parent 742ed1d commit 52acca5
Showing 5 changed files with 43 additions and 33 deletions.
10 changes: 5 additions & 5 deletions gensim/models/fasttext.py
@@ -30,7 +30,7 @@
.. sourcecode:: pycon
>>> # from gensim.models import FastText # FIXME: why does Sphinx dislike this import?
>>> from gensim.models import FastText
>>> from gensim.test.utils import common_texts # some example sentences
>>>
>>> print(common_texts[0])
@@ -488,9 +488,9 @@ def estimate_memory(self, vocab_size=None, report=None):
hashes = ft_ngram_hashes(word, self.wv.min_n, self.wv.max_n, self.wv.bucket)
num_ngrams += len(hashes)
# A list (64 bytes) with one np.array (100 bytes) per key, with a total of
# num_ngrams uint32s (4 bytes) amongst them
# Only used during training, not stored with the model
report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # FIXME: caching & calc sensible?
# num_ngrams uint32s (4 bytes) amongst them.
# Only used during training, not stored with the model.
report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # TODO: caching & calc sensible?
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
@@ -1193,7 +1193,7 @@ def recalc_char_ngram_buckets(self):
Scan the vocabulary, calculate ngrams and their hashes, and cache the list of ngrams for each known word.
"""
# FIXME: evaluate if precaching even necessary, compared to recalculating as needed
# TODO: evaluate if precaching even necessary, compared to recalculating as needed.
if self.bucket == 0:
self.buckets_word = [np.array([], dtype=np.uint32)] * len(self.index_to_key)
return
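
As a rough illustration of what this method caches: each vocabulary word is expanded into its char-ngrams and every ngram is hashed into one of `bucket` slots. A sketch using helpers referenced elsewhere in this diff (`ft_ngram_hashes` in `estimate_memory`, `compute_ngrams` in the test imports); the ngram list in the comment is indicative, not verified output:

    from gensim.models.fasttext import ft_ngram_hashes
    from gensim.models.fasttext_inner import compute_ngrams

    # Char-ngrams of length 3-5 for one word, with fastText's '<' and '>' word boundaries.
    print(compute_ngrams('human', 3, 5))            # ['<hu', 'hum', 'uma', ..., 'uman>']
    # One bucket id per ngram, as cached in buckets_word (assuming 2,000,000 buckets).
    print(ft_ngram_hashes('human', 3, 5, 2_000_000))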
47 changes: 28 additions & 19 deletions gensim/models/keyedvectors.py
@@ -215,7 +215,7 @@ def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None):
Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
another type is provided here.
mapfile_path : string, optional
FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975
FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975.
"""
self.vector_size = vector_size
# pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos`
@@ -974,7 +974,7 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
one-dimensional numpy array with the size of the vocabulary.
"""
# FIXME: Update to better match & share code with most_similar()
# TODO: Update to better match & share code with most_similar()
if isinstance(topn, Integral) and topn < 1:
return []

@@ -1520,45 +1520,52 @@ def save_word2vec_format(
(in case word vectors are appended with document vectors afterwards).
write_header : bool, optional
If False, don't write the 1st line declaring the count of vectors and dimensions.
FIXME: doc prefix, append, sort_attr
prefix : str, optional
String to prepend in front of each stored word. Default = no prefix.
append : bool, optional
If set, open `fname` in `ab` mode instead of the default `wb` mode.
sort_attr : str, optional
Sort the output vectors in descending order of this attribute. Default: most frequent keys first.
"""
if total_vec is None:
total_vec = len(self.index_to_key)
mode = 'wb' if not append else 'ab'
if 'count' in self.expandos:
# if frequency-info available, store in most-to-least-frequent order
store_order_vocab_keys = sorted(self.key_to_index.keys(), key=lambda k: -self.get_vecattr(k, sort_attr))
else:
store_order_vocab_keys = self.index_to_key
if sort_attr not in self.expandos:
raise ValueError(f"attribute {sort_attr} not present in {self}")
store_order_vocab_keys = sorted(self.key_to_index.keys(), key=lambda k: -self.get_vecattr(k, sort_attr))

if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
with utils.open(fvocab, mode) as vout:
for word in store_order_vocab_keys:
vout.write(utils.to_utf8("%s%s %s\n" % (prefix, word, self.get_vecattr(word, sort_attr))))
vout.write(f"{prefix}{word} {self.get_vecattr(word, sort_attr)}\n")

logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
assert (len(self.index_to_key), self.vector_size) == self.vectors.shape

# after (possibly-empty) initial range of int-only keys,
# store in sorted order: most frequent keys at the top
# After (possibly-empty) initial range of int-only keys,
# store in sorted order: most frequent keys at the top.
# TODO: is this still relevant in 4.0? What is this about?
# Needs clear comments at the very least.
index_id_count = 0
for i, val in enumerate(self.index_to_key):
if not (i == val):
if i != val:
break
index_id_count += 1
keys_to_write = itertools.chain(range(0, index_id_count), store_order_vocab_keys)

# Store the actual vectors to the output file, in the order defined by sort_attr.
with utils.open(fname, mode) as fout:
if write_header:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
fout.write(f"{total_vec} {self.vector_size}".encode('utf8'))
for key in keys_to_write:
row = self[key]
key_vector = self[key]
if binary:
row = row.astype(REAL)
fout.write(utils.to_utf8(prefix + str(key)) + b" " + row.tobytes())
fout.write(f"{prefix}{key} ".encode('utf8') + key_vector.tobytes())
else:
fout.write(utils.to_utf8("%s%s %s\n" % (prefix, str(key), ' '.join(repr(val) for val in row))))
fout.write(f"{prefix}{key} {' '.join(repr(val) for val in key_vector)}\n".encode('utf8'))

@classmethod
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
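
The new `save_word2vec_format` parameters documented in the hunk above (`prefix`, `append`, `sort_attr`, `write_header`) compose as in this usage sketch. It assumes the 4.0 development API names (`vector_size`, `epochs`) and a throwaway temp file name; an illustration, not a snippet from the codebase:

    from gensim.models import Word2Vec
    from gensim.test.utils import common_texts, get_tmpfile

    model = Word2Vec(sentences=common_texts, vector_size=24, min_count=1, epochs=2)
    path = get_tmpfile("demo_vectors.txt")

    # Plain-text word2vec format: every key prefixed, most frequent keys first (sort_attr='count').
    model.wv.save_word2vec_format(path, binary=False, prefix='en_', sort_attr='count')

    # Append a second batch to the same file, without repeating the '<count> <dimensions>' header line.
    model.wv.save_word2vec_format(path, binary=False, prefix='extra_', write_header=False, append=True)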
@@ -1915,9 +1922,11 @@ def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):


def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
"""FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK
Return a numpy array of the given shape. Reuse prior_vectors object or values
to extent possible. Initialize new values randomly if requested."""
"""Return a numpy array of the given shape. Reuse prior_vectors object or values
to the extent possible. Initialize new values randomly if requested.
FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK.
"""
if prior_vectors is None:
prior_vectors = np.zeros((0, 0))
if prior_vectors.shape == target_shape:
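A minimal usage sketch of what `prep_vectors` is for, per the docstring above (the FIXME warns the name and docs may still change before 4.0.0, so treat the import path as provisional):

    import numpy as np
    from gensim.models.keyedvectors import prep_vectors

    old = np.ones((3, 4), dtype=np.float32)            # 3 existing vectors of dimensionality 4
    grown = prep_vectors((5, 4), prior_vectors=old, seed=1)

    assert grown.shape == (5, 4)
    # Per the docstring: prior values are reused where possible, and the extra rows
    # are filled with seeded pseudorandom values.
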
11 changes: 6 additions & 5 deletions gensim/test/test_fasttext.py
@@ -18,8 +18,9 @@
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText as FT_gensim, FastTextKeyedVectors, _unpack
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences, \
lee_corpus_list as list_corpus
from gensim.test.utils import (
datapath, get_tmpfile, temporary_file, common_texts as sentences, lee_corpus_list as list_corpus,
)
from gensim.test.test_word2vec import TestWord2VecModel
import gensim.models._fasttext_bin
from gensim.models.fasttext_inner import compute_ngrams, compute_ngrams_bytes, ft_hash_bytes
@@ -813,7 +814,7 @@ def test_estimate_memory(self):
self.assertEqual(report['syn0_vocab'], 192)
self.assertEqual(report['syn1'], 192)
self.assertEqual(report['syn1neg'], 192)
# FIXME: these fixed numbers for particular implementation generations encumber changes without real QA
# TODO: these fixed numbers for particular implementation generations encumber changes without real QA
# perhaps instead verify reports' total is within some close factor of a deep-audit of actual memory used?
self.assertEqual(report['syn0_ngrams'], model.vector_size * np.dtype(np.float32).itemsize * BUCKET)
self.assertEqual(report['buckets_word'], 688)
@@ -996,10 +997,10 @@ def test_continuation_native(self):
self.model_structural_sanity(native)

#
# Pick a word that's is in both corpuses.
# Pick a word that is in both corpuses.
# Its vectors should be different between training runs.
#
word = 'human' # FIXME: this isn't actually in model, except via OOV ngrams
word = 'society'
old_vector = native.wv.get_vector(word).tolist()

native.train(list_corpus, total_examples=len(list_corpus), epochs=native.epochs)
2 changes: 1 addition & 1 deletion gensim/test/test_ldamodel.py
@@ -292,7 +292,7 @@ def testGetDocumentTopics(self):
self.assertTrue(isinstance(phi_values, list))

# word_topics looks like this: ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
# we check one case in word_topics, i.e of the first word in the doc, and it's likely topics.
# we check one case in word_topics, i.e. of the first word in the doc, and its likely topics.

# FIXME: Fails on osx and win
# expected_word = 0
6 changes: 3 additions & 3 deletions gensim/utils.py
@@ -1679,9 +1679,9 @@ def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
import warnings
warnings.warn("The light flag is no longer supported by pattern.")

# tokenization in `pattern` is weird; it gets thrown off by non-letters,
# producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
# FIXME this throws away all fancy parsing cues, including sentence structure,
# Tokenization in `pattern` is weird; it gets thrown off by non-letters,
# producing '==relate/VBN' or '**/NN'... try to preprocess the text a little.
# XXX: this throws away all fancy parsing cues, including sentence structure,
# abbreviations etc.
content = ' '.join(tokenize(content, lower=True, errors='ignore'))
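
The preprocessing in the `content = ...` line above is just gensim's own tokenizer re-joined into a plain string; a small sketch of its effect on markup-heavy input (illustrative text, not from the test suite):

    from gensim.utils import tokenize

    raw = "The **subsidiary** was ==related== to FooCorp."
    cleaned = ' '.join(tokenize(raw, lower=True, errors='ignore'))
    print(cleaned)   # expected: 'the subsidiary was related to foocorp'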

