From 161ad5538b8aeed936ef73fb8ea301762cec08b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 27 Jul 2020 09:31:30 +0200 Subject: [PATCH] get rid of six in doc2vec --- gensim/models/doc2vec.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index bd7633db41..61733812a0 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -74,8 +74,6 @@ from dataclasses import dataclass from numpy import zeros, float32 as REAL, vstack, integer, dtype import numpy as np -from six.moves import range -from six import string_types, integer_types, itervalues from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from gensim.utils import deprecated @@ -609,7 +607,7 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps The inferred paragraph vector for the new document. """ - if isinstance(doc_words, string_types): + if isinstance(doc_words, str): # a common mistake; fail with a nicer error raise TypeError("Parameter doc_words of infer_vector() must be a list of strings (not a single string).") alpha = alpha or self.alpha @@ -661,7 +659,7 @@ def __getitem__(self, tag): The vector representations of each tag as a matrix (will be 1D if `tag` was a single tag) """ - if isinstance(tag, string_types + integer_types + (integer,)): + if isinstance(tag, (str, int, integer,)): if tag not in self.wv: return self.dv[tag] return self.wv[tag] @@ -898,13 +896,13 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No If true, the new provided words in `word_freq` dict will be added to model's vocab. """ - logger.info("Processing provided word frequencies") + logger.info("processing provided word frequencies") # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) - # to be directly the raw vocab + # to be directly the raw vocab. raw_vocab = word_freq logger.info( - "collected %i different raw word, with total frequency of %i", - len(raw_vocab), sum(itervalues(raw_vocab)) + "collected %i different raw words, with total frequency of %i", + len(raw_vocab), sum(raw_vocab.values()), ) # Since no documents are provided, this is to control the corpus_count @@ -929,11 +927,11 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): doctags_list = [] for document_no, document in enumerate(corpus_iterable): if not checked_string_types: - if isinstance(document.words, string_types): + if isinstance(document.words, str): logger.warning( "Each 'words' should be a list of words (usually unicode strings). " "First 'words' here is instead plain %s.", - type(document.words) + type(document.words), ) checked_string_types += 1 if document_no % progress_per == 0: @@ -948,7 +946,7 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): for tag in document.tags: # Note a document tag during initial corpus scan, for structure sizing. - if isinstance(tag, integer_types + (integer,)): + if isinstance(tag, (int, integer,)): max_rawint = max(max_rawint, tag) else: if tag in doctags_lookup: