Skip to content

Commit

Permalink
get rid of six in doc2vec
Browse files Browse the repository at this point in the history
  • Loading branch information
piskvorky committed Jul 27, 2020
1 parent 4cf4da0 commit 161ad55
Showing 1 changed file with 9 additions and 11 deletions.
20 changes: 9 additions & 11 deletions gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,6 @@
from dataclasses import dataclass
from numpy import zeros, float32 as REAL, vstack, integer, dtype
import numpy as np
-from six.moves import range
-from six import string_types, integer_types, itervalues

from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.utils import deprecated
Expand Down Expand Up @@ -609,7 +607,7 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps
The inferred paragraph vector for the new document.
"""
-if isinstance(doc_words, string_types):
+if isinstance(doc_words, str): # a common mistake; fail with a nicer error
raise TypeError("Parameter doc_words of infer_vector() must be a list of strings (not a single string).")

alpha = alpha or self.alpha
Expand Down Expand Up @@ -661,7 +659,7 @@ def __getitem__(self, tag):
The vector representations of each tag as a matrix (will be 1D if `tag` was a single tag)
"""
-if isinstance(tag, string_types + integer_types + (integer,)):
+if isinstance(tag, (str, int, integer,)):
if tag not in self.wv:
return self.dv[tag]
return self.wv[tag]
Expand Down Expand Up @@ -898,13 +896,13 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
If true, the new provided words in `word_freq` dict will be added to model's vocab.
"""
-logger.info("Processing provided word frequencies")
+logger.info("processing provided word frequencies")
# Instead of scanning text, this will assign provided word frequencies dictionary(word_freq)
-# to be directly the raw vocab
+# to be directly the raw vocab.
raw_vocab = word_freq
logger.info(
-"collected %i different raw word, with total frequency of %i",
-len(raw_vocab), sum(itervalues(raw_vocab))
+"collected %i different raw words, with total frequency of %i",
+len(raw_vocab), sum(raw_vocab.values()),
)

# Since no documents are provided, this is to control the corpus_count
Expand All @@ -929,11 +927,11 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule):
doctags_list = []
for document_no, document in enumerate(corpus_iterable):
if not checked_string_types:
-if isinstance(document.words, string_types):
+if isinstance(document.words, str):
logger.warning(
"Each 'words' should be a list of words (usually unicode strings). "
"First 'words' here is instead plain %s.",
-type(document.words)
+type(document.words),
)
checked_string_types += 1
if document_no % progress_per == 0:
Expand All @@ -948,7 +946,7 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule):

for tag in document.tags:
# Note a document tag during initial corpus scan, for structure sizing.
-if isinstance(tag, integer_types + (integer,)):
+if isinstance(tag, (int, integer,)):
max_rawint = max(max_rawint, tag)
else:
if tag in doctags_lookup:
Expand Down

0 comments on commit 161ad55

Please sign in to comment.