Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove native Python implementations of Cython extensions #2630

Merged
merged 23 commits into from
Oct 25, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 1 addition & 178 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1426,181 +1426,4 @@ def close(self):
# try to load fast, cythonized code if possible
from gensim.corpora._mmreader import MmReader
except ImportError:
FAST_VERSION = -1

class MmReader(object):
    """Matrix market file reader, used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`.

    Wrap a term-document matrix on disk (in matrix-market format), and present it
    as an object which supports iteration over the rows (~documents).

    Attributes
    ----------
    num_docs : int
        Number of documents in market matrix file.
    num_terms : int
        Number of terms.
    num_nnz : int
        Number of non-zero terms.

    Notes
    -----
    Note that the file is read into memory one document at a time, not the whole matrix at once
    (unlike e.g. `scipy.io.mmread` and other implementations).
    This allows us to process corpora which are larger than the available RAM.

    """
    def __init__(self, input, transposed=True):
        """

        Parameters
        ----------
        input : {str, file-like object}
            Path to the input file in MM format or a file-like object that supports `seek()`
            (e.g. smart_open objects).
        transposed : bool, optional
            Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`?

        Raises
        ------
        ValueError
            If the first line of the input is not a
            `%%MatrixMarket matrix coordinate real general` banner.

        """
        logger.info("initializing corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        with utils.open_file(self.input) as lines:
            # Only `next()` can raise StopIteration, so keep the `try` body minimal.
            try:
                first_line = next(lines)
            except StopIteration:
                # Empty input: deliberately tolerated, the corpus ends up with all-zero dimensions.
                first_line = None

            if first_line is not None:
                header = utils.to_unicode(first_line).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                        (self.input, header)
                    )

            self.num_docs = self.num_terms = self.num_nnz = 0
            for line in lines:
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    # The first non-comment line holds the three matrix dimensions.
                    self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )

    def __len__(self):
        """Get the corpus size: total number of documents."""
        return self.num_docs

    def __str__(self):
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def skip_headers(self, input_file):
        """Skip file headers that appear before the first document.

        Consumes every leading line that starts with ``%`` and also the first line that
        does not (the dimensions line), so that `input_file` is left positioned at the
        first matrix entry.

        Parameters
        ----------
        input_file : iterable of bytes
            Iterable taken from file in MM format. Lines are compared against ``b'%'``,
            so they must be bytes rather than text.

        """
        for line in input_file:
            if line.startswith(b'%'):
                continue
            break

    def __iter__(self):
        """Iterate through all documents in the corpus.

        Notes
        ------
        Note that the total number of vectors returned is always equal to the number of rows specified
        in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly
        stored in the Matrix Market file.

        Yields
        ------
        (int, list of (int, number))
            Document id and document in sparse bag-of-words format.

        """
        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:
                docid, termid, val = utils.to_unicode(line).split()  # needed for python3
                if not self.transposed:
                    termid, docid = docid, termid
                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is prevId)
                    if previd >= 0:
                        yield previd, document  # noqa:F821

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in range(previd + 1, docid):
                        yield previd, []

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val,))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in range(previd + 1, self.num_docs):
            yield previd, []

    def docbyoffset(self, offset):
        """Get the document at file offset `offset` (in bytes).

        Parameters
        ----------
        offset : int
            File offset, in bytes, of the desired document.

        Returns
        ------
        list of (int, float)
            Document in sparse bag-of-words format.

        """
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        if isinstance(self.input, string_types):
            fin, close_fin = utils.open(self.input, 'rb'), True
        else:
            # A caller-supplied file object stays open: we do not own it.
            fin, close_fin = self.input, False

        try:
            fin.seek(offset)  # works for gzip/bz2 input, too
            previd, document = -1, []
            for line in fin:
                docid, termid, val = line.split()
                if not self.transposed:
                    termid, docid = docid, termid
                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    if previd >= 0:
                        # reached the first entry of the next document: stop reading
                        break
                    previd = docid

                document.append((termid, val,))  # add another field to the current document
        finally:
            # Close the handle we opened even when seeking or parsing raises.
            if close_fin:
                fin.close()
        return document
raise utils.NO_CYTHON
15 changes: 1 addition & 14 deletions gensim/models/base_any2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ def _set_train_params(self, **kwargs):

def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100, epochs=5, callbacks=(),
batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5,
ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, **kwargs):
"""

Parameters
Expand Down Expand Up @@ -712,8 +712,6 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
compute_loss : bool, optional
If True, loss will be computed while training the Word2Vec model and stored in
:attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss` attribute.
fast_version : {-1, 1}, optional
Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
**kwargs : object
Key word arguments needed to allow children classes to accept more arguments.

Expand All @@ -738,17 +736,6 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
super(BaseWordEmbeddingsModel, self).__init__(
workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words)

if fast_version < 0:
warnings.warn(
"C extension not loaded, training will be slow. "
"Install a C compiler and reinstall gensim for fast training."
)
self.neg_labels = []
if self.negative > 0:
# precompute negative labels optimization for pure-python training
self.neg_labels = zeros(self.negative + 1)
self.neg_labels[0] = 1.

if sentences is not None or corpus_file is not None:
self._check_input_data_sanity(data_iterable=sentences, corpus_file=corpus_file)
if corpus_file is not None and not isinstance(corpus_file, string_types):
Expand Down
1 change: 0 additions & 1 deletion gensim/models/deprecated/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@

logger = logging.getLogger(__name__)

FAST_VERSION = -1
MAX_WORDS_IN_BATCH = 10000


Expand Down
25 changes: 0 additions & 25 deletions gensim/models/deprecated/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,6 @@

logger = logging.getLogger(__name__)


# failed... fall back to plain numpy (20-80x slower training than the above)
FAST_VERSION = -1
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
MAX_WORDS_IN_BATCH = 10000


Expand Down Expand Up @@ -588,11 +585,6 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,

self.load = call_on_class_only

if FAST_VERSION == -1:
logger.warning('Slow version of %s is being used', __name__)
else:
logger.debug('Fast version of %s is being used', __name__)

self.initialize_word_vectors()
self.sg = int(sg)
self.cum_table = None # for negative sampling
Expand Down Expand Up @@ -1007,16 +999,6 @@ def train(self, sentences, total_examples=None, total_words=None,
"""
if self.model_trimmed_post_training:
raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
if FAST_VERSION < 0:
warnings.warn(
"C extension not loaded for Word2Vec, training will be slow. "
"Install a C compiler and reinstall gensim for fast training."
)
self.neg_labels = []
if self.negative > 0:
# precompute negative labels optimization for pure-python training
self.neg_labels = zeros(self.negative + 1)
self.neg_labels[0] = 1.

if compute_loss:
self.compute_loss = compute_loss
Expand Down Expand Up @@ -1234,12 +1216,6 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
.. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb

"""
if FAST_VERSION < 0:
warnings.warn(
"C extension compilation failed, scoring will be slow. "
"Install a C compiler and reinstall gensim for fastness."
)

logger.info(
"scoring sentences with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s and negative=%s",
Expand Down Expand Up @@ -1852,7 +1828,6 @@ def __iter__(self):
level=logging.INFO
)
logger.info("running %s", " ".join(sys.argv))
logger.info("using optimization %s", FAST_VERSION)

# check and process cmdline input
program = os.path.basename(sys.argv[0])
Expand Down
Loading