From 9dd5d19e8fff7c521c5984c48c66cee5b784c47d Mon Sep 17 00:00:00 2001
From: mohsin-ashraf
Date: Mon, 27 Jul 2020 21:02:27 +0500
Subject: [PATCH] Ref #2894 fixing segmentation fault for doc2vec

---
 gensim/models/doc2vec_corpusfile.pyx | 50 ++++++++++++++--------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/gensim/models/doc2vec_corpusfile.pyx b/gensim/models/doc2vec_corpusfile.pyx
index 5b8cbeabff..9ebd52ba3b 100644
--- a/gensim/models/doc2vec_corpusfile.pyx
+++ b/gensim/models/doc2vec_corpusfile.pyx
@@ -50,20 +50,20 @@ from gensim.models.word2vec_corpusfile cimport (
 DEF MAX_DOCUMENT_LEN = 10000
 
-cdef int ONE = 1
+cdef long long ONE = 1
 cdef REAL_t ONEF = 1.0
 
 
 cdef void prepare_c_structures_for_batch(
-        vector[string] &doc_words, int sample, int hs, int window, long long *total_words,
-        int *effective_words, unsigned long long *next_random, cvocab_t *vocab,
-        np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
-        np.uint32_t *reduced_windows, int *document_len, int train_words,
-        int docvecs_count, int doc_tag,
+        vector[string] &doc_words, long long sample, long long hs, long long window, long long *total_words,
+        long long *effective_words, unsigned long long *next_random, cvocab_t *vocab,
+        np.uint32_t *indexes, long long *codelens, np.uint8_t **codes, np.uint32_t **points,
+        np.uint32_t *reduced_windows, long long *document_len, long long train_words,
+        long long docvecs_count, long long doc_tag,
     ) nogil:
     cdef VocabItem predict_word
     cdef string token
-    cdef int i = 0
+    cdef long long i = 0
 
     total_words[0] += doc_words.size()
@@ -144,8 +144,8 @@
     """
     cdef Doc2VecConfig c
-    cdef int cur_epoch = _cur_epoch
-    cdef int num_epochs = model.epochs
+    cdef long long cur_epoch = _cur_epoch
+    cdef long long num_epochs = model.epochs
     cdef long long expected_examples = (-1 if _expected_examples is None else _expected_examples)
     cdef long long expected_words = (-1 if _expected_words is None else _expected_words)
     cdef REAL_t start_alpha = model.alpha
@@ -155,14 +155,14 @@ def d2v_train_epoch_dbow(
     cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
     cdef CythonVocab vocab = _cython_vocab
 
-    cdef int i, j, document_len
-    cdef int effective_words = 0
+    cdef long long i, j, document_len
+    cdef long long effective_words = 0
     cdef long long total_documents = 0
     cdef long long total_effective_words = 0, total_words = 0
 
-    cdef int sent_idx, idx_start, idx_end
+    cdef long long sent_idx, idx_start, idx_end
 
     cdef vector[string] doc_words
-    cdef int _doc_tag = start_doctag
+    cdef long long _doc_tag = start_doctag
 
     init_d2v_config(
         &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=train_words,
@@ -283,8 +283,8 @@
     """
     cdef Doc2VecConfig c
-    cdef int cur_epoch = _cur_epoch
-    cdef int num_epochs = model.epochs
+    cdef long long cur_epoch = _cur_epoch
+    cdef long long num_epochs = model.epochs
     cdef long long expected_examples = (-1 if _expected_examples is None else _expected_examples)
     cdef long long expected_words = (-1 if _expected_words is None else _expected_words)
     cdef REAL_t start_alpha = model.alpha
@@ -294,15 +294,15 @@ def d2v_train_epoch_dm(
     cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
     cdef CythonVocab vocab = _cython_vocab
 
-    cdef int i, j, k, m, document_len
-    cdef int effective_words = 0
+    cdef long long i, j, k, m, document_len
+    cdef long long effective_words = 0
     cdef long long total_documents = 0
    cdef long long total_effective_words = 0, total_words = 0
 
-    cdef int sent_idx, idx_start, idx_end
+    cdef long long sent_idx, idx_start, idx_end
     cdef REAL_t count, inv_count = 1.0
 
     cdef vector[string] doc_words
-    cdef int _doc_tag = start_doctag
+    cdef long long _doc_tag = start_doctag
 
     init_d2v_config(
         &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False,
@@ -437,8 +437,8 @@
     """
    cdef Doc2VecConfig c
-    cdef int cur_epoch = _cur_epoch
-    cdef int num_epochs = model.epochs
+    cdef long long cur_epoch = _cur_epoch
+    cdef long long num_epochs = model.epochs
     cdef long long expected_examples = (-1 if _expected_examples is None else _expected_examples)
     cdef long long expected_words = (-1 if _expected_words is None else _expected_words)
     cdef REAL_t start_alpha = model.alpha
@@ -448,14 +448,14 @@ def d2v_train_epoch_dm_concat(
     cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
     cdef CythonVocab vocab = _cython_vocab
 
-    cdef int i, j, k, m, n, document_len
-    cdef int effective_words = 0
+    cdef long long i, j, k, m, n, document_len
+    cdef long long effective_words = 0
     cdef long long total_documents = 0
     cdef long long total_effective_words = 0, total_words = 0
 
-    cdef int sent_idx, idx_start, idx_end
+    cdef long long sent_idx, idx_start, idx_end
 
     cdef vector[string] doc_words
-    cdef int _doc_tag = start_doctag
+    cdef long long _doc_tag = start_doctag
 
     init_d2v_config(
         &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False,
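
Note on the rationale (not part of the patch): the change only widens C-level declarations from `int` to `long long`, and the commit message does not spell out the failure mechanism. A plausible reading, offered here as an assumption, is that 32-bit counters, tags, and offsets wrap around once word or document-tag counts pass INT_MAX (about 2.1 billion), and a wrapped negative value later used as an offset into a vector table produces the reported segmentation fault. A minimal C sketch of that failure mode, with invented variable names, follows.

/* Illustrative sketch only -- not part of the patch.  It assumes the crash
 * comes from 32-bit `int` values wrapping on very large corpora, which is
 * the failure mode that widening `cdef int` to `cdef long long` removes.
 * All names below (true_count, count_32, count_64) are invented for the
 * example. */
#include <stdio.h>
#include <limits.h>

int main(void) {
    /* Suppose one epoch accumulates more words (or document tags) than
     * INT_MAX = 2,147,483,647. */
    long long true_count = (long long)INT_MAX + 1000;

    int count_32 = (int)true_count;   /* old `cdef int`: truncated, typically negative */
    long long count_64 = true_count;  /* new `cdef long long`: preserved exactly */

    printf("32-bit value: %d\n", count_32);
    printf("64-bit value: %lld\n", count_64);

    /* If a truncated, negative value like count_32 is later used as an
     * offset into a vector table (roughly doctag_vectors + tag * size),
     * the access lands outside the allocation -- a segmentation fault. */
    return 0;
}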