From 91d219a7ae3fe1791685496d204090330065cb7e Mon Sep 17 00:00:00 2001 From: float64 Date: Tue, 28 Nov 2017 21:54:46 -0500 Subject: [PATCH 1/2] inline a barebones version of logsumexp for improved performance logsumexp accounts for 50% of the run time of ldamodel. Much of this time is spent by "robustness" checks performed by scipy's logsumexp (e.g., _asarray_validated, checks for NaNs, etc.). Removing these checks greatly improves the overall performance of ldamodel. E.g., run time when fitting an LDA model on the Enron dataset (from UCI) decreases by 20-40%. --- gensim/models/ldamodel.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 1b50f0a9b8..20155036a9 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -47,16 +47,28 @@ from gensim.models import basemodel, CoherenceModel from gensim.models.callbacks import Callback -# log(sum(exp(x))) that tries to avoid overflow -try: - from scipy.special import logsumexp -except ImportError: - from scipy.misc import logsumexp - logger = logging.getLogger('gensim.models.ldamodel') +def logsumexp(x): + """ + barebones log-sum-exp that tries to avoid overflows + + Args: + x: 1d ndarray + + Note: + does not support NaNs + + """ + x_max = np.max(x) + x = np.log(np.sum(np.exp(x - x_max))) + x += x_max + + return x + + def update_dir_prior(prior, N, logphat, rho): """ Updates a given prior using Newton's method, described in From 44b8eba30d694c16c73453beab57829b2effbe91 Mon Sep 17 00:00:00 2001 From: float64 Date: Tue, 28 Nov 2017 21:54:46 -0500 Subject: [PATCH 2/2] inline a barebones version of logsumexp for improved performance logsumexp accounts for 50% of the run time of ldamodel. Much of this time is spent by "robustness" checks performed by scipy's logsumexp (e.g., _asarray_validated, checks for NaNs, etc.). Removing these checks greatly improves the overall performance of ldamodel. 
E.g., run time when fitting an LDA model on the Enron dataset (from UCI) decreases by 20-40%. --- gensim/models/ldamodel.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 1b50f0a9b8..20155036a9 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -47,16 +47,28 @@ from gensim.models import basemodel, CoherenceModel from gensim.models.callbacks import Callback -# log(sum(exp(x))) that tries to avoid overflow -try: - from scipy.special import logsumexp -except ImportError: - from scipy.misc import logsumexp - logger = logging.getLogger('gensim.models.ldamodel') +def logsumexp(x): + """ + barebones log-sum-exp that tries to avoid overflows + + Args: + x: 1d ndarray + + Note: + does not support NaNs + + """ + x_max = np.max(x) + x = np.log(np.sum(np.exp(x - x_max))) + x += x_max + + return x + + def update_dir_prior(prior, N, logphat, rho): """ Updates a given prior using Newton's method, described in