From 01c3dde9a3b7f480dd1fe68b142ea206cfae1776 Mon Sep 17 00:00:00 2001
From: Chinmaya Pancholi
Date: Fri, 24 Mar 2017 14:00:29 +0530
Subject: [PATCH 1/8] removed unnecessary keep_vocab_item import

---
 gensim/models/word2vec.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 000eee6976..4ffef587f0 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -108,8 +108,9 @@
 import itertools
 import warnings
 
+import operator
+
 from gensim.utils import keep_vocab_item, call_on_class_only
-from gensim.utils import keep_vocab_item
 from gensim.models.keyedvectors import KeyedVectors, Vocab
 
 try:
@@ -361,7 +362,7 @@ class Word2Vec(utils.SaveLoad):
 
     """
     def __init__(
-            self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
+            self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab = None,
             max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
             sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
             trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH):
@@ -456,6 +457,7 @@ def __init__(
         self.seed = seed
         self.random = random.RandomState(seed)
         self.min_count = min_count
+        self.max_vocab = max_vocab
         self.sample = sample
         self.workers = int(workers)
         self.min_alpha = float(min_alpha)
@@ -549,6 +551,11 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_
 
         """
         self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
+
+        # min_count_actual =
+        self.compute_min_count(self.raw_vocab, self.max_vocab)
+        # print min_count_actual
+
         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
 
@@ -582,6 +589,25 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
 
+    def compute_min_count(self, vocab1, max_vocab1):
+        vocab_sorted_by_count = sorted(vocab1.items(), key=operator.itemgetter(1), reverse=True)
+        print vocab_sorted_by_count
+        word_counts = zeros(len(vocab_sorted_by_count))
+        for x in range(len(vocab_sorted_by_count)):
+            if x == 0:
+                word_counts[x] = (vocab_sorted_by_count[x][1])
+            else:
+                word_counts[x] = (vocab_sorted_by_count[x][1] + word_counts[x-1])
+
+        print word_counts
+
+        index1 = word_counts.searchsorted(max_vocab1)
+        print index1
+        if index1 != len(word_counts):
+            print vocab_sorted_by_count[index1+1][1]
+        else :
+            print 0
+
     def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, update=False):
         """
         Apply vocabulary settings for `min_count` (discarding less-frequent words)

From 8797cd122ac74da4f9e4f903b8e74633b0cda611 Mon Sep 17 00:00:00 2001
From: Chinmaya Pancholi
Date: Fri, 24 Mar 2017 14:01:33 +0530
Subject: [PATCH 2/8] removed duplicate warnings import

---
 gensim/models/word2vec.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 4ffef587f0..6659e222e6 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -801,7 +801,6 @@ def train(self, sentences, total_words=None, word_count=0,
         if (self.model_trimmed_post_training):
             raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
         if FAST_VERSION < 0:
-            import warnings
             warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
                           "Install a C compiler and reinstall gensim for fast training.")
         self.neg_labels = []
@@ -1004,7 +1003,6 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
 
         """
         if FAST_VERSION < 0:
-            import warnings
             warnings.warn("C extension compilation failed, scoring will be slow. "
                           "Install a C compiler and reinstall gensim for fastness.")
 

From d09442cb984defff7245d84cd4ca3ed8c12461b9 Mon Sep 17 00:00:00 2001
From: Chinmaya Pancholi
Date: Fri, 24 Mar 2017 14:02:57 +0530
Subject: [PATCH 3/8] updated warning message for trim_rule

---
 gensim/models/word2vec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 6659e222e6..95db588e4d 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -424,7 +424,7 @@ def __init__(
         in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count).
         Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and
         returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`.
-        Note: The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part
+        Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
         of the model.
 
         `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before
@@ -480,7 +480,7 @@ def __init__(
 
         else :
            if trim_rule is not None :
-                logger.warning("The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part of the model. ")
+                logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ")
 
            logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored." )
 

From 73b7e2fa11c25f2932d3dd06bc97414870cc97c4 Mon Sep 17 00:00:00 2001
From: Chinmaya Pancholi
Date: Mon, 27 Mar 2017 20:51:49 +0530
Subject: [PATCH 4/8] added wrapper class for lsimodel

---
 gensim/models/word2vec.py                      |  36 +-----
 .../sklearn_wrapper_gensim_lsimodel.py         | 104 ++++++++++++++++++
 2 files changed, 110 insertions(+), 30 deletions(-)
 create mode 100644 gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 95db588e4d..000eee6976 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -108,9 +108,8 @@
 import itertools
 import warnings
 
-import operator
-
 from gensim.utils import keep_vocab_item, call_on_class_only
+from gensim.utils import keep_vocab_item
 from gensim.models.keyedvectors import KeyedVectors, Vocab
 
 try:
@@ -362,7 +361,7 @@ class Word2Vec(utils.SaveLoad):
 
     """
     def __init__(
-            self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab = None,
+            self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
             max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
             sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
             trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH):
@@ -424,7 +423,7 @@ def __init__(
         in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count).
         Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and
         returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`.
-        Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
+        Note: The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part
         of the model.
 
         `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before
@@ -457,7 +456,6 @@ def __init__(
         self.seed = seed
         self.random = random.RandomState(seed)
         self.min_count = min_count
-        self.max_vocab = max_vocab
         self.sample = sample
         self.workers = int(workers)
         self.min_alpha = float(min_alpha)
@@ -480,7 +478,7 @@ def __init__(
 
         else :
            if trim_rule is not None :
-                logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ")
+                logger.warning("The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part of the model. ")
 
            logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored." )
 
@@ -551,11 +549,6 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_
 
         """
         self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
-
-        # min_count_actual =
-        self.compute_min_count(self.raw_vocab, self.max_vocab)
-        # print min_count_actual
-
         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
 
@@ -589,25 +582,6 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
 
-    def compute_min_count(self, vocab1, max_vocab1):
-        vocab_sorted_by_count = sorted(vocab1.items(), key=operator.itemgetter(1), reverse=True)
-        print vocab_sorted_by_count
-        word_counts = zeros(len(vocab_sorted_by_count))
-        for x in range(len(vocab_sorted_by_count)):
-            if x == 0:
-                word_counts[x] = (vocab_sorted_by_count[x][1])
-            else:
-                word_counts[x] = (vocab_sorted_by_count[x][1] + word_counts[x-1])
-
-        print word_counts
-
-        index1 = word_counts.searchsorted(max_vocab1)
-        print index1
-        if index1 != len(word_counts):
-            print vocab_sorted_by_count[index1+1][1]
-        else :
-            print 0
-
     def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, update=False):
         """
         Apply vocabulary settings for `min_count` (discarding less-frequent words)
@@ -801,6 +775,7 @@ def train(self, sentences, total_words=None, word_count=0,
         if (self.model_trimmed_post_training):
             raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
         if FAST_VERSION < 0:
+            import warnings
             warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
                           "Install a C compiler and reinstall gensim for fast training.")
         self.neg_labels = []
@@ -1003,6 +978,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
 
         """
         if FAST_VERSION < 0:
+            import warnings
             warnings.warn("C extension compilation failed, scoring will be slow. "
                           "Install a C compiler and reinstall gensim for fastness.")
 
diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py
new file mode 100644
index 0000000000..4b72e27547
--- /dev/null
+++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2011 Radim Rehurek
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+#
+"""
+Scikit-learn interface for gensim, for easy use of gensim models with scikit-learn.
+Follows scikit-learn API conventions.
+"""
+import numpy as np
+
+from gensim import models
+from gensim import matutils
+from scipy import sparse
+from sklearn.base import TransformerMixin, BaseEstimator
+
+# accuracy defaults for the multi-pass stochastic algo
+P2_EXTRA_DIMS = 100  # set to `None` for dynamic P2_EXTRA_DIMS=k
+P2_EXTRA_ITERS = 2
+
+class SklearnWrapperLsiModel(models.LsiModel, TransformerMixin, BaseEstimator):
+    """
+    Base LSI module
+    """
+
+    def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
+                 decay=1.0, onepass=True, power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS):
+        """
+        Sklearn wrapper for LSI model. Class derived from gensim.models.LsiModel.
+        """
+        self.corpus = corpus
+        self.num_topics = num_topics
+        self.id2word = id2word
+        self.chunksize = chunksize
+        self.decay = decay
+        self.onepass = onepass
+        self.extra_samples = extra_samples
+        self.power_iters = power_iters
+
+        # if 'fit' function is not used, then 'corpus' is given in init
+        if self.corpus:
+            models.LsiModel.__init__(self, corpus=self.corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
+                                     decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples)
+
+    def get_params(self, deep=True):
+        """
+        Returns all parameters as a dictionary.
+        """
+        return {"corpus": self.corpus, "num_topics": self.num_topics, "id2word": self.id2word,
+                "chunksize": self.chunksize, "decay": self.decay, "onepass": self.onepass,
+                "extra_samples": self.extra_samples, "power_iters": self.power_iters}
+
+    def set_params(self, **parameters):
+        """
+        Set all parameters.
+        """
+        for parameter, value in parameters.items():
+            setattr(self, parameter, value)
+        return self
+
+    def fit(self, X, y=None):
+        """
+        For fitting corpus into the class object.
+        Calls gensim.models.LsiModel:
+        >>> gensim.models.LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word, chunksize=chunksize, decay=decay, onepass=onepass, power_iters=power_iters, extra_samples=extra_samples)
+        """
+        if sparse.issparse(X):
+            self.corpus = matutils.Sparse2Corpus(X)
+        else:
+            self.corpus = X
+
+        models.LsiModel.__init__(self, corpus=self.corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
+                                 decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples)
+        return self
+
+    def transform(self, docs):
+        """
+        Takes a list of documents as input ('docs').
+        Returns a matrix of topic distributions for the given document bow, where a_ij
+        indicates (topic_i, topic_probability_j).
+ """ + # The input as array of array + check = lambda x: [x] if isinstance(x[0], tuple) else x + docs = check(docs) + X = [[] for i in range(0,len(docs))]; + for k,v in enumerate(docs): + doc_topics = self[v] + print doc_topics + probs_docs = list(map(lambda x: x[1], doc_topics)) + # Everything should be equal in length + if len(probs_docs) != self.num_topics: + probs_docs.extend([1e-12]*(self.num_topics - len(probs_docs))) + X[k] = probs_docs + probs_docs = [] + return np.reshape(np.array(X), (len(docs), self.num_topics)) + + def partial_fit(self, X): + """ + Train model over X. + """ + if sparse.issparse(X): + X = matutils.Sparse2Corpus(X) + self.add_documents(corpus=X) \ No newline at end of file From 0a9ca6a76aca17142cd563666ffab9a217bb1210 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 27 Mar 2017 21:00:35 +0530 Subject: [PATCH 5/8] removed unnecessary print statement --- gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index 4b72e27547..11da5065dc 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -86,7 +86,6 @@ def transform(self, docs): X = [[] for i in range(0,len(docs))]; for k,v in enumerate(docs): doc_topics = self[v] - print doc_topics probs_docs = list(map(lambda x: x[1], doc_topics)) # Everything should be equal in length if len(probs_docs) != self.num_topics: From b9ef3609b3e366aa4d463d04b517781c43953b85 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 28 Mar 2017 01:32:49 +0530 Subject: [PATCH 6/8] added tests for lsi wrapper --- gensim/test/test_sklearn_integration.py | 55 ++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 3a6401962b..ff254da930 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -11,6 +11,7 @@ from sklearn.datasets import load_files from sklearn import linear_model from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel +from gensim.sklearn_integration.sklearn_wrapper_gensim_lsimodel import SklearnWrapperLsiModel from gensim.corpora import Dictionary from gensim import matutils @@ -55,7 +56,7 @@ def testTransform(self): X = self.model.transform(bow) self.assertTrue(X.shape[0], 3) self.assertTrue(X.shape[1], self.model.num_topics) - + def testGetTopicDist(self): texts_new = ['graph','eulerian'] bow = self.model.id2word.doc2bow(texts_new) @@ -97,7 +98,7 @@ def testPipeline(self): compressed_content = f.read() uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) - data = cache + data = cache id2word=Dictionary(map(lambda x : x.split(), data.data)) corpus = [id2word.doc2bow(i.split()) for i in data.data] rand = numpy.random.mtrand.RandomState(1) # set seed for getting same result @@ -107,5 +108,55 @@ def testPipeline(self): score = text_lda.score(corpus, data.target) self.assertGreater(score, 0.50) +class TestSklearnLSIWrapper(unittest.TestCase): + def setUp(self): + self.model = SklearnWrapperLsiModel(id2word=dictionary, num_topics=2) + self.model.fit(corpus) + + def testPrintTopic(self): + topic = self.model.print_topics(2) + for k, v in topic: + self.assertTrue(isinstance(v, 
six.string_types)) + self.assertTrue(isinstance(k, int)) + + def testTransform(self): + texts_new = ['graph','eulerian'] + bow = self.model.id2word.doc2bow(texts_new) + X = self.model.transform(bow) + self.assertTrue(X.shape[0], 1) + self.assertTrue(X.shape[1], self.model.num_topics) + texts_new = [['graph','eulerian'],['server', 'flow'], ['path', 'system']] + bow = [] + for i in texts_new: + bow.append(self.model.id2word.doc2bow(i)) + X = self.model.transform(bow) + self.assertTrue(X.shape[0], 3) + self.assertTrue(X.shape[1], self.model.num_topics) + + def testPartialFit(self): + for i in range(10): + self.model.partial_fit(X=corpus) # fit against the model again + doc=list(corpus)[0] # transform only the first document + transformed = self.model[doc] + transformed_approx = matutils.sparse2full(transformed, 2) # better approximation + expected=[1.39, 0.0] + passed = numpy.allclose(sorted(transformed_approx), sorted(expected), atol=1e-1) + self.assertTrue(passed) + + def testPipeline(self): + model = SklearnWrapperLsiModel(num_topics=2) + with open(datapath('mini_newsgroup'),'rb') as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') + cache = pickle.loads(uncompressed_content) + data = cache + id2word=Dictionary(map(lambda x : x.split(), data.data)) + corpus = [id2word.doc2bow(i.split()) for i in data.data] + clf=linear_model.LogisticRegression(penalty='l2', C=0.1) + text_lda = Pipeline((('features', model,), ('classifier', clf))) + text_lda.fit(corpus, data.target) + score = text_lda.score(corpus, data.target) + self.assertGreater(score, 0.50) + if __name__ == '__main__': unittest.main() From a36626127353f413b739365054e77b2c15f209c7 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 28 Mar 2017 22:52:34 +0530 Subject: [PATCH 7/8] changed name from testPrintTopic to testModelSanity and made defaults explicit --- .../sklearn_integration/sklearn_wrapper_gensim_lsimodel.py | 6 +----- gensim/test/test_sklearn_integration.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index 11da5065dc..753cbaf899 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -15,17 +15,13 @@ from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator -# accuracy defaults for the multi-pass stochastic algo -P2_EXTRA_DIMS = 100 # set to `None` for dynamic P2_EXTRA_DIMS=k -P2_EXTRA_ITERS = 2 - class SklearnWrapperLsiModel(models.LsiModel, TransformerMixin, BaseEstimator): """ Base LSI module """ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, - decay=1.0, onepass=True, power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS): + decay=1.0, onepass=True, power_iters=2, extra_samples=100): """ Sklearn wrapper for LSI model. Class derived from gensim.model.LsiModel. 
""" diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index ff254da930..2f5497550a 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -113,7 +113,7 @@ def setUp(self): self.model = SklearnWrapperLsiModel(id2word=dictionary, num_topics=2) self.model.fit(corpus) - def testPrintTopic(self): + def testModelSanity(self): topic = self.model.print_topics(2) for k, v in topic: self.assertTrue(isinstance(v, six.string_types)) From e74d8e71f948da26068e73cef2a75a2785bd4e40 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 29 Mar 2017 00:25:01 +0530 Subject: [PATCH 8/8] added pipeline example for LsiModel --- docs/notebooks/sklearn_wrapper.ipynb | 136 ++++++++++++++++++++------- 1 file changed, 104 insertions(+), 32 deletions(-) diff --git a/docs/notebooks/sklearn_wrapper.ipynb b/docs/notebooks/sklearn_wrapper.ipynb index 0d28429ecf..e98047dedc 100644 --- a/docs/notebooks/sklearn_wrapper.ipynb +++ b/docs/notebooks/sklearn_wrapper.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This tutorial is about using gensim models as a part of your scikit learn workflow with the help of wrappers found at ```gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel```" + "This tutorial is about using gensim models as a part of your scikit learn workflow with the help of wrappers found at ```gensim.sklearn_integration```" ] }, { @@ -19,7 +19,9 @@ "metadata": {}, "source": [ "The wrapper available (as of now) are :\n", - "* LdaModel (```gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel.SklearnWrapperLdaModel```),which implements gensim's ```LdaModel``` in a scikit-learn interface" + "* LdaModel (```gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel.SklearnWrapperLdaModel```),which implements gensim's ```LdaModel``` in a scikit-learn interface\n", + "\n", + "* LsiModel (```gensim.sklearn_integration.sklearn_wrapper_gensim_lsiModel.SklearnWrapperLsiModel```),which implements gensim's ```LsiModel``` in a scikit-learn interface" ] }, { @@ -38,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -56,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -85,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -111,7 +113,7 @@ " [ 0.84210373, 0.15789627]])" ] }, - "execution_count": 22, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -129,7 +131,7 @@ "collapsed": true }, "source": [ - "### Integration with Sklearn" + "#### Integration with Sklearn" ] }, { @@ -141,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -157,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -179,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -202,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -211,18 +213,18 @@ "data": { "text/plain": [ "[(0,\n", - " u'0.085*\"abroad\" + 0.053*\"ciphertext\" + 0.042*\"arithmetic\" + 0.037*\"facts\" + 0.031*\"courtesy\" + 0.025*\"amolitor\" + 0.023*\"argue\" + 0.021*\"asking\" + 0.020*\"agree\" + 
0.018*\"classified\"'),\n", + " u'0.025*\"456\" + 0.021*\"argue\" + 0.016*\"bitnet\" + 0.015*\"beastmaster\" + 0.014*\"cryptography\" + 0.013*\"false\" + 0.012*\"digex\" + 0.011*\"cover\" + 0.011*\"classified\" + 0.010*\"disk\"'),\n", " (1,\n", - " u'0.098*\"asking\" + 0.075*\"cryptography\" + 0.068*\"abroad\" + 0.033*\"456\" + 0.025*\"argue\" + 0.022*\"bitnet\" + 0.017*\"false\" + 0.014*\"digex\" + 0.014*\"effort\" + 0.013*\"disk\"'),\n", + " u'0.142*\"abroad\" + 0.113*\"asking\" + 0.088*\"cryptography\" + 0.044*\"ciphertext\" + 0.043*\"arithmetic\" + 0.032*\"courtesy\" + 0.030*\"facts\" + 0.021*\"argue\" + 0.019*\"amolitor\" + 0.018*\"agree\"'),\n", " (2,\n", - " u'0.023*\"accurate\" + 0.021*\"corporate\" + 0.013*\"clark\" + 0.012*\"chance\" + 0.009*\"consideration\" + 0.008*\"authentication\" + 0.008*\"dawson\" + 0.008*\"candidates\" + 0.008*\"basically\" + 0.008*\"assess\"'),\n", + " u'0.034*\"certain\" + 0.027*\"69\" + 0.025*\"book\" + 0.025*\"demand\" + 0.024*\"87\" + 0.024*\"cracking\" + 0.021*\"farm\" + 0.019*\"fierkelab\" + 0.015*\"face\" + 0.011*\"abroad\"'),\n", " (3,\n", - " u'0.016*\"cryptography\" + 0.007*\"evans\" + 0.006*\"considering\" + 0.006*\"forgot\" + 0.006*\"built\" + 0.005*\"constitutional\" + 0.005*\"fly\" + 0.004*\"cellular\" + 0.004*\"computed\" + 0.004*\"digitized\"'),\n", + " u'0.017*\"decipher\" + 0.017*\"example\" + 0.016*\"cases\" + 0.016*\"follow\" + 0.008*\"considering\" + 0.006*\"forgot\" + 0.006*\"cellular\" + 0.005*\"evans\" + 0.005*\"computed\" + 0.005*\"cia\"'),\n", " (4,\n", - " u'0.028*\"certain\" + 0.022*\"69\" + 0.021*\"book\" + 0.020*\"demand\" + 0.020*\"cracking\" + 0.020*\"87\" + 0.017*\"farm\" + 0.017*\"fierkelab\" + 0.015*\"face\" + 0.009*\"constitutional\"')]" + " u'0.022*\"accurate\" + 0.021*\"corporate\" + 0.013*\"chance\" + 0.012*\"clark\" + 0.009*\"consideration\" + 0.009*\"candidates\" + 0.008*\"dawson\" + 0.008*\"authentication\" + 0.008*\"assess\" + 0.008*\"attempt\"')]" ] }, - "execution_count": 26, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -239,12 +241,12 @@ "collapsed": true }, "source": [ - "### Example for Using Grid Search" + "#### Example for Using Grid Search" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -256,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 9, "metadata": { "collapsed": true }, @@ -269,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 10, "metadata": { "collapsed": false }, @@ -280,16 +282,16 @@ "GridSearchCV(cv=5, error_score='raise',\n", " estimator=SklearnWrapperLdaModel(alpha='symmetric', chunksize=2000, corpus=None,\n", " decay=0.5, eta=None, eval_every=10, gamma_threshold=0.001,\n", - " id2word=,\n", + " id2word=,\n", " iterations=50, minimum_probability=0.01, num_topics=5,\n", " offset=1.0, passes=20, random_state=None, update_every=1),\n", " fit_params={}, iid=True, n_jobs=1,\n", " param_grid={'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", - " scoring=, verbose=0)" + " scoring=, verbose=0)" ] }, - "execution_count": 32, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -303,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 11, "metadata": { "collapsed": false }, @@ -311,10 +313,10 @@ { "data": { "text/plain": [ - "{'iterations': 50, 'num_topics': 3}" + "{'iterations': 20, 'num_topics': 3}" 
] }, - "execution_count": 33, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -327,14 +329,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Example of Using Pipeline" + "#### Example of Using Pipeline" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 12, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -350,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 13, "metadata": { "collapsed": false }, @@ -362,7 +364,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -396,6 +398,76 @@ "print_features_pipe(pipe, id2word.values())\n", "print pipe.score(corpus, data.target)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### LsiModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use LsiModel begin with importing LsiModel wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_integration.sklearn_wrapper_gensim_lsimodel import SklearnWrapperLsiModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.13652819 0.00383696 0.02635504 -0.08454895 -0.02356143 0.60020084\n", + " 1.07026252 -0.04072257 0.43732847 0.54913549 -0.20242834 -0.21855402\n", + " -1.30546283 -0.08690711 0.17606255]\n", + "Positive features: 01101001B:1.07 comp.org.eff.talk.:0.60 red@redpoll.neoucom.edu:0.55 circuitry:0.44 >Pat:0.18 Fame.:0.14 Fame,:0.03 considered,:0.00\n", + "Negative features: internet...:-1.31 trawling:-0.22 hanging:-0.20 dome.:-0.09 Keach:-0.08 *best*:-0.04 comp.org.eff.talk,:-0.02\n", + "0.865771812081\n" + ] + } + ], + "source": [ + "model=SklearnWrapperLsiModel(num_topics=15, id2word=id2word)\n", + "clf=linear_model.LogisticRegression(penalty='l2', C=0.1) #l2 penalty used\n", + "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe.fit(corpus, data.target)\n", + "print_features_pipe(pipe, id2word.values())\n", + "print pipe.score(corpus, data.target)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": {
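
A minimal usage sketch of the estimator API this series adds (patch 4) and exercises in the tests (patch 6). It assumes the gensim.sklearn_integration package layout introduced above; the toy corpus is illustrative only, not taken from the PR.

    from gensim.corpora import Dictionary
    from gensim.sklearn_integration.sklearn_wrapper_gensim_lsimodel import SklearnWrapperLsiModel

    # toy corpus (illustrative); any iterable of tokenized texts works
    texts = [
        ['human', 'interface', 'computer'],
        ['graph', 'minors', 'trees'],
        ['graph', 'flow', 'path', 'system'],
    ]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # fit() accepts a bag-of-words corpus (or a scipy sparse matrix), like any sklearn estimator
    model = SklearnWrapperLsiModel(id2word=dictionary, num_topics=2)
    model.fit(corpus)

    # transform() takes a single BoW document or a list of them and returns a
    # (num_docs, num_topics) numpy array of LSI topic weights
    print(model.transform(dictionary.doc2bow(['graph', 'trees'])))

    # partial_fit() folds further documents into the existing decomposition
    model.partial_fit(corpus)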
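
The Pipeline workflow that patch 8's notebook demonstrates, sketched under the same assumptions; `dictionary` and `corpus` are the toy objects from the previous snippet, and the target labels here are invented purely for illustration.

    from sklearn import linear_model
    from sklearn.pipeline import Pipeline

    clf = linear_model.LogisticRegression(penalty='l2', C=0.1)  # l2 penalty, as in the notebook
    pipe = Pipeline([
        ('features', SklearnWrapperLsiModel(id2word=dictionary, num_topics=2)),
        ('classifier', clf),
    ])
    labels = [0, 1, 1]  # illustrative targets, one per toy document
    pipe.fit(corpus, labels)
    print(pipe.score(corpus, labels))  # mean accuracy on the toy training data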