diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb
new file mode 100644
index 0000000000..7380a87f71
--- /dev/null
+++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb
@@ -0,0 +1,183 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "####\n"
+     ]
+    }
+   ],
+   "source": [
+    "import logging\n",
+    "from gensim.models import EnsembleLda, LdaMulticore\n",
+    "from gensim.corpora import OpinosisCorpus\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Enable the ensemble logger to show what it is currently doing:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "elda_logger = logging.getLogger(EnsembleLda.__module__)\n",
+    "elda_logger.setLevel(logging.INFO)\n",
+    "elda_logger.addHandler(logging.StreamHandler())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pretty_print_topics():\n",
+    "    # note that the words are stemmed, so they appear chopped off\n",
+    "    for t in elda.print_topics(num_words=7):\n",
+    "        print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Experiments on the Opinosis Dataset\n",
+    "\n",
+    "Opinosis [1] is a small (but redundant) corpus that contains 289 product reviews for 51 products. Since it's so small, the results are rather unstable.\n",
+    "\n",
+    "[1] Kavita Ganesan, ChengXiang Zhai, and Jiawei Han, _Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions [online],_ Proceedings of the 23rd International Conference on Computational Linguistics, Association for Computational Linguistics, 2010, pp. 340–348. Available from: https://kavita-ganesan.com/opinosis/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preparing the corpus\n",
+    "\n",
+    "First, download the Opinosis dataset. On Linux it can be done like this, for example:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir ~/opinosis\n",
+    "!wget -P ~/opinosis https://github.com/kavgan/opinosis/raw/master/OpinosisDataset1.0_0.zip\n",
+    "!unzip ~/opinosis/OpinosisDataset1.0_0.zip -d ~/opinosis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = os.path.expanduser('~/opinosis/')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The corpus and id2word mapping can be created using the OpinosisCorpus class provided in the package.\n",
+    "It preprocesses the data using the PorterStemmer and the stopword list that ship with Gensim.\n",
+    "\n",
+    "Its parameter is the path (relative or absolute) to the folder into which the zip file was extracted. That folder contains a 'summaries-gold' subfolder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "opinosis = OpinosisCorpus(path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**parameters**\n",
+    "\n",
+    "**topic_model_class**: 'ldamulticore' is highly recommended for EnsembleLda. **ensemble_workers** and **distance_workers** are used to reduce the time needed to train the models, as is the **masking_method** 'rank'. ldamulticore cannot fully utilize all cores on such a small corpus, so **ensemble_workers** can be set to 3 to reach 95-100% CPU usage on my i5 3470.\n",
+    "\n",
+    "Since the corpus is so small, a high number of **num_models** is needed to extract stable topics. The Opinosis corpus contains 51 categories; however, some of them are quite similar. For example, there are 3 categories about the batteries of portable products, and there are multiple categories about cars. So I chose 20 for **num_topics**, which is smaller than the number of categories."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n",
+    "                   passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n",
+    "                   topic_model_class='ldamulticore', masking_method='rank')\n",
+    "pretty_print_topics()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The default for **min_samples** would be 64 (half the number of models), and **eps** would be 0.1. Experiment with these parameters until you find a sweet spot that fits your needs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "elda.recluster(min_samples=55, eps=0.14)\n",
+    "pretty_print_topics()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index 6218336b06..996a6079d9 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -22,6 +22,7 @@ Modules:
    corpora/malletcorpus
    corpora/mmcorpus
    corpora/_mmreader
+   corpora/opinosiscorpus
    corpora/sharded_corpus
    corpora/svmlightcorpus
    corpora/textcorpus
@@ -29,6 +30,7 @@ Modules:
    corpora/wikicorpus
    models/ldamodel
    models/ldamulticore
+   models/ensemblelda
    models/nmf
    models/lsimodel
    models/ldaseqmodel
diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst
index 5566611a8b..62dba3d50a 100644
--- a/docs/src/auto_examples/index.rst
+++ b/docs/src/auto_examples/index.rst
@@ -13,7 +13,7 @@ If you're thinking about contributing documentation, please see :ref:`sphx_glr_a

 .. raw:: html

-
+
@@ -33,9 +33,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png - :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` + :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` .. raw:: html @@ -53,9 +53,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png - :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` + :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` .. raw:: html @@ -73,9 +73,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png - :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` + :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` .. raw:: html @@ -93,9 +93,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png - :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` + :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` .. raw:: html @@ -108,7 +108,7 @@ Understanding this functionality is vital for using gensim effectively. /auto_examples/core/run_similarity_queries .. raw:: html -
+
@@ -127,9 +127,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` + :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` .. raw:: html @@ -147,9 +147,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` + :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` .. raw:: html @@ -167,9 +167,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -187,9 +187,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -207,9 +207,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -227,9 +227,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py` + :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py` .. raw:: html @@ -247,9 +247,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -267,9 +267,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py` + :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py` .. raw:: html @@ -287,9 +287,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png + .. 
figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py` + :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py` .. raw:: html @@ -302,7 +302,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod /auto_examples/tutorials/run_pivoted_doc_norm .. raw:: html -
+
@@ -321,9 +321,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` + :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` .. raw:: html @@ -341,9 +341,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc.py` .. raw:: html @@ -361,9 +361,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` .. raw:: html @@ -381,9 +381,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` + :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` .. raw:: html @@ -396,7 +396,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u /auto_examples/howtos/run_compare_lda .. raw:: html -
+
@@ -440,7 +440,7 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. raw:: html -
+
@@ -452,13 +452,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. container:: sphx-glr-download - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` .. container:: sphx-glr-download - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/corpora/opinosiscorpus.rst b/docs/src/corpora/opinosiscorpus.rst new file mode 100644 index 0000000000..3f62454677 --- /dev/null +++ b/docs/src/corpora/opinosiscorpus.rst @@ -0,0 +1,9 @@ +:mod:`corpora.opinosiscorpus` -- Topic related review sentences +=============================================================== + +.. automodule:: gensim.corpora.opinosiscorpus + :synopsis: Topic related review sentences + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/models/ensemblelda.rst b/docs/src/models/ensemblelda.rst new file mode 100644 index 0000000000..42e63c94be --- /dev/null +++ b/docs/src/models/ensemblelda.rst @@ -0,0 +1,9 @@ +:mod:`models.ensembelda` -- Ensemble Latent Dirichlet Allocation +================================================================ + +.. automodule:: gensim.models.ensemblelda + :synopsis: Ensemble Latent Dirichlet Allocation + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index 0d51a9b903..7b7044e0b9 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -15,3 +15,4 @@ from .textcorpus import TextCorpus, TextDirectoryCorpus # noqa:F401 from .ucicorpus import UciCorpus # noqa:F401 from .malletcorpus import MalletCorpus # noqa:F401 +from .opinosiscorpus import OpinosisCorpus # noqa:F401 diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py new file mode 100644 index 0000000000..1a68ec670c --- /dev/null +++ b/gensim/corpora/opinosiscorpus.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Tobias B + + +"""Creates a corpus and dictionary from the Opinosis dataset. + +References +---------- +.. [1] Ganesan, Kavita and Zhai, ChengXiang and Han, Jiawei. Opinosis: a graph-based approach to abstractive + summarization of highly redundant opinions [online]. In : Proceedings of the 23rd International Conference on + Computational Linguistics. 2010. p. 340-348. Available from: https://kavita-ganesan.com/opinosis/ +""" + +import os +import re +from gensim.corpora import Dictionary +from gensim.parsing.porter import PorterStemmer +from gensim.parsing.preprocessing import STOPWORDS + + +class OpinosisCorpus(): + """Creates a corpus and dictionary from the Opinosis dataset. + + http://kavita-ganesan.com/opinosis-opinion-dataset/ + + This data is organized in folders, each folder containing a few short docs. + + Data can be obtained quickly using the following commands in bash: + + mkdir opinosis && cd opinosis + wget https://github.com/kavgan/opinosis/raw/master/OpinosisDataset1.0_0.zip + unzip OpinosisDataset1.0_0.zip + + corpus and dictionary can be accessed by using the .corpus and .id2word members + + """ + + def __init__(self, path): + """Load the downloaded corpus. + + Parameters + ---------- + path : string + Path to the extracted zip file. 
+        """
+        path = os.path.join(path, "summaries-gold")
+        dictionary = Dictionary()
+        corpus = []
+        stemmer = PorterStemmer()
+
+        for directory, _dirnames, filenames in os.walk(path):
+            # each subdirectory of path is one collection of reviews about a specific product
+            for filename in filenames:
+                filepath = os.path.join(directory, filename)
+                # read the document; each file contains the review sentences for one topic
+                with open(filepath) as file:
+                    doc = file.read()
+
+                # lowercase, tokenize on word characters, remove stopwords, then stem
+                preprocessed_doc = [
+                    stemmer.stem(token) for token in re.findall(r'\w+', doc.lower())
+                    if token not in STOPWORDS
+                ]
+
+                dictionary.add_documents([preprocessed_doc])
+                corpus += [dictionary.doc2bow(preprocessed_doc)]
+
+        # store the results the same way the other corpus generating functions do
+        self.corpus = corpus
+        self.id2word = dictionary
diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py
index a0ee690550..507ec887b9 100644
--- a/gensim/models/__init__.py
+++ b/gensim/models/__init__.py
@@ -21,6 +21,7 @@
 from .ldaseqmodel import LdaSeqModel  # noqa:F401
 from .fasttext import FastText  # noqa:F401
 from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix  # noqa:F401
+from .ensemblelda import EnsembleLda  # noqa:F401
 from . import wrappers  # noqa:F401
 from . import deprecated  # noqa:F401
diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
new file mode 100644
index 0000000000..e793a9edcc
--- /dev/null
+++ b/gensim/models/ensemblelda.py
@@ -0,0 +1,1289 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Authors: Tobias Brigl, Alex Salles, Alex Loosley, Data Reply Munich
+#
+
+"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
+
+Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
+the user does not need to know the exact number of topics the topic model should extract ahead of time.
+
+For more details read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/).
+
+Usage examples
+--------------
+
+Train an ensemble of LdaModels using a Gensim corpus:
+
+.. sourcecode:: pycon
+
+    >>> from gensim.test.utils import common_texts
+    >>> from gensim.corpora.dictionary import Dictionary
+    >>> from gensim.models import EnsembleLda
+    >>>
+    >>> # Create a corpus from a list of texts
+    >>> common_dictionary = Dictionary(common_texts)
+    >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
+    >>>
+    >>> # Train the model on the corpus. corpus has to be provided as a
+    >>> # keyword argument, as it is passed through to the children.
+    >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=10, num_models=4)
+
+Save a model to disk, or reload a pre-trained model:
+
+.. sourcecode:: pycon
+
+    >>> from gensim.test.utils import datapath
+    >>>
+    >>> # Save model to disk.
+    >>> temp_file = datapath("model")
+    >>> elda.save(temp_file)
+    >>>
+    >>> # Load a potentially pretrained model from disk.
+    >>> elda = EnsembleLda.load(temp_file)
+
+Query the model using new, unseen documents:
+
+.. sourcecode:: pycon
+
+    >>> # Create a new corpus, made of previously unseen documents.
+    >>> other_texts = [
+    ...     ['computer', 'time', 'graph'],
+    ...     ['survey', 'response', 'eps'],
+    ...     ['human', 'system', 'computer']
+    ... 
]
+    >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
+    >>>
+    >>> unseen_doc = other_corpus[0]
+    >>> vector = elda[unseen_doc]  # get topic probability distribution for a document
+
+Increase the ensemble size by adding a new model. Make sure it uses the same dictionary:
+
+.. sourcecode:: pycon
+
+    >>> from gensim.models import LdaModel
+    >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10))
+    >>> elda.recluster()
+    >>> vector = elda[unseen_doc]
+
+To optimize the ensemble for your specific case, the children can be clustered again using
+different hyperparameters:
+
+.. sourcecode:: pycon
+
+    >>> elda.recluster(eps=0.2)
+
+References
+----------
+.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic modelling with large corpora. In : THE LREC
+   2010 WORKSHOP ON NEW CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. 2010. p. 45-50.
+   Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595
+
+.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis].
+   Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from:
+   https://www.hip70890b.de/machine_learning/ensemble_LDA/
+
+Citation
+--------
+At the moment, there is no paper associated with ensemble LDA, but we are working on publicly releasing Tobi Brigl's
+bachelor thesis on the topic. In the meantime, please include a mention of us (Brigl, T.; Salles, A.; Loosley, A.) and
+a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if this
+work is presented, further developed, or used as the basis for a published result.
+
+Other Notes
+-----------
+The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the docstrings and
+comments.
+
+"""
+import logging
+import os
+from multiprocessing import Process, Pipe, ProcessError
+import importlib
+
+import numpy as np
+from scipy.spatial.distance import cosine
+
+from gensim import utils
+from gensim.models import ldamodel, ldamulticore, basemodel
+from gensim.utils import SaveLoad
+
+logger = logging.getLogger(__name__)
+
+
+class EnsembleLda(SaveLoad):
+    """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
+
+    Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit
+    that the user does not need to know the exact number of topics the topic model should extract ahead of time [2].
+
+    """
+
+    def __init__(self, topic_model_class="ldamulticore", num_models=3,
+                 min_cores=None,  # default value from _generate_stable_topics()
+                 epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True,
+                 min_samples=None, masking_method="mass", masking_threshold=None,
+                 distance_workers=1, random_state=None, **gensim_kw_args):
+        """Create and train a new EnsembleLda model.
+
+        Will start training immediately, except if iterations, passes or num_models is 0 or if the corpus is missing.
+
+        Parameters
+        ----------
+        topic_model_class : str, topic model, optional
+            Examples:
+                * 'ldamulticore' (default, recommended)
+                * 'lda'
+        num_models : int, optional
+            How many LDA models to train in this ensemble.
+            Default: 3
+        min_cores : int, optional
+            Minimum cores a cluster of topics has to contain so that it is recognized as a stable topic.
+        epsilon : float, optional
+            Defaults to 0.1. Epsilon for the CBDBSCAN clustering that generates the stable topics.
+        ensemble_workers : int, optional
+            Spawns that many processes and distributes the models from the ensemble to those as evenly as possible.
+            num_models should be a multiple of ensemble_workers.
+
+            Setting it to 0 or 1 will both use the non-multiprocessing version. Default: 1
+        memory_friendly_ttda : boolean, optional
+            If True, the models in the ensemble are deleted after training and only a concatenation of each model's
+            topic term distribution (called ttda) is kept to save memory. In that case, any topic term matrix can
+            also be supplied to add_model.
+
+            Defaults to True. When False, trained models are stored in a list in self.tms, and only models of a
+            gensim model type can be added to this ensemble using the add_model function.
+        min_samples : int, optional
+            Required number of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering.
+        masking_method : str, optional
+            Choose one of "mass" (default) or "rank" (percentile, faster).
+
+            For clustering, distances between topic-term distributions are asymmetric. In particular, the distance
+            (technically a divergence) from distribution A to B is more of a measure of if A is contained in B. At a
+            high level, this involves using distribution A to mask distribution B and then calculating the cosine
+            distance between the two. The masking can be done in two ways:
+
+            1. mass: forms the mask by taking the top ranked terms until their cumulative mass reaches the
+            'masking_threshold'
+
+            2. rank: forms the mask by taking the top ranked terms (by mass) until the 'masking_threshold' is
+            reached. For example, a ranking threshold of 0.11 means the top 11% of terms by weight are used to form
+            the mask.
+
+        masking_threshold : float, optional
+            Default: None, which uses ``0.95`` for "mass", and ``0.11`` for masking_method "rank". In general, too
+            small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy
+            distance calculations. The defaults are often a good sweet spot for this hyperparameter.
+        distance_workers : int, optional
+            Default is 1, which is not multiprocessed. Set to ``> 1`` to enable multiprocessing. When
+            ``distance_workers`` is ``None``, it defaults to ``os.cpu_count()`` for maximum performance.
+        **gensim_kw_args
+            Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble.
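+
+        A sketch of how these hyperparameters combine (``common_corpus`` and
+        ``common_dictionary`` are the toy objects from the module-level usage
+        examples; the values here are illustrative, not recommendations):
+
+        .. sourcecode:: pycon
+
+            >>> elda = EnsembleLda(
+            ...     corpus=common_corpus, id2word=common_dictionary,
+            ...     num_models=4, num_topics=10, topic_model_class='ldamulticore',
+            ...     masking_method='rank', masking_threshold=0.11,
+            ...     ensemble_workers=2, distance_workers=2,
+            ... )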
+
+        """
+        # INTERNAL PARAMETERS
+        # Set random state
+        # NumPy's max random state of 2**32 - 1 is too large for Windows:
+        self._MAX_RANDOM_STATE = np.iinfo(np.int32).max
+
+        # _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by
+        # skipping distance calculations for highly masked topic-term distributions
+        self._COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05
+
+        if "id2word" not in gensim_kw_args:
+            gensim_kw_args["id2word"] = None
+        if "corpus" not in gensim_kw_args:
+            gensim_kw_args["corpus"] = None
+
+        if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is not None:
+            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
+            gensim_kw_args["id2word"] = utils.dict_from_corpus(gensim_kw_args["corpus"])
+        if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is None:
+            raise ValueError(
+                "at least one of corpus/id2word must be specified, to establish "
+                "input space dimensionality. Corpus should be provided using the "
+                "`corpus` keyword argument.")
+
+        if isinstance(topic_model_class, type) and issubclass(topic_model_class, ldamodel.LdaModel):
+            self.topic_model_class = topic_model_class
+        else:
+            kinds = {
+                "lda": ldamodel.LdaModel,
+                "ldamulticore": ldamulticore.LdaMulticore
+            }
+            if topic_model_class not in kinds:
+                raise ValueError(
+                    "topic_model_class should be one of 'lda', 'ldamulticore' or a model inheriting from LdaModel")
+            self.topic_model_class = kinds[topic_model_class]
+
+        self.num_models = num_models
+        self.gensim_kw_args = gensim_kw_args
+
+        self.memory_friendly_ttda = memory_friendly_ttda
+
+        self.distance_workers = distance_workers
+        self.masking_threshold = masking_threshold
+        self.masking_method = masking_method
+
+        # this will provide the gensim API to the ensemble
+        self.classic_model_representation = None
+
+        # the ensemble's state
+        self.random_state = utils.get_random_state(random_state)
+        self.sstats_sum = 0
+        self.eta = None
+        self.tms = []
+        # initialize empty topic term distribution array
+        self.ttda = np.empty((0, len(gensim_kw_args["id2word"])))
+        self.asymmetric_distance_matrix_outdated = True
+
+        # in case the model will not train due to some
+        # parameters, stop here and don't train.
+        if num_models <= 0:
+            return
+        if gensim_kw_args.get("corpus") is None:
+            return
+        if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0:
+            return
+        if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0:
+            return
+
+        logger.info("generating {} topic models...".format(num_models))
+
+        if ensemble_workers > 1:
+            self._generate_topic_models_multiproc(num_models, ensemble_workers)
+        else:
+            # singlecore
+            self._generate_topic_models(num_models)
+
+        self._generate_asymmetric_distance_matrix()
+        self._generate_topic_clusters(epsilon, min_samples)
+        self._generate_stable_topics(min_cores)
+
+        # create model that can provide the usual gensim api to the stable topics from the ensemble
+        self.generate_gensim_representation()
+
+    def get_topic_model_class(self):
+        """Get the class that is used for :meth:`gensim.models.EnsembleLda.generate_gensim_representation`."""
+        if self.topic_model_class is None:
+            instruction = 'Try setting topic_model_class manually to what the individual models were based on, ' \
+                'e.g. LdaMulticore.'
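+            # save() persists only the module and class name strings rather than
+            # the class object itself (see save() below), so restore the class
+            # lazily here via importlib on first access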
+            try:
+                module = importlib.import_module(self.topic_model_module_string)
+                self.topic_model_class = getattr(module, self.topic_model_class_string)
+                del self.topic_model_module_string
+                del self.topic_model_class_string
+            except ModuleNotFoundError:
+                logger.error(
+                    'Could not import the "{}" module in order to provide the "{}" class as '
+                    '"topic_model_class" attribute. {}'
+                    .format(self.topic_model_module_string, self.topic_model_class_string, instruction))
+            except AttributeError:
+                logger.error(
+                    'Could not import the "{}" class from the "{}" module in order to set the '
+                    '"topic_model_class" attribute. {}'
+                    .format(self.topic_model_class_string, self.topic_model_module_string, instruction))
+        return self.topic_model_class
+
+    def save(self, *args, **kwargs):
+        """See :meth:`gensim.utils.SaveLoad.save`."""
+        if self.get_topic_model_class() is not None:
+            self.topic_model_module_string = self.topic_model_class.__module__
+            self.topic_model_class_string = self.topic_model_class.__name__
+        kwargs['ignore'] = frozenset(kwargs.get('ignore', ())).union(('topic_model_class', ))
+        super(EnsembleLda, self).save(*args, **kwargs)
+
+    save.__doc__ = SaveLoad.save.__doc__
+
+    def convert_to_memory_friendly(self):
+        """Remove the stored gensim models and only keep their ttdas."""
+        self.tms = []
+        self.memory_friendly_ttda = True
+
+    def generate_gensim_representation(self):
+        """Create a gensim model from the stable topics.
+
+        The returned representation is a Gensim LdaModel (:py:class:`gensim.models.LdaModel`) that has been
+        instantiated with an a-priori belief on word probability, eta, that represents the topic-term distributions
+        of any stable topics that were found by clustering over the ensemble of topic distributions.
+
+        When no stable topics have been detected, None is returned.
+
+        Returns
+        -------
+        :py:class:`gensim.models.LdaModel`
+            A Gensim LDA Model classic_model_representation for which:
+            ``classic_model_representation.get_topics() == self.get_topics()``
+
+        """
+        logger.info("generating classic gensim model representation based on results from the ensemble")
+
+        sstats_sum = self.sstats_sum
+        # if sstats_sum (which is actually the number of words in the corpus) is wrong
+        # for some reason, recreate it (takes a while):
+        if sstats_sum == 0 and "corpus" in self.gensim_kw_args and self.gensim_kw_args["corpus"] is not None:
+            for document in self.gensim_kw_args["corpus"]:
+                for token in document:
+                    sstats_sum += token[1]
+            self.sstats_sum = sstats_sum
+
+        stable_topics = self.get_topics()
+
+        num_stable_topics = len(stable_topics)
+
+        if num_stable_topics == 0:
+            logger.error("the model did not detect any stable topic. You can try to adjust epsilon: "
+                         "recluster(eps=...)")
+            self.classic_model_representation = None
+            return
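+
+        # The construction below relies on an LdaModel identity: get_topics() returns
+        # the row-normalized (eta + sstats), so with passes=0 (no training) and
+        # sstats = stable_topics * normalization_factor - eta, the new model
+        # reproduces the stable topics exactly.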
+        # create a new gensim model
+        params = self.gensim_kw_args.copy()
+        params["eta"] = self.eta
+        params["num_topics"] = num_stable_topics
+        # adjust params in a way that no training happens
+        params["passes"] = 0  # no training
+        # iterations is needed for inference, pass it to the model
+
+        classic_model_representation = self.get_topic_model_class()(**params)
+
+        # when eta was None, use what gensim generates as default eta for the following tasks:
+        eta = classic_model_representation.eta
+        if sstats_sum == 0:
+            sstats_sum = classic_model_representation.state.sstats.sum()
+            self.sstats_sum = sstats_sum
+
+        # the following is important for the denormalization
+        # to generate the proper sstats for the new gensim model:
+        # transform to dimensionality of stable_topics. axis=1 is summed
+        eta_sum = 0
+        if isinstance(eta, (int, float)):
+            eta_sum = [eta * len(stable_topics[0])] * num_stable_topics
+        else:
+            if len(eta.shape) == 1:  # [e1, e2, e3]
+                eta_sum = [[eta.sum()]] * num_stable_topics
+            if len(eta.shape) > 1:  # [[e11, e12, ...], [e21, e22, ...], ...]
+                eta_sum = np.array(eta.sum(axis=1)[:, None])
+
+        # the normalization factor that is applied when get_topics() is used never
+        # changes, because the sum of eta as well as the sum of sstats is constant.
+        # Therefore the normalization_factor can be predicted, and the right sstats
+        # can be calculated so that get_topics() returns the stable topics no matter
+        # the eta.
+
+        normalization_factor = np.array([[sstats_sum / num_stable_topics]] * num_stable_topics) + eta_sum
+
+        sstats = stable_topics * normalization_factor
+        sstats -= eta
+
+        classic_model_representation.state.sstats = sstats.astype(np.float32)
+        # fix expElogbeta.
+        classic_model_representation.sync_state()
+
+        self.classic_model_representation = classic_model_representation
+
+        return classic_model_representation
+
+    def add_model(self, target, num_new_models=None):
+        """Add the topic term distribution array (ttda) of another model to the ensemble.
+
+        This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use
+        the exact same dictionary/id2word mapping.
+
+        In order to generate new stable topics afterwards, use:
+            1. ``self._generate_asymmetric_distance_matrix()``
+            2. ``self.``:meth:`~gensim.models.ensemblelda.EnsembleLda.recluster`
+
+        The ttda of another ensemble can also be used; in that case set ``num_new_models`` to the ``num_models``
+        parameter of that ensemble, i.e. the number of classic models in it that generated the ttda.
+        This is important, because that information is used to estimate "min_samples" for _generate_topic_clusters.
+
+        If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other
+        models, you can get it from: ``self.id2word``.
+
+        Parameters
+        ----------
+        target : {see description}
+            1. A single EnsembleLda object
+            2. List of EnsembleLda objects
+            3. A single Gensim topic model (e.g. :py:class:`gensim.models.LdaModel`)
+            4. List of Gensim topic models
+
+            if memory_friendly_ttda is True, target can also be:
+            5. topic-term-distribution-array
+
+            example: [[0.1, 0.1, 0.8], [...], ...]
+
+            [topic1, topic2, ...]
+            with topic being an array of probabilities:
+            [token1, token2, ...]
+
+            Token probabilities in a single topic sum to one; therefore, the sum over the whole ttda is len(ttda).
+
+        num_new_models : integer, optional
+            the model keeps track of how many models were used in this ensemble. Set higher if ttda contained topics
+            from more than one model. Default: None, which takes care of it automatically.
+
+            If target is a 2D-array of float values, it assumes 1.
+
+            If the ensemble has ``memory_friendly_ttda`` set to False, then it will always use the number of models in
+            the target parameter.
+
+        """
+        # If the model has never seen a ttda before, initialize.
+        # If it has, append.
+
+        # Be flexible. Can be a single element or a list of elements.
+        # Make sure it is a numpy array.
+        if not isinstance(target, (np.ndarray, list)):
+            target = np.array([target])
+        else:
+            target = np.array(target)
+            assert len(target) > 0
+
+        if self.memory_friendly_ttda:
+            # for memory friendly models/ttdas, append the ttdas to itself
+
+            detected_num_models = 0
+
+            ttda = []
+
+            # 1. ttda array, because that's the only accepted input that contains numbers
+            if isinstance(target.dtype.type(), (np.number, float)):
+                ttda = target
+                detected_num_models = 1
+
+            # 2. list of ensemblelda objects
+            elif isinstance(target[0], type(self)):
+                ttda = np.concatenate([ensemble.ttda for ensemble in target], axis=0)
+                detected_num_models = sum([ensemble.num_models for ensemble in target])
+
+            # 3. list of gensim models
+            elif isinstance(target[0], basemodel.BaseTopicModel):
+                ttda = np.concatenate([model.get_topics() for model in target], axis=0)
+                detected_num_models = len(target)
+
+            # unknown
+            else:
+                raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0])))
+
+            # new models were added, increase num_models
+            # if the user didn't provide a custom number to use
+            if num_new_models is None:
+                self.num_models += detected_num_models
+            else:
+                self.num_models += num_new_models
+
+        else:  # memory unfriendly ensembles
+
+            ttda = []
+
+            # 1. ttda array
+            if isinstance(target.dtype.type(), (np.number, float)):
+                raise ValueError(
+                    'ttda arrays cannot be added to ensembles for which memory_friendly_ttda=False; '
+                    'you can call convert_to_memory_friendly, but it will discard the stored gensim '
+                    'models and only keep the relevant topic term distributions from them.')
+
+            # 2. list of ensembles
+            elif isinstance(target[0], type(self)):
+                for ensemble in target:
+                    self.tms += ensemble.tms
+                ttda = np.concatenate([ensemble.ttda for ensemble in target], axis=0)
+
+            # 3. list of gensim models
+            elif isinstance(target[0], basemodel.BaseTopicModel):
+                self.tms += target.tolist()
+                ttda = np.concatenate([model.get_topics() for model in target], axis=0)
+
+            # unknown
+            else:
+                raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0])))
+
+            # in this case, len(self.tms) should
+            # always match self.num_models
+            if num_new_models is not None and num_new_models + self.num_models != len(self.tms):
+                logger.info(
+                    'num_new_models will be ignored. num_models should match the number of '
+                    'stored models for a memory unfriendly ensemble')
+            self.num_models = len(self.tms)
+
+        logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))
+
+        if self.ttda.shape[1] != ttda.shape[1]:
+            raise ValueError("target ttda dimensions do not match. 
Topics must be {} elements large, but were {}"
+                             .format(self.ttda.shape[-1], ttda.shape[-1]))
+        self.ttda = np.append(self.ttda, ttda, axis=0)
+
+        # tell recluster that the distance matrix needs to be regenerated
+        self.asymmetric_distance_matrix_outdated = True
+
+    def _teardown(self, pipes, processes, i):
+        """Close pipes and terminate processes.
+
+        Parameters
+        ----------
+        pipes : {list of :class:`multiprocessing.Pipe`}
+            list of pipes that the processes use to communicate with the parent
+        processes : {list of :class:`multiprocessing.Process`}
+            list of worker processes
+        i : int
+            index of the process that could not be started
+
+        """
+        logger.error("could not start process {}".format(i))
+
+        for pipe in pipes:
+            pipe[1].close()
+            pipe[0].close()
+
+        for process in processes:
+            if process.is_alive():
+                process.terminate()
+            del process
+
+    def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
+        """Generate the topic models to form the ensemble in a multiprocessed way.
+
+        Depending on the used topic model this can result in a speedup.
+
+        Parameters
+        ----------
+        num_models : int
+            how many models to train in the ensemble
+        ensemble_workers : int
+            into how many processes to split the models; will be capped at
+            min(ensemble_workers, num_models), to avoid workers that are supposed to train 0 models.
+
+            To get maximum performance, set it to the number of your cores if non-parallelized models
+            are being used in the ensemble (LdaModel).
+
+            For LdaMulticore, the performance gain is small (and grows as the corpus gets smaller);
+            in that case, ensemble_workers=2 can be used.
+
+        """
+        # random_states have to be handled in a way that prevents getting different results when multiprocessing
+        # is on, or getting the same results in every lda child. This is solved by generating a list of state
+        # seeds before multiprocessing is started.
+        random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)]
+
+        # each worker has to work on at least one model.
+        # Don't spawn idle workers:
+        workers = min(ensemble_workers, num_models)
+
+        # create worker processes:
+        # this is basically forking with a jump to a target function in each child, so
+        # modifying the ensemble object in a child will not modify the one in the parent,
+        # because there is no shared memory
+        processes = []
+        pipes = []
+        num_models_unhandled = num_models  # how many more models need to be trained by workers?
+
+        for i in range(workers):
+            parent_conn, child_conn = Pipe()
+            num_subprocess_models = 0
+            if i == workers - 1:  # i is an index, hence -1
+                # is this the last worker that needs to be created?
+                # then task that worker with all the remaining models
+                num_subprocess_models = num_models_unhandled
+            else:
+                num_subprocess_models = int(num_models_unhandled / (workers - i))
+
+            # get the chunk from the random states that is meant to be for those models
+            random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models]
+
+            try:
+                process = Process(
+                    target=self._generate_topic_models,
+                    args=(num_subprocess_models, random_states_for_worker, child_conn))
+
+                processes.append(process)
+                pipes.append((parent_conn, child_conn))
+                process.start()
+
+                num_models_unhandled -= num_subprocess_models
+
+            except ProcessError:
+                self._teardown(pipes, processes, i)
+                raise ProcessError
+
+        # aggregate results
+        # will also block until workers are finished
+        for pipe in pipes:
+            answer = pipe[0].recv()  # [0], because that is the parent_conn
+            pipe[0].close()
+            # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
+            if not self.memory_friendly_ttda:
+                self.tms += answer
+                ttda = np.concatenate([model.get_topics() for model in answer])
+            else:
+                ttda = answer
+            self.ttda = np.concatenate([self.ttda, ttda])
+
+        # end all processes
+        for process in processes:
+            process.terminate()
+
+    def _generate_topic_models(self, num_models, random_states=None, pipe=None):
+        """Train the topic models that form the ensemble.
+
+        Parameters
+        ----------
+        num_models : int
+            number of models to be generated
+        random_states : list
+            list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensemble's
+            RandomState if None (default).
+        pipe : multiprocessing.Pipe
+            Default None. If provided, will send the trained models over this pipe. If memory friendly, it will only
+            send the ttda.
+
+        """
+        if pipe is not None:
+            logger.info("spawned worker to generate {} topic models".format(num_models))
+
+        if random_states is None:
+            random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)]
+
+        assert len(random_states) == num_models
+
+        kwargs = self.gensim_kw_args.copy()
+
+        tm = None  # remember one of the topic models from the following
+        # loop, in order to collect some properties from it afterwards.
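+
+        # each model is trained on the same corpus but with its own random seed;
+        # its unnormalized topic-term rows are appended to the ensemble's ttda
+        # right after training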
+
+        for i in range(num_models):
+
+            kwargs["random_state"] = random_states[i]
+
+            tm = self.get_topic_model_class()(**kwargs)
+
+            # add the lambda (that is, the unnormalized get_topics) to ttda, which is
+            # a list of all those lambdas
+            self.ttda = np.concatenate([self.ttda, tm.get_topics()])
+
+            # only save the model if it is not "memory friendly"
+            if not self.memory_friendly_ttda:
+                self.tms += [tm]
+
+        # use one of the tms to get some info that will be needed later
+        self.sstats_sum = tm.state.sstats.sum()
+        self.eta = tm.eta
+
+        if pipe is not None:
+            # send the ttda that is in the child/worker's version of the memory into the pipe;
+            # it is available after _generate_topic_models has been called in the worker
+            if self.memory_friendly_ttda:
+                # remember that this code is inside the worker process's memory,
+                # so self.ttda is the ttda of only a chunk of models
+                pipe.send(self.ttda)
+            else:
+                pipe.send(self.tms)
+
+            pipe.close()
+
+    def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method):
+        """Worker that computes the distance to all other nodes from a chunk of nodes."""
+        logger.info("spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas))
+        # the chunk of ttda that's going to be calculated:
+        ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas]
+        distance_chunk = self._calculate_asymmetric_distance_matrix_chunk(
+            ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method)
+        pipe.send((worker_id, distance_chunk))  # remember that this code is inside the worker's memory
+        pipe.close()
+
+    def _generate_asymmetric_distance_matrix(self):
+        """Calculate the pairwise distance matrix for all the ttdas from the ensemble.
+
+        Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering.
+
+        Afterwards, the model needs to be reclustered for this generated matrix to take effect.
+
+        """
+        workers = self.distance_workers
+        threshold = self.masking_threshold
+        method = self.masking_method
+
+        # matrix is up to date afterwards
+        self.asymmetric_distance_matrix_outdated = False
+
+        if threshold is None:
+            threshold = {"mass": 0.95, "rank": 0.11}[method]
+
+        logger.info("generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda)))
+
+        # singlecore
+        if workers is not None and workers <= 1:
+            self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk(
+                ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method)
+            return self.asymmetric_distance_matrix
+
+        # else, if workers > 1 use multiprocessing
+        # best performance on 2-core machine: 2 workers
+        if workers is None:
+            workers = os.cpu_count()
+
+        # create worker processes:
+        processes = []
+        pipes = []
+        ttdas_sent = 0
+
+        for i in range(workers):
+            try:
+                parent_conn, child_conn = Pipe()
+
+                # Load balancing: for example, if there are 9 ttdas and 4 workers, the load will be
+                # balanced 2, 2, 2, 3.
+                n_ttdas = 0
+                if i == workers - 1:  # i is an index, hence -1
+                    # is this the last worker that needs to be created?
+                    # then task that worker with all the remaining models
+                    n_ttdas = len(self.ttda) - ttdas_sent
+                else:
+                    n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i))
+
+                process = Process(target=self._asymmetric_distance_matrix_worker,
+                                  args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method))
+                ttdas_sent += n_ttdas
+
+                processes.append(process)
+                pipes.append((parent_conn, child_conn))
+                process.start()
+
+            except ProcessError:
+                self._teardown(pipes, processes, i)
+                raise ProcessError
+
+        distances = []
+        # note that the following loop maintains the order in which the ttda is concatenated,
+        # which is very important: ordering in ttda has to be the same as when using only one process
+        for pipe in pipes:
+            answer = pipe[0].recv()  # [0], because that is the parent_conn
+            pipe[0].close()  # child conn will be closed from inside the worker
+            # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
+            distances.append(answer[1])
+
+        # end all processes
+        for process in processes:
+            process.terminate()
+
+        self.asymmetric_distance_matrix = np.concatenate(distances)
+
+        return self.asymmetric_distance_matrix
+
+    def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method):
+        """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.
+
+        Parameters
+        ----------
+        ttda1 and ttda2 : 2D arrays of floats
+            Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+            topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
+        threshold : float, optional
+            threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method
+        start_index : int
+            this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+            complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split
+            into two pieces, each 100 ttdas long, then the start_index of the second piece should be 100. Default
+            is 0.
+        method : {'mass', 'rank'}, optional
+            method can be "mass" for the original masking method or "rank" for a faster masking method that selects
+            by rank of largest elements in the topic term distribution, to determine which tokens are relevant for
+            the topic.
+
+        Returns
+        -------
+        2D numpy.ndarray of floats
+            Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
+
+        """
+        # initialize the distance matrix. ndarray is faster than zeros
+        distances = np.ndarray((len(ttda1), len(ttda2)))
+
+        if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
+            # the worker might not have received a ttda because it was chunked up too much
+
+            if method not in ["mass", "rank"]:
+                raise ValueError("method {} unknown".format(method))
+
+            # select masking method:
+            def mass_masking(a):
+                """Original masking method. Returns a new binary mask."""
+                sorted_a = np.sort(a)[::-1]
+                largest_mass = sorted_a.cumsum() < threshold
+                smallest_valid = sorted_a[largest_mass][-1]
+                return a >= smallest_valid
+
+            def rank_masking(a):
+                """Faster masking method. 
Returns a new binary mask."""
+                return a > np.sort(a)[::-1][int(len(a) * threshold)]
+
+            create_mask = {"mass": mass_masking, "rank": rank_masking}[method]
+
+            # some help to find a better threshold through useful log messages
+            avg_mask_size = 0
+
+            # now iterate over each topic
+            for ttd1_idx, ttd1 in enumerate(ttda1):
+                # create mask from ttd1 that removes its noise and keeps the largest terms
+                mask = create_mask(ttd1)
+                ttd1_masked = ttd1[mask]
+
+                avg_mask_size += mask.sum()
+
+                # now look at every possible pair for topic ttd1:
+                for ttd2_idx, ttd2 in enumerate(ttda2):
+                    # distance to itself is 0
+                    if ttd1_idx + start_index == ttd2_idx:
+                        distances[ttd1_idx][ttd2_idx] = 0
+                        continue
+
+                    # now mask ttd2 based on ttd1, which will force the shape of ttd1 onto ttd2
+                    ttd2_masked = ttd2[mask]
+
+                    # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
+                    # distributions that will have distance values near 1.
+                    if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD:
+                        distance = 1
+                    else:
+                        distance = cosine(ttd1_masked, ttd2_masked)
+
+                    distances[ttd1_idx][ttd2_idx] = distance
+
+            percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
+            logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent))
+
+        return distances
+
+    def _generate_topic_clusters(self, eps=0.1, min_samples=None):
+        """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices.
+
+        The final approval and generation of stable topics is done in ``_generate_stable_topics()``.
+
+        Parameters
+        ----------
+        eps : float
+            DBSCAN distance scale
+        min_samples : int, optional
+            defaults to ``int(self.num_models / 2)``; the DBSCAN min-neighbours threshold, which corresponds to
+            finding stable topics and should scale with the number of models, ``self.num_models``
+
+        """
+        if min_samples is None:
+            min_samples = int(self.num_models / 2)
+
+        logger.info("fitting the clustering model")
+
+        self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples)
+        self.cluster_model.fit(self.asymmetric_distance_matrix)
+
+    def _is_valid_core(self, topic):
+        """Check if the topic is a valid core.
+
+        Parameters
+        ----------
+        topic : {'is_core', 'valid_parents', 'label'}
+            topic to validate
+
+        """
+        return topic["is_core"] and (topic["valid_parents"] == {topic["label"]})
+
+    def _group_by_labels(self, results):
+        """Group all the learned cores by their label, which was assigned in the cluster_model.
+
+        Parameters
+        ----------
+        results : {list of {'is_core', 'neighboring_labels', 'label'}}
+            After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
+            member, which can be used as the argument to this function. It's a list of infos gathered during
+            the clustering step and each element in the list corresponds to a single topic.
+
+        Returns
+        -------
+        dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
+            A mapping of the label to a list of topics that belong to that particular label. Also adds
+            a new member to each topic called num_neighboring_labels, which is the number of
+            neighboring_labels of that topic.
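+
+            A schematic example of the returned mapping (values are illustrative)::
+
+                {0: [{'is_core': True, 'label': 0, 'neighboring_labels': {0, 1},
+                      'num_neighboring_labels': 2}, ...], ...}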
+
+        """
+        grouped_by_labels = {}
+        for topic in results:
+            if topic["is_core"]:
+                topic = topic.copy()
+
+                # counts how many different labels a core has as parents
+                topic["num_neighboring_labels"] = len(topic["neighboring_labels"])
+
+                label = topic["label"]
+                if label not in grouped_by_labels:
+                    grouped_by_labels[label] = []
+                grouped_by_labels[label].append(topic)
+        return grouped_by_labels
+
+    def _aggregate_topics(self, grouped_by_labels):
+        """Aggregate the labeled topics to a list of clusters.
+
+        Parameters
+        ----------
+        grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
+            The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the
+            label.
+
+        Returns
+        -------
+        list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'}
+            max_num_neighboring_labels is the max number of parent labels among each topic of a given cluster. label
+            refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the
+            neighboring_labels sets of each topic. The list is sorted by max_num_neighboring_labels in ascending
+            order (cf. the ``reverse=False`` sort below). There is one single element for each cluster.
+
+        """
+        sorted_clusters = []
+
+        for label, group in grouped_by_labels.items():
+            max_num_neighboring_labels = 0
+            neighboring_labels = []  # will be a list of sets
+
+            for topic in group:
+                max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels)
+                neighboring_labels.append(topic["neighboring_labels"])
+
+            neighboring_labels = [x for x in neighboring_labels if len(x) > 0]
+
+            sorted_clusters.append({
+                "max_num_neighboring_labels": max_num_neighboring_labels,
+                "neighboring_labels": neighboring_labels,
+                "label": label,
+                "num_cores": len([topic for topic in group if topic["is_core"]])
+            })
+
+        sorted_clusters = sorted(sorted_clusters,
+                                 key=lambda cluster: (
+                                     cluster["max_num_neighboring_labels"],
+                                     cluster["label"]
+                                 ), reverse=False)
+
+        return sorted_clusters
+
+    def _remove_from_all_sets(self, label, clusters):
+        """Remove a label from every set in "neighboring_labels" for each core in ``clusters``."""
+        for cluster in clusters:
+            for neighboring_labels_set in cluster["neighboring_labels"]:
+                if label in neighboring_labels_set:
+                    neighboring_labels_set.remove(label)
+
+    def _contains_isolated_cores(self, label, cluster, min_cores):
+        """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster."""
+        return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores
+
+    def _generate_stable_topics(self, min_cores=None):
+        """Generate stable topics out of the clusters.
+
+        The function finds clusters of topics using a variant of DBSCAN. If a cluster has enough core topics
+        (c.f. parameter ``min_cores``), then this cluster represents a stable topic. The stable topic is specifically
+        calculated as the average over all topic-term distributions of the core topics in the cluster.
+
+        This function is the last step that has to be done in the ensemble. After this step is complete,
+        stable topics can be retrieved using the :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics`
+        method.
+
+        Parameters
+        ----------
+        min_cores : int
+            Minimum number of core topics needed to form a cluster that represents a stable topic.
+            Using ``None`` defaults to ``min_cores = min(3, max(1, int(self.num_models / 4 + 1)))``
+
+        """
+        # min_cores being 0 makes no sense: there has to be a core for a cluster,
+        # or there is no cluster
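+        # with the default formula below, e.g. num_models=4 gives min_cores=2 and
+        # num_models=8 or more gives min_cores=3 (the cap)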
+    def _generate_stable_topics(self, min_cores=None):
+        """Generate stable topics out of the clusters.
+
+        The function finds clusters of topics using a variant of DBSCAN. If a cluster has enough core topics
+        (cf. parameter ``min_cores``), then this cluster represents a stable topic. The stable topic is
+        specifically calculated as the average over all topic-term distributions of the core topics in the
+        cluster.
+
+        This function is the last step that has to be done in the ensemble. Once it is complete, the stable
+        topics can be retrieved using the :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics` method.
+
+        Parameters
+        ----------
+        min_cores : int
+            Minimum number of core topics needed to form a cluster that represents a stable topic.
+            Using ``None`` defaults to ``min_cores = min(3, max(1, int(self.num_models / 4 + 1)))``.
+
+        """
+        # min_cores being 0 makes no sense. There has to be a core for a cluster,
+        # or there is no cluster.
+        if min_cores == 0:
+            min_cores = 1
+
+        if min_cores is None:
+            # min_cores is a number between 1 and 3, depending on the number of models
+            min_cores = min(3, max(1, int(self.num_models / 4 + 1)))
+            logger.info("generating stable topics, each cluster needs at least {} cores".format(min_cores))
+        else:
+            logger.info("generating stable topics")
+
+        results = self.cluster_model.results
+
+        grouped_by_labels = self._group_by_labels(results)
+
+        sorted_clusters = self._aggregate_topics(grouped_by_labels)
+
+        for cluster in sorted_clusters:
+            cluster["is_valid"] = None
+            if cluster["num_cores"] < min_cores:
+                cluster["is_valid"] = False
+                self._remove_from_all_sets(cluster["label"], sorted_clusters)
+
+        # now that invalid clusters are removed, check which clusters contain enough cores that don't
+        # belong to any other cluster.
+        for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]:
+            label = cluster["label"]
+            if self._contains_isolated_cores(label, cluster, min_cores):
+                cluster["is_valid"] = True
+            else:
+                cluster["is_valid"] = False
+                # This modifies parent labels, which matters in _contains_isolated_cores, so the result
+                # depends on where the iteration starts.
+                self._remove_from_all_sets(label, sorted_clusters)
+
+        # list of all the label numbers that are valid
+        valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]])
+
+        for topic in results:
+            topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels}
+
+        # keep only the topics that are valid cores
+        valid_core_mask = np.vectorize(self._is_valid_core)(results)
+        valid_topics = self.ttda[valid_core_mask]
+        topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask]
+        unique_labels = np.unique(topic_labels)
+
+        num_stable_topics = len(unique_labels)
+        stable_topics = np.empty((num_stable_topics, len(self.id2word)))
+
+        # for each cluster
+        for i, label in enumerate(unique_labels):
+            # mean of all the topics that belong to that cluster
+            topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label])
+            stable_topics[i] = topics_of_cluster.mean(axis=0)
+
+        self.sorted_clusters = sorted_clusters
+        self.stable_topics = stable_topics
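The averaging at the end of `_generate_stable_topics` boils down to a per-cluster mean over rows of the ttda. A minimal sketch with made-up numbers:

```python
import numpy as np

# Three valid cores over a three-word vocabulary; the first two belong to cluster 0.
valid_topics = np.array([
    [0.6, 0.3, 0.1],
    [0.5, 0.4, 0.1],
    [0.1, 0.2, 0.7],
])
topic_labels = np.array([0, 0, 1])

# One stable topic per unique label: the mean of that cluster's core distributions.
stable_topics = np.array([
    valid_topics[topic_labels == label].mean(axis=0)
    for label in np.unique(topic_labels)
])
print(stable_topics)  # [[0.55 0.35 0.1 ] [0.1  0.2  0.7 ]]
```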
+    def recluster(self, eps=0.1, min_samples=None, min_cores=None):
+        """Reapply CBDBSCAN clustering and stable topic generation.
+
+        Stable topics can be retrieved using :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics`.
+
+        Parameters
+        ----------
+        eps : float
+            epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
+            default: ``0.1``
+        min_samples : int
+            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
+            default: ``int(self.num_models / 2)``
+        min_cores : int
+            Minimum number of cores a cluster has to contain in order to be treated as a stable topic, i.e.
+            how many similar-looking topics have to be present so that their average can be used as a stable
+            topic.
+            default: ``min(3, max(1, int(self.num_models / 4 + 1)))``
+
+        """
+        # if new models were added to the ensemble, the distance matrix needs to be generated again
+        if self.asymmetric_distance_matrix_outdated:
+            logger.info("asymmetric distance matrix is outdated due to add_model")
+            self._generate_asymmetric_distance_matrix()
+
+        # Run CBDBSCAN to get topic clusters:
+        self._generate_topic_clusters(eps, min_samples)
+
+        # Interpret the results of CBDBSCAN to identify reliable topics:
+        self._generate_stable_topics(min_cores)
+
+        # Create a gensim LdaModel representation of the reliable topics (can be used for inference):
+        self.generate_gensim_representation()
+
+    # GENSIM API
+    # to make using the ensemble in place of a gensim model as easy as possible
+
+    def get_topics(self):
+        """Return only the stable topics from the ensemble.
+
+        Returns
+        -------
+        2D numpy.ndarray of floats
+            List of stable topic-term distributions.
+
+        """
+        return self.stable_topics
+
+    def _has_gensim_representation(self):
+        """Check if stable topics and the internal gensim representation exist. Raise an error if not."""
+        if self.classic_model_representation is None:
+            if len(self.stable_topics) == 0:
+                raise ValueError("no stable topic was detected")
+            else:
+                raise ValueError("use generate_gensim_representation() first")
+
+    def __getitem__(self, i):
+        """See :meth:`gensim.models.LdaModel.__getitem__`."""
+        self._has_gensim_representation()
+        return self.classic_model_representation[i]
+
+    def inference(self, *posargs, **kwargs):
+        """See :meth:`gensim.models.LdaModel.inference`."""
+        self._has_gensim_representation()
+        return self.classic_model_representation.inference(*posargs, **kwargs)
+
+    def log_perplexity(self, *posargs, **kwargs):
+        """See :meth:`gensim.models.LdaModel.log_perplexity`."""
+        self._has_gensim_representation()
+        return self.classic_model_representation.log_perplexity(*posargs, **kwargs)
+
+    def print_topics(self, *posargs, **kwargs):
+        """See :meth:`gensim.models.LdaModel.print_topics`."""
+        self._has_gensim_representation()
+        return self.classic_model_representation.print_topics(*posargs, **kwargs)
+
+    @property
+    def id2word(self):
+        """Return the :py:class:`gensim.corpora.dictionary.Dictionary` object used in the model."""
+        return self.gensim_kw_args["id2word"]
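Because these delegating methods make the ensemble behave like a classic gensim model, a usage sketch can be kept very short. This uses the shared test fixtures for brevity; the parameters are illustrative, not recommendations, and it assumes a gensim representation has been generated (as `recluster` above does), otherwise `_has_gensim_representation` raises a `ValueError`.

```python
from gensim.models import EnsembleLda
from gensim.test.utils import common_corpus, common_dictionary

# Train a tiny ensemble on the toy test corpus.
elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                   num_topics=2, num_models=4, passes=10)

print(elda.print_topics(num_words=3))  # delegates to the classic model representation
print(elda[common_corpus[0]])          # __getitem__ works like LdaModel.__getitem__
```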
+
+
+class CBDBSCAN():
+    """A variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN).
+
+    The algorithm works based on the DBSCAN-like parameters 'eps' and 'min_samples', which respectively
+    define how far a "nearby" point is, and the minimum number of nearby points needed to label a candidate
+    datapoint a core of a cluster (see https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).
+
+    The algorithm works as follows:
+
+    1. (A)symmetric distance matrix provided at fit-time (called 'amatrix').
+       For the sake of the example below, assume there are only five topics (amatrix contains distances with
+       dim 5x5): T_1, T_2, T_3, T_4, T_5.
+    2. Start by scanning a candidate topic with respect to a parent topic
+       (e.g. T_1 with respect to parent None).
+    3. Check which topics are nearby the candidate topic using 'self.eps' as a threshold and call them
+       neighbours (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours).
+    4. If there are more neighbours than 'self.min_samples', the candidate topic becomes a core candidate
+       for a cluster (e.g. if 'min_samples'=1, then T_1 becomes the first core of a cluster).
+    5. If the candidate is a core, CheckBack (CB) to find the fraction of neighbours that are either the
+       parent or the parent's neighbours. If this fraction is more than 75%, give the candidate the same
+       label as its parent (e.g. in the trivial case there is no parent (or neighbours of that parent), so a
+       new incremental label is given).
+    6. If the candidate is a core, recursively scan the next nearby topic (e.g. scan T_3), labeling the
+       previous topic as the parent and the previous neighbours as the parent_neighbours - repeat steps 2-6:
+
+       2. (e.g. scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5)
+       3. (e.g. T_5 is the only neighbour)
+       4. (e.g. the number of neighbours is 1, therefore candidate T_3 becomes a core)
+       5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of
+          candidate T_3. Therefore the candidate T_3 does NOT get the same label as its parent T_1)
+       6. (e.g. scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5)
+
+    The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating
+    clusters for unreliable topics that are composed of multiple reliable topics (something that occurs often
+    in LDA models and is one cause of unreliable topics).
+
+    """
+
+    def __init__(self, eps, min_samples):
+        """Create a new CBDBSCAN object. Call fit in order to train it on an asymmetric distance matrix.
+
+        Parameters
+        ----------
+        eps : float
+            epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
+        min_samples : int
+            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
+
+        """
+        self.eps = eps
+        self.min_samples = min_samples
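As a sanity check of the behaviour described in the class docstring, here is a minimal sketch that fits CBDBSCAN (its `fit` method follows below) on an invented 3x3 asymmetric distance matrix. The import path assumes the module location added by this PR.

```python
import numpy as np
from gensim.models.ensemblelda import CBDBSCAN

# Topics 0 and 1 are mutually close; topic 2 is far from everything.
amatrix = np.array([
    [0.0, 0.1, 0.9],
    [0.1, 0.0, 0.9],
    [0.9, 0.9, 0.0],
])

clustering = CBDBSCAN(eps=0.2, min_samples=1)
clustering.fit(amatrix)

for topic in clustering.results:
    print(topic["label"], topic["is_core"], topic["neighboring_labels"])
# expected: topics 0 and 1 are cores sharing label 0, topic 2 gets the noise label -1
```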
+ topic_clustering_results[topic_index]["is_core"] = True + + # if current_label is none, then this is the first core + # of a new cluster (hence next_label is used) + if current_label is None: + # next_label is initialized with 0 in fit() for the first cluster + current_label = self.next_label + self.next_label += 1 + + else: + # In case the core has a parent, check the distance to the parents neighbors (since the matrix is + # asymmetric, it takes return distances into account here) + # If less than 25% of the elements are close enough, then create a new cluster rather than further + # growing the current cluster in that direction. + close_parent_neighbors_mask = amatrix_copy[topic_index][parent_neighbors] < self.eps + + if close_parent_neighbors_mask.mean() < 0.25: + # start new cluster by changing current_label + current_label = self.next_label + self.next_label += 1 + + topic_clustering_results[topic_index]["label"] = current_label + + for neighboring_topic_index in neighboring_topic_indices: + if topic_clustering_results[neighboring_topic_index]["label"] is None: + ordered_min_similarity.remove(neighboring_topic_index) + # try to extend the cluster into the direction of the neighbor + scan_topic(neighboring_topic_index, current_label, neighboring_topic_indices + [topic_index]) + + topic_clustering_results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) + topic_clustering_results[neighboring_topic_index]["neighboring_labels"].add(current_label) + + else: + # this topic is not a core! + if current_label is None: + topic_clustering_results[topic_index]["label"] = -1 + else: + topic_clustering_results[topic_index]["label"] = current_label + + # elements are going to be removed from that array in scan_topic, do until it is empty + while len(ordered_min_similarity) != 0: + next_topic_index = ordered_min_similarity.pop(0) + scan_topic(next_topic_index) + + self.results = topic_clustering_results diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda new file mode 100644 index 0000000000..127d00888c Binary files /dev/null and b/gensim/test/test_data/ensemblelda differ diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py new file mode 100644 index 0000000000..6794cb71a0 --- /dev/null +++ b/gensim/test/test_ensemblelda.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Tobias B + +""" +Automated tests for checking the EnsembleLda Class +""" + +import logging +import unittest + +import numpy as np +from copy import deepcopy + +from gensim.models import EnsembleLda, LdaMulticore, LdaModel +from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary + +num_topics = 2 +num_models = 4 +passes = 50 + + +class TestModel(unittest.TestCase): + def setUp(self): + # same configuration for each model to make sure + # the topics are equal + random_state = 0 + + self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + topic_model_class=LdaModel) + + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + memory_friendly_ttda=False, topic_model_class=LdaModel) + + def check_ttda(self, ensemble): + """tests the integrity of the ttda of any ensemble""" + self.assertGreater(len(ensemble.ttda), 0) + a = ensemble.ttda.sum(axis=1) + b = 
+    def test_eLDA(self):
+        # given that the random_state doesn't change, there should
+        # always be 2 detected topics in this setup.
+        self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary))
+        self.assertEqual(len(self.eLDA.ttda), num_models * num_topics)
+        self.check_ttda(self.eLDA)
+
+        # reclustering shouldn't change anything without
+        # added models or different parameters
+        self.eLDA.recluster()
+
+        self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary))
+        self.assertEqual(len(self.eLDA.ttda), num_models * num_topics)
+        self.check_ttda(self.eLDA)
+
+        # compare with a pre-trained reference model
+        reference = EnsembleLda.load(datapath('ensemblelda'))
+        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
+        # small values in the distance matrix tend to vary quite a bit around 2%,
+        # so use some absolute tolerance measurement to check if the matrix is at least
+        # close to the target.
+        atol = reference.asymmetric_distance_matrix.max() * 1e-05
+        np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix,
+                                   reference.asymmetric_distance_matrix, atol=atol)
+    def test_clustering(self):
+        # the following test is quite specific to the current implementation and not part of any api,
+        # but it makes improving those sections of the code easier as long as sorted_clusters and the
+        # cluster_model results are supposed to stay the same. This test may be deprecated at some point.
+
+        reference = EnsembleLda.load(datapath('ensemblelda'))
+        cluster_model_results = deepcopy(reference.cluster_model.results)
+        sorted_clusters = deepcopy(reference.sorted_clusters)
+        stable_topics = deepcopy(reference.get_topics())
+
+        # continue training with the distance matrix of the pretrained reference and see if
+        # the generated clusters match.
+        reference.asymmetric_distance_matrix_outdated = True
+        reference.recluster()
+
+        self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results)
+        self.assertEqual(reference.sorted_clusters, sorted_clusters)
+        np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05)
+
+    def test_not_trained(self):
+        # should not throw errors and no training should happen
+
+        # 0 passes
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           passes=0, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # no corpus
+        eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics,
+                           passes=passes, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # 0 iterations
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           iterations=0, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # 0 models
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           passes=passes, num_models=0, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+    def test_memory_unfriendly(self):
+        # at this point, self.eLDA_mu and self.eLDA are already trained
+        # in the setUp function.
+        # both should be 100% similar (but floats cannot be compared
+        # that easily, so compare with a tolerance)
+        self.assertEqual(len(self.eLDA_mu.tms), num_models)
+        np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05)
+        np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05)
+        self.check_ttda(self.eLDA_mu)
+
+    def test_generate_gensim_rep(self):
+        gensimModel = self.eLDA.generate_gensim_representation()
+        topics = gensimModel.get_topics()
+        np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05)
+
+    def assert_cluster_results_equal(self, a, b):
+        """Compare the important attributes of two cluster results."""
+        np.testing.assert_array_equal([row["label"] for row in a],
+                                      [row["label"] for row in b])
+        np.testing.assert_array_equal([row["is_core"] for row in a],
+                                      [row["is_core"] for row in b])
+    def test_persisting(self):
+        fname = get_tmpfile('gensim_models_ensemblelda')
+        self.eLDA.save(fname)
+        loaded_eLDA = EnsembleLda.load(fname)
+        # store the ensemble without memory_friendly_ttda
+        self.eLDA_mu.save(fname)
+        loaded_eLDA_mu = EnsembleLda.load(fname)
+
+        # topic_model_class will be lazily loaded and should be None at first
+        assert loaded_eLDA.topic_model_class is None
+
+        # was it stored and loaded correctly?
+        # memory friendly:
+        loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation()
+
+        # generating the representation also lazily loads the topic_model_class
+        assert loaded_eLDA.topic_model_class == LdaModel
+
+        topics = loaded_eLDA_representation.get_topics()
+        ttda = loaded_eLDA.ttda
+        amatrix = loaded_eLDA.asymmetric_distance_matrix
+        np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05)
+        np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05)
+        np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05)
+
+        a = self.eLDA.cluster_model.results
+        b = loaded_eLDA.cluster_model.results
+
+        self.assert_cluster_results_equal(a, b)
+
+        # memory unfriendly:
+        loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation()
+        topics = loaded_eLDA_mu_representation.get_topics()
+        np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05)
+
+    def test_multiprocessing(self):
+        # same configuration
+        random_state = 0
+
+        # use 3 processes for the ensemble and the distance,
+        # so that the 4 models and 8 topics cannot be distributed
+        # to each worker evenly
+        workers = 3
+
+        # memory friendly. contains a list of topic-term distributions
+        eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel,
+                                 num_topics=num_topics, passes=passes, num_models=num_models,
+                                 random_state=random_state, ensemble_workers=workers, distance_workers=workers)
+
+        # memory unfriendly. contains a list of models
+        eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel,
+                                    num_topics=num_topics, passes=passes, num_models=num_models,
+                                    random_state=random_state, ensemble_workers=workers, distance_workers=workers,
+                                    memory_friendly_ttda=False)
+
+        np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05)
+        np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05)
+    def test_add_models(self):
+        # same configuration
+        num_models = self.eLDA.num_models
+
+        # make sure counts and sizes are correct after adding models.
+        # create new models and add other models to them.
+
+        # there are many possible configurations for the first parameter;
+        # try them all
+
+        # quickly train something that can be used for counting results
+        num_new_models = 3
+        num_new_topics = 3
+
+        # 1. memory friendly
+        eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
+                                num_topics=num_new_topics, passes=1, num_models=num_new_models,
+                                iterations=1, random_state=0, topic_model_class=LdaMulticore,
+                                workers=3, ensemble_workers=2)
+
+        # 1.1 ttda
+        a = len(eLDA_base.ttda)
+        b = eLDA_base.num_models
+        eLDA_base.add_model(self.eLDA.ttda)
+        self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda))
+        self.assertEqual(eLDA_base.num_models, b + 1)  # defaults to 1 for one ttda matrix
+
+        # 1.2 an ensemble
+        a = len(eLDA_base.ttda)
+        b = eLDA_base.num_models
+        eLDA_base.add_model(self.eLDA, 5)
+        self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda))
+        self.assertEqual(eLDA_base.num_models, b + 5)
+
+        # 1.3 a list of ensembles
+        a = len(eLDA_base.ttda)
+        b = eLDA_base.num_models
+        # it should be totally legit to add a memory unfriendly object to a memory friendly one
+        eLDA_base.add_model([self.eLDA, self.eLDA_mu])
+        self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda))
+        self.assertEqual(eLDA_base.num_models, b + 2 * num_models)
+
+        # 1.4 a single gensim model
+        model = self.eLDA.classic_model_representation
+
+        a = len(eLDA_base.ttda)
+        b = eLDA_base.num_models
+        eLDA_base.add_model(model)
+        self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics()))
+        self.assertEqual(eLDA_base.num_models, b + 1)
+
+        # 1.5 a list of gensim models
+        a = len(eLDA_base.ttda)
+        b = eLDA_base.num_models
+        eLDA_base.add_model([model, model])
+        self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics()))
+        self.assertEqual(eLDA_base.num_models, b + 2)
+
+        self.check_ttda(eLDA_base)
+
+        # 2. memory unfriendly
+        eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
+                                   num_topics=num_new_topics, passes=1, num_models=num_new_models,
+                                   iterations=1, random_state=0, topic_model_class=LdaMulticore,
+                                   workers=3, ensemble_workers=2, memory_friendly_ttda=False)
+
+        # 2.1 a single ensemble
+        a = len(eLDA_base_mu.tms)
+        b = eLDA_base_mu.num_models
+        eLDA_base_mu.add_model(self.eLDA_mu)
+        self.assertEqual(len(eLDA_base_mu.tms), a + num_models)
+        self.assertEqual(eLDA_base_mu.num_models, b + num_models)
+
+        # 2.2 a list of ensembles
+        a = len(eLDA_base_mu.tms)
+        b = eLDA_base_mu.num_models
+        eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu])
+        self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models)
+        self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models)
+
+        # 2.3 a single gensim model
+        a = len(eLDA_base_mu.tms)
+        b = eLDA_base_mu.num_models
+        eLDA_base_mu.add_model(self.eLDA_mu.tms[0])
+        self.assertEqual(len(eLDA_base_mu.tms), a + 1)
+        self.assertEqual(eLDA_base_mu.num_models, b + 1)
+
+        # 2.4 a list of gensim models
+        a = len(eLDA_base_mu.tms)
+        b = eLDA_base_mu.num_models
+        eLDA_base_mu.add_model(self.eLDA_mu.tms)
+        self.assertEqual(len(eLDA_base_mu.tms), a + num_models)
+        self.assertEqual(eLDA_base_mu.num_models, b + num_models)
+
+        # 2.5 topic-term distributions should throw errors, because the
+        # actual models are needed for the memory unfriendly ensemble
+        a = len(eLDA_base_mu.tms)
+        b = eLDA_base_mu.num_models
+        self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics()))
+        # remains unchanged
+        self.assertEqual(len(eLDA_base_mu.tms), a)
+        self.assertEqual(eLDA_base_mu.num_models, b)
+
+        self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms))
+        self.check_ttda(eLDA_base_mu)
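The pattern these tests exercise translates into a short user-facing workflow when growing an ensemble. A sketch under the assumption of two small ensembles trained on the shared toy fixtures (parameters are illustrative):

```python
from gensim.models import EnsembleLda
from gensim.test.utils import common_corpus, common_dictionary

# Two small, independently trained ensembles over the same dictionary.
elda_a = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                     num_topics=2, num_models=4, passes=10, random_state=0)
elda_b = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                     num_topics=2, num_models=4, passes=10, random_state=1)

# Merging marks the distance matrix as outdated...
elda_a.add_model(elda_b)
# ...so recluster() regenerates it, reruns CBDBSCAN and the stable-topic step,
# and refreshes the gensim representation.
elda_a.recluster()
print(len(elda_a.get_topics()))  # number of stable topics after merging
```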
+    def test_add_and_recluster(self):
+        # see if the ensemble still makes sense after adding models
+        num_new_models = 3
+        num_new_topics = 3
+
+        # train
+        new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
+                               num_topics=num_new_topics, passes=10, num_models=num_new_models,
+                               iterations=30, random_state=1, topic_model_class='ldamulticore',
+                               distance_workers=4)
+        new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
+                                  num_topics=num_new_topics, passes=10, num_models=num_new_models,
+                                  iterations=30, random_state=1, topic_model_class='ldamulticore',
+                                  distance_workers=4, memory_friendly_ttda=False)
+        # both should be similar
+        np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05)
+        np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05)
+        # and every next step applied to both should result in similar results
+
+        # 1. adding to ttda and tms
+        new_eLDA.add_model(self.eLDA)
+        new_eLDA_mu.add_model(self.eLDA_mu)
+        np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05)
+        self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics)
+        self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics)
+        self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models)
+        self.check_ttda(new_eLDA)
+        self.check_ttda(new_eLDA_mu)
+
+        # 2. distance matrix
+        new_eLDA._generate_asymmetric_distance_matrix()
+        new_eLDA_mu._generate_asymmetric_distance_matrix()
+        np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix,
+                                   new_eLDA_mu.asymmetric_distance_matrix)
+
+        # 3. CBDBSCAN results
+        new_eLDA._generate_topic_clusters()
+        new_eLDA_mu._generate_topic_clusters()
+        a = new_eLDA.cluster_model.results
+        b = new_eLDA_mu.cluster_model.results
+        self.assert_cluster_results_equal(a, b)
+
+        # 4. finally, the stable topics
+        new_eLDA._generate_stable_topics()
+        new_eLDA_mu._generate_stable_topics()
+        np.testing.assert_allclose(new_eLDA.get_topics(),
+                                   new_eLDA_mu.get_topics())
+
+        new_eLDA.generate_gensim_representation()
+        new_eLDA_mu.generate_gensim_representation()
+
+        # same random state, hence the topics should still be similar
+        np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05)
+
+    def test_inference(self):
+        # get the most likely token id from topic 0
+        max_id = np.argmax(self.eLDA.get_topics()[0, :])
+        self.assertGreater(self.eLDA.classic_model_representation.iterations, 0)
+        # topic 0 should be dominant in the inference.
+        # the difference between the probabilities should be significant and larger than 0.3
+        inferred = self.eLDA[[(max_id, 1)]]
+        self.assertGreater(inferred[0][1] - 0.3, inferred[1][1])
+
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
+    unittest.main()