diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb
new file mode 100644
index 0000000000..7380a87f71
--- /dev/null
+++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb
@@ -0,0 +1,183 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "####\n"
+ ]
+ }
+ ],
+ "source": [
+ "import logging\n",
+ "from gensim.models import EnsembleLda, LdaMulticore\n",
+ "from gensim.corpora import OpinosisCorpus\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "enable the ensemble logger to show what it is doing currently"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "elda_logger = logging.getLogger(EnsembleLda.__module__)\n",
+ "elda_logger.setLevel(logging.INFO)\n",
+ "elda_logger.addHandler(logging.StreamHandler())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def pretty_print_topics():\n",
+ " # note that the words are stemmed so they appear chopped off\n",
+ " for t in elda.print_topics(num_words=7):\n",
+ " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Experiments on the Opinosis Dataset\n",
+ "\n",
+ "Opinosis [1] is a small (but redundant) corpus that contains 289 product reviews for 51 products. Since it's so small, the results are rather unstable.\n",
+ "\n",
+ "[1] Kavita Ganesan, ChengXiang Zhai, and Jiawei Han, _Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions [online],_ Proceedings of the 23rd International Conference on Computational Linguistics, Association for Computational Linguistics, 2010, pp. 340–348. Available from: https://kavita-ganesan.com/opinosis/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preparing the corpus\n",
+ "\n",
+ "First, download the opinosis dataset. On linux it can be done like this for example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir ~/opinosis\n",
+ "!wget -P ~/opinosis https://github.com/kavgan/opinosis/raw/master/OpinosisDataset1.0_0.zip\n",
+ "!unzip ~/opinosis/OpinosisDataset1.0_0.zip -d ~/opinosis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path = os.path.expanduser('~/opinosis/')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Corpus and id2word mapping can be created using the load_opinosis_data function provided in the package.\n",
+ "It preprocesses the data using the PorterStemmer and stopwords from the nltk package.\n",
+ "\n",
+ "The parameter of the function is the relative path to the folder, into which the zip file was extracted before. That folder contains a 'summaries-gold' subfolder."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "opinosis = OpinosisCorpus(path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**parameters**\n",
+ "\n",
+ "**topic_model_kind** ldamulticore is highly recommended for EnsembleLda. ensemble_workers and **distance_workers** are used to improve the time needed to train the models, as well as the **masking_method** 'rank'. ldamulticore is not able to fully utilize all cores on this small corpus, so **ensemble_workers** can be set to 3 to get 95 - 100% cpu usage on my i5 3470.\n",
+ "\n",
+ "Since the corpus is so small, a high number of **num_models** is needed to extract stable topics. The Opinosis corpus contains 51 categories, however, some of them are quite similar. For example there are 3 categories about the batteries of portable products. There are also multiple categories about cars. So I chose 20 for num_topics, which is smaller than the number of categories."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n",
+ " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n",
+ " topic_model_class='ldamulticore', masking_method='rank')\n",
+ "pretty_print_topics()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The default for **min_samples** would be 64, half of the number of models and **eps** would be 0.1. You basically play around with them until you find a sweetspot that fits for your needs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "elda.recluster(min_samples=55, eps=0.14)\n",
+ "pretty_print_topics()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index 6218336b06..996a6079d9 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -22,6 +22,7 @@ Modules:
corpora/malletcorpus
corpora/mmcorpus
corpora/_mmreader
+ corpora/opinosiscorpus
corpora/sharded_corpus
corpora/svmlightcorpus
corpora/textcorpus
@@ -29,6 +30,7 @@ Modules:
corpora/wikicorpus
models/ldamodel
models/ldamulticore
+ models/ensemblelda
models/nmf
models/lsimodel
models/ldaseqmodel
diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst
index 5566611a8b..62dba3d50a 100644
--- a/docs/src/auto_examples/index.rst
+++ b/docs/src/auto_examples/index.rst
@@ -13,7 +13,7 @@ If you're thinking about contributing documentation, please see :ref:`sphx_glr_a
.. raw:: html
-
+
@@ -33,9 +33,9 @@ Understanding this functionality is vital for using gensim effectively.
.. only:: html
- .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png
+ .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png
- :ref:`sphx_glr_auto_examples_core_run_core_concepts.py`
+ :ref:`sphx_glr_auto_examples_core_run_core_concepts.py`
.. raw:: html
@@ -53,9 +53,9 @@ Understanding this functionality is vital for using gensim effectively.
.. only:: html
- .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png
+ .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png
- :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py`
+ :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py`
.. raw:: html
@@ -73,9 +73,9 @@ Understanding this functionality is vital for using gensim effectively.
.. only:: html
- .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png
+ .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png
- :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py`
+ :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py`
.. raw:: html
@@ -93,9 +93,9 @@ Understanding this functionality is vital for using gensim effectively.
.. only:: html
- .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png
+ .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png
- :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py`
+ :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py`
.. raw:: html
@@ -108,7 +108,7 @@ Understanding this functionality is vital for using gensim effectively.
/auto_examples/core/run_similarity_queries
.. raw:: html
-
+
@@ -127,9 +127,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`
.. raw:: html
@@ -147,9 +147,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py`
.. raw:: html
@@ -167,9 +167,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py`
.. raw:: html
@@ -187,9 +187,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py`
.. raw:: html
@@ -207,9 +207,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_lda.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_lda.py`
.. raw:: html
@@ -227,9 +227,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py`
.. raw:: html
@@ -247,9 +247,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py`
.. raw:: html
@@ -267,9 +267,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py`
.. raw:: html
@@ -287,9 +287,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png
- :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py`
.. raw:: html
@@ -302,7 +302,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
/auto_examples/tutorials/run_pivoted_doc_norm
.. raw:: html
-
+
@@ -321,9 +321,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
.. only:: html
- .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png
+ .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png
- :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py`
+ :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py`
.. raw:: html
@@ -341,9 +341,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
.. only:: html
- .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png
+ .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png
- :ref:`sphx_glr_auto_examples_howtos_run_doc.py`
+ :ref:`sphx_glr_auto_examples_howtos_run_doc.py`
.. raw:: html
@@ -361,9 +361,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
.. only:: html
- .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png
+ .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png
- :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py`
+ :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py`
.. raw:: html
@@ -381,9 +381,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
.. only:: html
- .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png
+ .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png
- :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py`
+ :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py`
.. raw:: html
@@ -396,7 +396,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
/auto_examples/howtos/run_compare_lda
.. raw:: html
-
+
@@ -440,7 +440,7 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from
.. raw:: html
-
+
@@ -452,13 +452,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from
.. container:: sphx-glr-download
- :download:`Download all examples in Python source code: auto_examples_python.zip </home/misha/git/gensim/docs/src/auto_examples/auto_examples_python.zip>`
+ :download:`Download all examples in Python source code: auto_examples_python.zip </Users/a.loosley/Alex/Repos/gensim/docs/src/auto_examples/auto_examples_python.zip>`
.. container:: sphx-glr-download
- :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip </home/misha/git/gensim/docs/src/auto_examples/auto_examples_jupyter.zip>`
+ :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip </Users/a.loosley/Alex/Repos/gensim/docs/src/auto_examples/auto_examples_jupyter.zip>`
.. only:: html
diff --git a/docs/src/corpora/opinosiscorpus.rst b/docs/src/corpora/opinosiscorpus.rst
new file mode 100644
index 0000000000..3f62454677
--- /dev/null
+++ b/docs/src/corpora/opinosiscorpus.rst
@@ -0,0 +1,9 @@
+:mod:`corpora.opinosiscorpus` -- Topic-related review sentences
+===============================================================
+
+.. automodule:: gensim.corpora.opinosiscorpus
+ :synopsis: Topic-related review sentences
+ :members:
+ :inherited-members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/src/models/ensemblelda.rst b/docs/src/models/ensemblelda.rst
new file mode 100644
index 0000000000..42e63c94be
--- /dev/null
+++ b/docs/src/models/ensemblelda.rst
@@ -0,0 +1,9 @@
+:mod:`models.ensemblelda` -- Ensemble Latent Dirichlet Allocation
+=================================================================
+
+.. automodule:: gensim.models.ensemblelda
+ :synopsis: Ensemble Latent Dirichlet Allocation
+ :members:
+ :inherited-members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py
index 0d51a9b903..7b7044e0b9 100644
--- a/gensim/corpora/__init__.py
+++ b/gensim/corpora/__init__.py
@@ -15,3 +15,4 @@
from .textcorpus import TextCorpus, TextDirectoryCorpus # noqa:F401
from .ucicorpus import UciCorpus # noqa:F401
from .malletcorpus import MalletCorpus # noqa:F401
+from .opinosiscorpus import OpinosisCorpus # noqa:F401
diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py
new file mode 100644
index 0000000000..1a68ec670c
--- /dev/null
+++ b/gensim/corpora/opinosiscorpus.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Author: Tobias B
+
+
+"""Creates a corpus and dictionary from the Opinosis dataset.
+
+References
+----------
+.. [1] Ganesan, Kavita and Zhai, ChengXiang and Han, Jiawei. Opinosis: a graph-based approach to abstractive
+ summarization of highly redundant opinions [online]. In : Proceedings of the 23rd International Conference on
+ Computational Linguistics. 2010. p. 340-348. Available from: https://kavita-ganesan.com/opinosis/
+"""
+
+import os
+import re
+from gensim.corpora import Dictionary
+from gensim.parsing.porter import PorterStemmer
+from gensim.parsing.preprocessing import STOPWORDS
+
+
+class OpinosisCorpus:
+ """Creates a corpus and dictionary from the Opinosis dataset.
+
+ http://kavita-ganesan.com/opinosis-opinion-dataset/
+
+ This data is organized in folders, each folder containing a few short docs.
+
+ Data can be obtained quickly using the following commands in bash:
+
+ mkdir opinosis && cd opinosis
+ wget https://github.com/kavgan/opinosis/raw/master/OpinosisDataset1.0_0.zip
+ unzip OpinosisDataset1.0_0.zip
+
+ The corpus and dictionary can be accessed through the ``.corpus`` and ``.id2word`` members.
+
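+ A short usage sketch (assuming the zip file was extracted into an 'opinosis' folder inside the
+ current working directory):
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.corpora import OpinosisCorpus
+ >>> from gensim.models import LdaModel
+ >>> opinosis = OpinosisCorpus('opinosis')
+ >>> lda = LdaModel(corpus=opinosis.corpus, id2word=opinosis.id2word, num_topics=20)
+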
+ """
+
+ def __init__(self, path):
+ """Load the downloaded corpus.
+
+ Parameters
+ ----------
+ path : string
+ Path to the extracted zip file. If 'summaries-gold' is in a folder
+ called 'opinosis', then the path parameter would be 'opinosis',
+ either relative to your current working directory or absolute.
+ """
+ path = os.path.join(path, "summaries-gold")
+ dictionary = Dictionary()
+ corpus = []
+ stemmer = PorterStemmer()
+
+ for directory, _, filenames in os.walk(path):
+ # each subdirectory of path is one collection of reviews to a specific product
+ # now get the corpus/documents
+ for filename in filenames:
+ filepath = os.path.join(directory, filename)
+ # read the document and preprocess it
+ with open(filepath) as file:
+ doc = file.read()
+
+ preprocessed_doc = [
+ stemmer.stem(token) for token in re.findall(r'\w+', doc.lower())
+ if token not in STOPWORDS
+ ]
+
+ dictionary.add_documents([preprocessed_doc])
+ corpus += [dictionary.doc2bow(preprocessed_doc)]
+
+ # and return the results the same way the other corpus generating functions do
+ self.corpus = corpus
+ self.id2word = dictionary
diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py
index a0ee690550..507ec887b9 100644
--- a/gensim/models/__init__.py
+++ b/gensim/models/__init__.py
@@ -21,6 +21,7 @@
from .ldaseqmodel import LdaSeqModel # noqa:F401
from .fasttext import FastText # noqa:F401
from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401
+from .ensemblelda import EnsembleLda # noqa:F401
from . import wrappers # noqa:F401
from . import deprecated # noqa:F401
diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
new file mode 100644
index 0000000000..e793a9edcc
--- /dev/null
+++ b/gensim/models/ensemblelda.py
@@ -0,0 +1,1289 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Authors: Tobias Brigl , Alex Salles ,
+# Alex Loosley , Data Reply Munich
+#
+
+
+"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
+
+Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
+the user does not need to know the exact number of topics the topic model should extract ahead of time.
+
+For more details read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/).
+
+Usage examples
+--------------
+
+Train an ensemble of LdaModels using a Gensim corpus
+
+.. sourcecode:: pycon
+
+ >>> from gensim.test.utils import common_texts
+ >>> from gensim.corpora.dictionary import Dictionary
+ >>> from gensim.models import EnsembleLda
+ >>>
+ >>> # Create a corpus from a list of texts
+ >>> common_dictionary = Dictionary(common_texts)
+ >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
+ >>>
+ >>> # Train the model on the corpus. corpus has to be provided as a
+ >>> # keyword argument, as all arguments are passed through to the children.
+ >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=10, num_models=4)
+
+Save a model to disk, or reload a pre-trained model
+
+.. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> # Save model to disk.
+ >>> temp_file = datapath("model")
+ >>> elda.save(temp_file)
+ >>>
+ >>> # Load a potentially pretrained model from disk.
+ >>> elda = EnsembleLda.load(temp_file)
+
+Query the model using new, unseen documents
+
+.. sourcecode:: pycon
+
+ >>> # Create a new corpus, made of previously unseen documents.
+ >>> other_texts = [
+ ... ['computer', 'time', 'graph'],
+ ... ['survey', 'response', 'eps'],
+ ... ['human', 'system', 'computer']
+ ... ]
+ >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
+ >>>
+ >>> unseen_doc = other_corpus[0]
+ >>> vector = elda[unseen_doc] # get topic probability distribution for a document
+
+Increase the ensemble size by adding a new model. Make sure it uses the same dictionary.
+
+.. sourcecode:: pycon
+
+ >>> from gensim.models import LdaModel
+ >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10))
+ >>> elda.recluster()
+ >>> vector = elda[unseen_doc]
+
+To optimize the ensemble for your specific case, the children can be clustered again using
+different hyperparameters
+
+.. sourcecode:: pycon
+
+ >>> elda.recluster(eps=0.2)
+
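+If the ensemble was trained with ``memory_friendly_ttda=False``, the stored child models can later be
+discarded to free memory, keeping only their topic term distributions (a short sketch, using the
+``convert_to_memory_friendly`` method defined below)
+
+.. sourcecode:: pycon
+
+ >>> elda.convert_to_memory_friendly()
+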
+References
+----------
+.. [1] REHUREK, Radim and SOJKA, Petr, 2010, Software framework for topic modelling with large corpora. In : THE LREC
+ 2010 WORKSHOP ON NEW CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. 2010. p. 45-50.
+ Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595
+
+.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis].
+ Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from:
+ https://www.hip70890b.de/machine_learning/ensemble_LDA/
+
+Citation
+--------
+At the moment, there is no paper associated to ensemble LDA but we are working on publicly releasing Tobi Brigl's
+bachelor thesis on the topic. In the meantime, please include a mention of us (Brigl, T.; Salles, A.; Loosley, A.) and
+a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if this work
+is presented, further developed, or used as the basis for a published result.
+
+Other Notes
+-----------
+The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and
+comments.
+
+"""
+import logging
+import os
+from multiprocessing import Process, Pipe, ProcessError
+import importlib
+
+import numpy as np
+from scipy.spatial.distance import cosine
+
+from gensim import utils
+from gensim.models import ldamodel, ldamulticore, basemodel
+from gensim.utils import SaveLoad
+
+logger = logging.getLogger(__name__)
+
+
+class EnsembleLda(SaveLoad):
+ """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
+
+ Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
+ the user does not need to know the exact number of topics the topic model should extract ahead of time [2].
+
+ """
+
+ def __init__(self, topic_model_class="ldamulticore", num_models=3,
+ min_cores=None, # default value from _generate_stable_topics()
+ epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True,
+ min_samples=None, masking_method="mass", masking_threshold=None,
+ distance_workers=1, random_state=None, **gensim_kw_args):
+ """Create and train a new EnsembleLda model.
+
+ Will start training immediately, except if iterations, passes or num_models is 0 or if the corpus is missing.
+
+ Parameters
+ ----------
+ topic_model_class : str, topic model, optional
+ Examples:
+ * 'ldamulticore' (default, recommended)
+ * 'lda'
+ ensemble_workers : int, optional
+ Spawns that many processes and distributes the models from the ensemble to those as evenly as possible.
+ num_models should be a multiple of ensemble_workers.
+
+ Setting it to 0 or 1 will both use the non-multiprocessing version. Default: 1
+ num_models : int, optional
+ How many LDA models to train in this ensemble.
+ Default: 3
+ min_cores : int, optional
+ Minimum cores a cluster of topics has to contain so that it is recognized as stable topic.
+ epsilon : float, optional
+ Defaults to 0.1. Epsilon for the CBDBSCAN clustering that generates the stable topics.
+ memory_friendly_ttda : boolean, optional
+ If True, the models in the ensemble are deleted after training and only a concatenation of each model's
+ topic term distribution (called ttda) is kept to save memory.
+
+ Defaults to True. When False, trained models are stored in a list in self.tms, and no models that are not
+ of a gensim model type can be added to this ensemble using the add_model function.
+
+ If True, any topic term matrix can be supplied to add_model.
+ min_samples : int, optional
+ Required number of nearby topics for a topic to be considered as a 'core' in the CBDBSCAN clustering.
+ masking_method : str, optional
+ Choose one of "mass" (default) or "rank" (percentile, faster).
+
+ For clustering, distances between topic-term distributions are asymmetric. In particular, the distance
+ (technically a divergence) from distribution A to B is more of a measure of if A is contained in B. At a
+ high level, this involves using distribution A to mask distribution B and then calculating the cosine
+ distance between the two. The masking can be done in two ways:
+
+ 1. mass: forms mask by taking the top ranked terms until their cumulative mass reaches the
+ 'masking_threshold'
+
+ 2. rank: forms mask by taking a fixed fraction of the top ranked terms (by mass), given by the
+ 'masking_threshold'. For example, a ranking threshold of 0.11 means the top 11% of terms by weight are
+ used to form a mask.
+
+ masking_threshold : float, optional
+ Default: None, which uses ``0.95`` for "mass", and ``0.11`` for masking_method "rank". In general, too
+ small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy
+ distance calculations. Defaults are often a good sweet spot for this hyperparameter.
+ distance_workers : int, optional
+ Number of workers used for the distance matrix computation. Default is 1, which is not multiprocessed.
+ Set to ``> 1`` to enable multiprocessing, or to ``None`` to use ``os.cpu_count()`` for maximum
+ performance.
+ **gensim_kw_args
+ Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble.
+
+ """
+ # INTERNAL PARAMETERS
+ # Set random state
+ # numpy's max random state of 2**32 - 1 is too large for windows:
+ self._MAX_RANDOM_STATE = np.iinfo(np.int32).max
+
+ # _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping
+ # distance calculations for highly masked topic-term distributions
+ self._COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05
+
+ if "id2word" not in gensim_kw_args:
+ gensim_kw_args["id2word"] = None
+ if "corpus" not in gensim_kw_args:
+ gensim_kw_args["corpus"] = None
+
+ if gensim_kw_args["id2word"] is None and not gensim_kw_args["corpus"] is None:
+ logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
+ gensim_kw_args["id2word"] = utils.dict_from_corpus(gensim_kw_args["corpus"])
+ if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is None:
+ raise ValueError(
+ "at least one of corpus/id2word must be specified, to establish "
+ "input space dimensionality. Corpus should be provided using the "
+ "`corpus` keyword argument.")
+
+ if isinstance(topic_model_class, type) and issubclass(topic_model_class, ldamodel.LdaModel):
+ self.topic_model_class = topic_model_class
+ else:
+ kinds = {
+ "lda": ldamodel.LdaModel,
+ "ldamulticore": ldamulticore.LdaMulticore
+ }
+ if topic_model_class not in kinds:
+ raise ValueError(
+ "topic_model_class should be one of 'lda', 'ldamulticode' or a model inheriting from LdaModel")
+ self.topic_model_class = kinds[topic_model_class]
+
+ self.num_models = num_models
+ self.gensim_kw_args = gensim_kw_args
+
+ self.memory_friendly_ttda = memory_friendly_ttda
+
+ self.distance_workers = distance_workers
+ self.masking_threshold = masking_threshold
+ self.masking_method = masking_method
+
+ # this provides the gensim api to the ensemble
+ self.classic_model_representation = None
+
+ # the ensemble's state
+ self.random_state = utils.get_random_state(random_state)
+ self.sstats_sum = 0
+ self.eta = None
+ self.tms = []
+ # initialize empty topic term distribution array
+ self.ttda = np.empty((0, len(gensim_kw_args["id2word"])))
+ self.asymmetric_distance_matrix_outdated = True
+
+ # in case the model will not train due to some
+ # parameters, stop here and don't train.
+ if num_models <= 0:
+ return
+ if gensim_kw_args.get("corpus") is None:
+ return
+ if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0:
+ return
+ if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0:
+ return
+
+ logger.info("generating {} topic models...".format(num_models))
+
+ if ensemble_workers > 1:
+ self._generate_topic_models_multiproc(num_models, ensemble_workers)
+ else:
+ # singlecore
+ self._generate_topic_models(num_models)
+
+ self._generate_asymmetric_distance_matrix()
+ self._generate_topic_clusters(epsilon, min_samples)
+ self._generate_stable_topics(min_cores)
+
+ # create model that can provide the usual gensim api to the stable topics from the ensemble
+ self.generate_gensim_representation()
+
+ def get_topic_model_class(self):
+ """Get the class that is used for :meth:`gensim.models.EnsembleLda.generate_gensim_representation`."""
+ if self.topic_model_class is None:
+ instruction = 'Try setting topic_model_class manually to what the individual models were based on, ' \
+ 'e.g. LdaMulticore.'
+ try:
+ module = importlib.import_module(self.topic_model_module_string)
+ self.topic_model_class = getattr(module, self.topic_model_class_string)
+ del self.topic_model_module_string
+ del self.topic_model_class_string
+ except ModuleNotFoundError:
+ logger.error(
+ 'Could not import the "{}" module in order to provide the "{}" class as '
+ '"topic_model_class" attribute. {}'
+ .format(self.topic_model_class_string, self.topic_model_class_string, instruction))
+ except AttributeError:
+ logger.error(
+ 'Could not import the "{}" class from the "{}" module in order to set the '
+ '"topic_model_class" attribute. {}'
+ .format(self.topic_model_class_string, self.topic_model_module_string, instruction))
+ return self.topic_model_class
+
+ def save(self, *args, **kwargs):
+ """See :meth:`gensim.utils.SaveLoad.save`."""
+ if self.get_topic_model_class() is not None:
+ self.topic_model_module_string = self.topic_model_class.__module__
+ self.topic_model_class_string = self.topic_model_class.__name__
+ kwargs['ignore'] = frozenset(kwargs.get('ignore', ())).union(('topic_model_class', ))
+ super(EnsembleLda, self).save(*args, **kwargs)
+
+ save.__doc__ = SaveLoad.save.__doc__
+
+ def convert_to_memory_friendly(self):
+ """Remove the stored gensim models and only keep their ttdas."""
+ self.tms = []
+ self.memory_friendly_ttda = True
+
+ def generate_gensim_representation(self):
+ """Create a gensim model from the stable topics.
+
+ The returned representation is a Gensim LdaModel (:py:class:`gensim.models.LdaModel`) that has been
+ instantiated with an a-priori belief on word probability, eta, which represents the topic-term distributions
+ of any stable topics that were found by clustering over the ensemble of topic distributions.
+
+ When no stable topics have been detected, None is returned.
+
+ Returns
+ -------
+ :py:class:`gensim.models.LdaModel`
+ A Gensim LDA Model classic_model_representation for which:
+ ``classic_model_representation.get_topics() == self.get_topics()``
+
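+ A short sketch of using the returned representation like any other gensim LDA model (``elda`` and
+ ``unseen_doc`` as in the module-level usage examples):
+
+ .. sourcecode:: pycon
+
+ >>> classic_model = elda.generate_gensim_representation()
+ >>> vector = classic_model[unseen_doc]
+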
+ """
+ logger.info("generating classic gensim model representation based on results from the ensemble")
+
+ sstats_sum = self.sstats_sum
+ # if sstats_sum (which is actually the total number of words in the corpus) is missing
+ # for some reason, recreate it from the corpus (takes a while):
+ if sstats_sum == 0 and "corpus" in self.gensim_kw_args and self.gensim_kw_args["corpus"] is not None:
+ for document in self.gensim_kw_args["corpus"]:
+ for token in document:
+ sstats_sum += token[1]
+ self.sstats_sum = sstats_sum
+
+ stable_topics = self.get_topics()
+
+ num_stable_topics = len(stable_topics)
+
+ if num_stable_topics == 0:
+ logger.error("the model did not detect any stable topic. You can try to adjust epsilon: "
+ "recluster(eps=...)")
+ self.classic_model_representation = None
+ return
+
+ # create a new gensim model
+ params = self.gensim_kw_args.copy()
+ params["eta"] = self.eta
+ params["num_topics"] = num_stable_topics
+ # adjust params in a way that no training happens
+ params["passes"] = 0 # no training
+ # iterations is needed for inference, pass it to the model
+
+ classic_model_representation = self.get_topic_model_class()(**params)
+
+ # when eta was None, use what gensim generates as default eta for the following tasks:
+ eta = classic_model_representation.eta
+ if sstats_sum == 0:
+ sstats_sum = classic_model_representation.state.sstats.sum()
+ self.sstats_sum = sstats_sum
+
+ # the following is important for the denormalization
+ # to generate the proper sstats for the new gensim model:
+ # transform to dimensionality of stable_topics. axis=1 is summed
+ eta_sum = 0
+ if isinstance(eta, (int, float)):
+ eta_sum = [eta * len(stable_topics[0])] * num_stable_topics
+ else:
+ if len(eta.shape) == 1: # [e1, e2, e3]
+ eta_sum = [[eta.sum()]] * num_stable_topics
+ elif len(eta.shape) > 1: # [[e11, e12, ...], [e21, e22, ...], ...]
+ eta_sum = np.array(eta.sum(axis=1)[:, None])
+
+ # the normalization factor that will be applied when get_topics() is called never changes,
+ # because the sum of eta as well as the sum of sstats is constant (corpus is a mapping of
+ # id to occurrences). Therefore the normalization_factor can easily be predicted, and the
+ # sstats can be calculated such that get_topics() will return the stable topics no matter
+ # what eta is.
+
+ normalization_factor = np.array([[sstats_sum / num_stable_topics]] * num_stable_topics) + eta_sum
+
+ sstats = stable_topics * normalization_factor
+ sstats -= eta
+
+ classic_model_representation.state.sstats = sstats.astype(np.float32)
+ # fix expElogbeta.
+ classic_model_representation.sync_state()
+
+ self.classic_model_representation = classic_model_representation
+
+ return classic_model_representation
+
+ def add_model(self, target, num_new_models=None):
+ """Add the topic term distribution array (ttda) of another model to the ensemble.
+
+ This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use
+ the exact same dictionary/id2word mapping.
+
+ In order to generate new stable topics afterwards, use:
+ 1. ``self._generate_asymmetric_distance_matrix()``
+ 2. ``self.``:meth:`~gensim.models.ensemblelda.EnsembleLda.recluster`
+
+ The ttda of another ensemble can also be used. In that case, set ``num_new_models`` to the ``num_models``
+ parameter of that ensemble, i.e. the number of classic models in the ensemble that generated the ttda.
+ This is important, because that information is used to estimate "min_samples" for _generate_topic_clusters.
+
+ If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other
+ models, you can get it from: ``self.id2word``.
+
+ Parameters
+ ----------
+ target : {see description}
+ 1. A single EnsembleLda object
+ 2. List of EnsembleLda objects
+ 3. A single Gensim topic model (e.g. (:py:class:`gensim.models.LdaModel`)
+ 4. List of Gensim topic models
+
+ if memory_friendly_ttda is True, target can also be:
+ 5. topic-term-distribution-array
+
+ example: [[0.1, 0.1, 0.8], [...], ...]
+
+ [topic1, topic2, ...]
+ with topic being an array of probabilities:
+ [token1, token2, ...]
+
+ token probabilities in a single topic sum to one, therefore the sum over the whole ttda is len(ttda)
+
+ num_new_models : integer, optional
+ the model keeps track of how many models were used in this ensemble. Set higher if ttda contained topics
+ from more than one model. Default: None, which takes care of it automatically.
+
+ If target is a 2D-array of float values, it assumes 1.
+
+ If the ensemble has ``memory_friendly_ttda`` set to False, then it will always use the number of models in
+ the target parameter.
+
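+ A short sketch, combining two separately trained ensembles that share the same id2word mapping
+ (``elda`` and the hypothetical ``other_elda`` are both EnsembleLda instances):
+
+ .. sourcecode:: pycon
+
+ >>> elda.add_model(other_elda)
+ >>> elda.recluster()
+ >>> stable_topics = elda.get_topics()
+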
+ """
+ # If the model has never seen a ttda before, initialize.
+ # If it has, append.
+
+ # Be flexible. Can be a single element or a list of elements
+ # make sure it's a numpy array
+ if not isinstance(target, (np.ndarray, list)):
+ target = np.array([target])
+ else:
+ target = np.array(target)
+ assert len(target) > 0
+
+ if self.memory_friendly_ttda:
+ # for memory friendly models/ttdas, append the ttdas to itself
+
+ detected_num_models = 0
+
+ ttda = []
+
+ # 1. ttda array, because that's the only accepted input that contains numbers
+ if isinstance(target.dtype.type(), (np.number, float)):
+ ttda = target
+ detected_num_models = 1
+
+ # 2. list of ensemblelda objects
+ elif isinstance(target[0], type(self)):
+ ttda = np.concatenate([ensemble.ttda for ensemble in target], axis=0)
+ detected_num_models = sum([ensemble.num_models for ensemble in target])
+
+ # 3. list of gensim models
+ elif isinstance(target[0], basemodel.BaseTopicModel):
+ ttda = np.concatenate([model.get_topics() for model in target], axis=0)
+ detected_num_models = len(target)
+
+ # unknown
+ else:
+ raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0])))
+
+ # new models were added, increase num_models
+ # if the user didn't provide a custom number to use
+ if num_new_models is None:
+ self.num_models += detected_num_models
+ else:
+ self.num_models += num_new_models
+
+ else: # memory unfriendly ensembles
+
+ ttda = []
+
+ # 1. ttda array
+ if isinstance(target.dtype.type(), (np.number, float)):
+ raise ValueError(
+ 'ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, '
+ 'you can call convert_to_memory_friendly, but it will discard the stored gensim '
+ 'models and only keep the relevant topic term distributions from them.')
+
+ # 2. list of ensembles
+ elif isinstance(target[0], type(self)):
+ for ensemble in target:
+ self.tms += ensemble.tms
+ ttda = np.concatenate([ensemble.ttda for ensemble in target], axis=0)
+
+ # 3. list of gensim models
+ elif isinstance(target[0], basemodel.BaseTopicModel):
+ self.tms += target.tolist()
+ ttda = np.concatenate([model.get_topics() for model in target], axis=0)
+
+ # unknown
+ else:
+ raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0])))
+
+ # in this case, len(self.tms) should
+ # always match self.num_models
+ if num_new_models is not None and num_new_models + self.num_models != len(self.tms):
+ logger.info(
+ 'num_new_models will be ignored. num_models should match the number of '
+ 'stored models for a memory unfriendly ensemble')
+ self.num_models = len(self.tms)
+
+ logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))
+
+ if self.ttda.shape[1] != ttda.shape[1]:
+ raise ValueError("target ttda dimensions do not match. Topics must be {} but was {} elements large"
+ .format(self.ttda.shape[-1], ttda.shape[-1]))
+ self.ttda = np.append(self.ttda, ttda, axis=0)
+
+ # tell recluster that the distance matrix needs to be regenerated
+ self.asymmetric_distance_matrix_outdated = True
+
+ def _teardown(self, pipes, processes, i):
+ """close pipes and terminate processes
+
+ Parameters
+ ----------
+ pipes : {list of :class:`multiprocessing.Pipe`}
+ list of pipes that the processes use to communicate with the parent
+ processes : {list of :class:`multiprocessing.Process`}
+ list of worker processes
+ i : int
+ index of the process that could not be started
+
+ """
+ logger.error("could not start process {}".format(i))
+
+ for pipe in pipes:
+ pipe[1].close()
+ pipe[0].close()
+
+ for process in processes:
+ if process.is_alive():
+ process.terminate()
+ del process
+
+ def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
+ """Generate the topic models to form the ensemble in a multiprocessed way.
+
+ Depending on the used topic model this can result in a speedup.
+
+ Parameters
+ ----------
+ num_models : int
+ how many models to train in the ensemble
+ ensemble_workers : int
+ into how many processes to split the models. Will be capped at min(ensemble_workers, num_models), to
+ avoid workers that are supposed to train 0 models.
+
+ to get maximum performance, set to the number of your cores, if non-parallelized models are being used in
+ the ensemble (LdaModel).
+
+ For LdaMulticore, the performance gain is small; it becomes larger for significantly smaller corpora.
+ In that case, ensemble_workers=2 can be used.
+
+ """
+ # random_states has to be handled in a way that neither causes different results depending on whether
+ # multiprocessing is used, nor gives every lda child the same seed. This is solved by generating a list of
+ # seeds before multiprocessing is started.
+ random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)]
+
+ # each worker has to work on at least one model.
+ # Don't spawn idle workers:
+ workers = min(ensemble_workers, num_models)
+
+ # create worker processes:
+ # from what I know this is basically forking with a jump to a target function in each child
+ # so modifying the ensemble object will not modify the one in the parent because of no shared memory
+ processes = []
+ pipes = []
+ num_models_unhandled = num_models # how many more models need to be trained by workers?
+
+ for i in range(workers):
+ parent_conn, child_conn = Pipe()
+ num_subprocess_models = 0
+ if i == workers - 1: # i is an index, hence -1
+ # is this the last worker that needs to be created?
+ # then task that worker with all the remaining models
+ num_subprocess_models = num_models_unhandled
+ else:
+ num_subprocess_models = int(num_models_unhandled / (workers - i))
+
+ # get the chunk from the random states that is meant to be for those models
+ random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models]
+
+ try:
+ process = Process(
+ target=self._generate_topic_models,
+ args=(num_subprocess_models, random_states_for_worker, child_conn))
+
+ processes.append(process)
+ pipes.append((parent_conn, child_conn))
+ process.start()
+
+ num_models_unhandled -= num_subprocess_models
+
+ except ProcessError:
+ self._teardown(pipes, processes, i)
+ raise
+
+ # aggregate results
+ # will also block until workers are finished
+ for pipe in pipes:
+ answer = pipe[0].recv() # [0], because that is the parentConn
+ pipe[0].close()
+ # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
+ if not self.memory_friendly_ttda:
+ self.tms += answer
+ ttda = np.concatenate([model.get_topics() for model in answer])
+ else:
+ ttda = answer
+ self.ttda = np.concatenate([self.ttda, ttda])
+
+ # end all processes
+ for process in processes:
+ process.terminate()
+
+ def _generate_topic_models(self, num_models, random_states=None, pipe=None):
+ """Train the topic models that form the ensemble.
+
+ Parameters
+ ----------
+ num_models : int
+ number of models to be generated
+ random_states : list
+ list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensemble's
+ RandomState if None (default).
+ pipe : multiprocessing.pipe
+ Default None. If provided, will send the trained models over this pipe. If memory friendly, it will only
+ send the ttda.
+
+ """
+ if pipe is not None:
+ logger.info("spawned worker to generate {} topic models".format(num_models))
+
+ if random_states is None:
+ random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)]
+
+ assert len(random_states) == num_models
+
+ kwargs = self.gensim_kw_args.copy()
+
+ tm = None # remember one of the topic models from the following
+ # loop, in order to collect some properties from it afterwards.
+
+ for i in range(num_models):
+
+ kwargs["random_state"] = random_states[i]
+
+ tm = self.get_topic_model_class()(**kwargs)
+
+ # adds the lambda (that is the unnormalized get_topics) to ttda, which is
+ # a list of all those lambdas
+ self.ttda = np.concatenate([self.ttda, tm.get_topics()])
+
+ # only saves the model if it is not "memory friendly"
+ if not self.memory_friendly_ttda:
+ self.tms += [tm]
+
+ # use one of the tms to get some info that will be needed later
+ self.sstats_sum = tm.state.sstats.sum()
+ self.eta = tm.eta
+
+ if pipe is not None:
+ # send the ttda that is in the child/workers version of the memory into the pipe
+ # available, after _generate_topic_models has been called in the worker
+ if self.memory_friendly_ttda:
+ # remember that this code is inside the worker processes memory,
+ # so self.ttda is the ttda of only a chunk of models
+ pipe.send(self.ttda)
+ else:
+ pipe.send(self.tms)
+
+ pipe.close()
+
+ def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method):
+ """Worker that computes the distance to all other nodes from a chunk of nodes."""
+ logger.info("spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas))
+ # the chunk of ttda that's going to be calculated:
+ ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas]
+ distance_chunk = self._calculate_asymmetric_distance_matrix_chunk(
+ ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method)
+ pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory
+ pipe.close()
+
+ def _generate_asymmetric_distance_matrix(self):
+ """Calculate the pairwise distance matrix for all the ttdas from the ensemble.
+
+ Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering.
+
+ Afterwards, the model needs to be reclustered for this generated matrix to take effect.
+
+ """
+ workers = self.distance_workers
+ threshold = self.masking_threshold
+ method = self.masking_method
+
+ # matrix is up to date afterwards
+ self.asymmetric_distance_matrix_outdated = False
+
+ if threshold is None:
+ threshold = {"mass": 0.95, "rank": 0.11}[method]
+
+ logger.info("generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda)))
+
+ # singlecore
+ if workers is not None and workers <= 1:
+ self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk(
+ ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method)
+ return self.asymmetric_distance_matrix
+
+ # else, if workers > 1 use multiprocessing
+ # best performance on 2-core machine: 2 workers
+ if workers is None:
+ workers = os.cpu_count()
+
+ # create worker processes:
+ processes = []
+ pipes = []
+ ttdas_sent = 0
+
+ for i in range(workers):
+ try:
+ parent_conn, child_conn = Pipe()
+
+ # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3.
+ n_ttdas = 0
+ if i == workers - 1: # i is an index, hence -1
+ # is this the last worker that needs to be created?
+ # then task that worker with all the remaining models
+ n_ttdas = len(self.ttda) - ttdas_sent
+ else:
+ n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i))
+
+ process = Process(target=self._asymmetric_distance_matrix_worker,
+ args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method))
+ ttdas_sent += n_ttdas
+
+ processes.append(process)
+ pipes.append((parent_conn, child_conn))
+ process.start()
+
+ except ProcessError:
+ self._teardown(pipes, processes, i)
+ raise
+
+ distances = []
+ # Note that the following loop maintains the order in which the ttdas are concatenated, which is very
+ # important. The ordering in ttda has to be the same as when using only one process.
+ for pipe in pipes:
+ answer = pipe[0].recv() # [0], because that is the parentConn
+ pipe[0].close() # child conn will be closed from inside the worker
+ # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
+ distances.append(answer[1])
+
+ # end all processes
+ for process in processes:
+ process.terminate()
+
+ self.asymmetric_distance_matrix = np.concatenate(distances)
+
+ return self.asymmetric_distance_matrix
+
+ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method):
+ """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.
+
+ Parameters
+ ----------
+ ttda1, ttda2 : 2D arrays of floats
+ Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+ topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
+ threshold : float, optional
+ threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method
+ start_index : int
+ this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+ complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
+ two pieces, each 100 ttdas long, then the start_index of the second piece should be 100. Default is 0.
+ method : {'mass', 'rank'}, optional
+ method can be "mass" for the original masking method or "rank" for a faster masking method that selects
+ by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the
+ topic.
+
+ Returns
+ -------
+ 2D numpy.ndarray of floats
+ Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
+
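+ A toy illustration of the "mass" masking with threshold 0.95, mirroring the mass_masking helper
+ defined below (a sketch; the helper itself is internal):
+
+ .. sourcecode:: pycon
+
+ >>> import numpy as np
+ >>> a = np.array([0.6, 0.3, 0.05, 0.05])
+ >>> sorted_a = np.sort(a)[::-1]
+ >>> a >= sorted_a[sorted_a.cumsum() < 0.95][-1] # keep terms until 95% of the mass is covered
+ array([ True,  True, False, False])
+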
+ """
+ # initialize the distance matrix. ndarray is faster than zeros
+ distances = np.ndarray((len(ttda1), len(ttda2)))
+
+ if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
+ # the worker might not have received a ttda because it was chunked up too much
+
+ if method not in ["mass", "rank"]:
+ raise ValueError("method {} unknown".format(method))
+
+ # select masking method:
+ def mass_masking(a):
+ """Original masking method. Returns a new binary mask."""
+ sorted_a = np.sort(a)[::-1]
+ largest_mass = sorted_a.cumsum() < threshold
+ smallest_valid = sorted_a[largest_mass][-1]
+ return a >= smallest_valid
+
+ def rank_masking(a):
+ """Faster masking method. Returns a new binary mask."""
+ return a > np.sort(a)[::-1][int(len(a) * threshold)]
+
+ create_mask = {"mass": mass_masking, "rank": rank_masking}[method]
+
+ # some help to find a better threshold by useful log messages
+ avg_mask_size = 0
+
+ # now iterate over each topic
+ for ttd1_idx, ttd1 in enumerate(ttda1):
+ # create mask from ttd1 that removes noise and keeps the largest terms
+ mask = create_mask(ttd1)
+ ttd1_masked = ttd1[mask]
+
+ avg_mask_size += mask.sum()
+
+ # now compare ttd1 against every topic in ttda2:
+ for ttd2_idx, ttd2 in enumerate(ttda2):
+ # distance to itself is 0
+ if ttd1_idx + start_index == ttd2_idx:
+ distances[ttd1_idx][ttd2_idx] = 0
+ continue
+
+ # now mask ttd2 based on ttd1's mask, which forces the shape of ttd1 onto ttd2
+ ttd2_masked = ttd2[mask]
+
+ # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
+ # distributions that will have distance values near 1.
+ if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD:
+ distance = 1
+ else:
+ distance = cosine(ttd1_masked, ttd2_masked)
+
+ distances[ttd1_idx][ttd2_idx] = distance
+
+ percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
+ logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent))
+
+ return distances
+
+ def _generate_topic_clusters(self, eps=0.1, min_samples=None):
+ """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices.
+
+ The final approval and generation of stable topics is done in ``_generate_stable_topics()``.
+
+ Parameters
+ ----------
+ eps : float
+ dbscan distance scale
+ min_samples : int, optional
+ defaults to ``int(self.num_models / 2)``, the DBSCAN min-neighbours threshold, which corresponds to
+ finding stable topics and should scale with the number of models, ``self.num_models``
+
+ """
+ if min_samples is None:
+ min_samples = int(self.num_models / 2)
+
+ logger.info("fitting the clustering model")
+
+ self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples)
+ self.cluster_model.fit(self.asymmetric_distance_matrix)
+
+ def _is_valid_core(self, topic):
+ """Check if the topic is a valid core.
+
+ Parameters
+ ----------
+ topic : {'is_core', 'valid_parents', 'label'}
+ topic to validate
+
+ """
+ return topic["is_core"] and (topic["valid_parents"] == {topic["label"]})
+
+ def _group_by_labels(self, results):
+ """Group all the learned cores by their label, which was assigned in the cluster_model.
+
+ Parameters
+ ----------
+ results : {list of {'is_core', 'neighboring_labels', 'label'}}
+ After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
+ member, which can be used as the argument to this function. It's a list of infos gathered during
+ the clustering step and each element in the list corresponds to a single topic.
+
+ Returns
+ -------
+ dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
+ A mapping of the label to a list of topics that belong to that particular label. Also adds
+ a new member to each topic called num_neighboring_labels, which is the number of
+ neighboring_labels of that topic.
+
+ """
+ grouped_by_labels = {}
+ for topic in results:
+ if topic["is_core"]:
+ topic = topic.copy()
+
+ # counts how many different labels a core has as parents
+ topic["num_neighboring_labels"] = len(topic["neighboring_labels"])
+
+ label = topic["label"]
+ if label not in grouped_by_labels:
+ grouped_by_labels[label] = []
+ grouped_by_labels[label].append(topic)
+ return grouped_by_labels
+
+ def _aggregate_topics(self, grouped_by_labels):
+ """Aggregate the labeled topics to a list of clusters.
+
+ Parameters
+ ----------
+ grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
+ The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the
+ label.
+
+ Returns
+ -------
+ list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'}
+ max_num_neighboring_labels is the maximum number of parent labels among the topics of a given
+ cluster. label refers to the label identifier of the cluster. neighboring_labels is a list of the
+ neighboring_labels sets of each topic. It's sorted by max_num_neighboring_labels in ascending
+ order. There is one element for each cluster.
+
+ """
+ sorted_clusters = []
+
+ for label, group in grouped_by_labels.items():
+ max_num_neighboring_labels = 0
+ neighboring_labels = [] # will be a list of sets
+
+ for topic in group:
+ max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels)
+ neighboring_labels.append(topic["neighboring_labels"])
+
+ neighboring_labels = [x for x in neighboring_labels if len(x) > 0]
+
+ sorted_clusters.append({
+ "max_num_neighboring_labels": max_num_neighboring_labels,
+ "neighboring_labels": neighboring_labels,
+ "label": label,
+ "num_cores": len([topic for topic in group if topic["is_core"]])
+ })
+
+ sorted_clusters = sorted(sorted_clusters,
+ key=lambda cluster: (
+ cluster["max_num_neighboring_labels"],
+ cluster["label"]
+ ), reverse=False)
+
+ return sorted_clusters
+
+ def _remove_from_all_sets(self, label, clusters):
+ """Remove a label from every set in "neighboring_labels" for each core in ``clusters``."""
+ for cluster in clusters:
+ for neighboring_labels_set in cluster["neighboring_labels"]:
+ if label in neighboring_labels_set:
+ neighboring_labels_set.remove(label)
+
+ def _contains_isolated_cores(self, label, cluster, min_cores):
+ """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster."""
+ return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores
+
+ def _generate_stable_topics(self, min_cores=None):
+ """Generate stable topics out of the clusters.
+
+ The function finds clusters of topics using a variant of DBScan. If a cluster has enough core topics
+ (c.f. parameter ``min_cores``), then this cluster represents a stable topic. The stable topic is specifically
+ calculated as the average over all topic-term distributions of the core topics in the cluster.
+
+        This function is the last step that has to be done in the ensemble. Once this step is complete,
+        stable topics can be retrieved using the :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics`
+ method.
+
+ Parameters
+ ----------
+ min_cores : int
+ Minimum number of core topics needed to form a cluster that represents a stable topic.
+            Using ``None`` defaults to ``min_cores = min(3, max(1, int(self.num_models / 4 + 1)))``
+
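+        Example
+        -------
+        A sketch of the default, assuming ``self.num_models`` is 8:
+
+            min_cores = min(3, max(1, int(8 / 4 + 1)))  # = 3
+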
+ """
+        # min_cores being 0 makes no sense: there has to be at least one core
+        # per cluster, or there is no cluster
+ if min_cores == 0:
+ min_cores = 1
+
+ if min_cores is None:
+ # min_cores is a number between 1 and 3, depending on the number of models
+ min_cores = min(3, max(1, int(self.num_models / 4 + 1)))
+ logger.info("generating stable topics, each cluster needs at least {} cores".format(min_cores))
+ else:
+ logger.info("generating stable topics")
+
+ results = self.cluster_model.results
+
+ grouped_by_labels = self._group_by_labels(results)
+
+ sorted_clusters = self._aggregate_topics(grouped_by_labels)
+
+ for cluster in sorted_clusters:
+ cluster["is_valid"] = None
+ if cluster["num_cores"] < min_cores:
+ cluster["is_valid"] = False
+ self._remove_from_all_sets(cluster["label"], sorted_clusters)
+
+ # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any
+ # other cluster.
+ for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]:
+ label = cluster["label"]
+ if self._contains_isolated_cores(label, cluster, min_cores):
+ cluster["is_valid"] = True
+ else:
+ cluster["is_valid"] = False
+            # _remove_from_all_sets modifies the neighboring_labels sets that _contains_isolated_cores
+            # inspects, so the result depends on the order in which the clusters are processed.
+ self._remove_from_all_sets(label, sorted_clusters)
+
+ # list of all the label numbers that are valid
+ valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]])
+
+ for topic in results:
+ topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels}
+
+ # keeping only VALID cores
+ valid_core_mask = np.vectorize(self._is_valid_core)(results)
+ valid_topics = self.ttda[valid_core_mask]
+ topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask]
+ unique_labels = np.unique(topic_labels)
+
+ num_stable_topics = len(unique_labels)
+ stable_topics = np.empty((num_stable_topics, len(self.id2word)))
+
+ # for each cluster
+        for label_index, label in enumerate(unique_labels):
+            # mean of all the topics that are of that cluster
+            topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label])
+            stable_topics[label_index] = topics_of_cluster.mean(axis=0)
+
+ self.sorted_clusters = sorted_clusters
+ self.stable_topics = stable_topics
+
+ def recluster(self, eps=0.1, min_samples=None, min_cores=None):
+ """Reapply CBDBSCAN clustering and stable topic generation.
+
+ Stable topics can be retrieved using :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics`.
+
+ Parameters
+ ----------
+ eps : float
+ epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
+ default: ``0.1``
+ min_samples : int
+ The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
+ default: ``int(self.num_models / 2)``
+        min_cores : int
+            Minimum number of cores a cluster has to have in order to be treated as a stable topic, i.e. how many
+            similar-looking topics have to be present so that their average can be used as a stable topic.
+            default: ``min(3, max(1, int(self.num_models / 4 + 1)))``
+
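+        Example
+        -------
+        A minimal usage sketch; ``elda`` and ``other_model`` are hypothetical stand-ins:
+
+            elda.add_model(other_model)  # marks the distance matrix as outdated
+            elda.recluster(eps=0.2, min_samples=2, min_cores=2)
+            stable_topics = elda.get_topics()
+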
+ """
+ # if new models were added to the ensemble, the distance matrix needs to be generated again
+ if self.asymmetric_distance_matrix_outdated:
+ logger.info("asymmetric distance matrix is outdated due to add_model")
+ self._generate_asymmetric_distance_matrix()
+
+ # Run CBDBSCAN to get topic clusters:
+ self._generate_topic_clusters(eps, min_samples)
+
+ # Interpret the results of CBDBSCAN to identify reliable topics:
+ self._generate_stable_topics(min_cores)
+
+ # Create gensim LdaModel representation of topic model with reliable topics (can be used for inference):
+ self.generate_gensim_representation()
+
+ # GENSIM API
+ # to make using the ensemble in place of a gensim model as easy as possible
+
+ def get_topics(self):
+ """Return only the stable topics from the ensemble.
+
+ Returns
+ -------
+        2D numpy.ndarray of floats
+            The stable topic-term distributions.
+
+ """
+ return self.stable_topics
+
+ def _has_gensim_representation(self):
+ """Check if stable topics and the internal gensim representation exist. Raise an error if not."""
+ if self.classic_model_representation is None:
+ if len(self.stable_topics) == 0:
+ raise ValueError("no stable topic was detected")
+ else:
+ raise ValueError("use generate_gensim_representation() first")
+
+ def __getitem__(self, i):
+ """See :meth:`gensim.models.LdaModel.__getitem__`."""
+ self._has_gensim_representation()
+ return self.classic_model_representation[i]
+
+ def inference(self, *posargs, **kwargs):
+ """See :meth:`gensim.models.LdaModel.inference`."""
+ self._has_gensim_representation()
+ return self.classic_model_representation.inference(*posargs, **kwargs)
+
+ def log_perplexity(self, *posargs, **kwargs):
+ """See :meth:`gensim.models.LdaModel.log_perplexity`."""
+ self._has_gensim_representation()
+ return self.classic_model_representation.log_perplexity(*posargs, **kwargs)
+
+ def print_topics(self, *posargs, **kwargs):
+ """See :meth:`gensim.models.LdaModel.print_topics`."""
+ self._has_gensim_representation()
+ return self.classic_model_representation.print_topics(*posargs, **kwargs)
+
+ @property
+ def id2word(self):
+ """Return the :py:class:`gensim.corpora.dictionary.Dictionary` object used in the model."""
+ return self.gensim_kw_args["id2word"]
+
+
+class CBDBSCAN:
+ """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN).
+
+ The algorithm works based on DBSCAN-like parameters 'eps' and 'min_samples' that respectively define how far a
+ "nearby" point is, and the minimum number of nearby points needed to label a candidate datapoint a core of a
+ cluster. (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).
+
+ The algorithm works as follows:
+
+ 1. (A)symmetric distance matrix provided at fit-time (called 'amatrix').
+       For the sake of the example below, assume there are only five topics (amatrix contains distances with dim 5x5),
+ T_1, T_2, T_3, T_4, T_5:
+ 2. Start by scanning a candidate topic with respect to a parent topic
+ (e.g. T_1 with respect to parent None)
+ 3. Check which topics are nearby the candidate topic using 'self.eps' as a threshold and call them neighbours
+ (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours)
+    4. If there are at least 'self.min_samples' neighbours, the candidate topic becomes a core candidate for a cluster
+ (e.g. if 'min_samples'=1, then T_1 becomes the first core of a cluster)
+ 5. If candidate is a core, CheckBack (CB) to find the fraction of neighbours that are either the parent or the
+ parent's neighbours. If this fraction is more than 75%, give the candidate the same label as its parent.
+ (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given)
+ 6. If candidate is a core, recursively scan the next nearby topic (e.g. scan T_3) labeling the previous topic as
+ the parent and the previous neighbours as the parent_neighbours - repeat steps 2-6:
+
+ 2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5)
+        3. (e.g. T_5 is the only neighbour)
+ 4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core)
+ 5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3.
+ Therefore the candidate T_3 does NOT get the same label as its parent T_1)
+ 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5)
+
+    The CB step enforces cluster compactness and allows the model to avoid creating clusters for unreliable topics
+    that are merely a composition of multiple reliable topics (something that often occurs in LDA models and is one
+    cause of unreliable topics).
+
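+    Example
+    -------
+    A minimal sketch on a made-up 3x3 asymmetric distance matrix of three mutually close topics:
+
+        >>> import numpy as np
+        >>> clusterer = CBDBSCAN(eps=0.1, min_samples=2)
+        >>> clusterer.fit(np.array([
+        ...     [0.00, 0.05, 0.05],
+        ...     [0.05, 0.00, 0.05],
+        ...     [0.05, 0.05, 0.00],
+        ... ]))
+        >>> [topic["label"] for topic in clusterer.results]
+        [0, 0, 0]
+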
+ """
+
+ def __init__(self, eps, min_samples):
+ """Create a new CBDBSCAN object. Call fit in order to train it on an asymmetric distance matrix.
+
+ Parameters
+ ----------
+ eps : float
+ epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
+ min_samples : int
+ The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
+
+ """
+ self.eps = eps
+ self.min_samples = min_samples
+
+ def fit(self, amatrix):
+ """Apply the algorithm to an asymmetric distance matrix."""
+ self.next_label = 0
+
+ topic_clustering_results = [{
+ "is_core": False,
+ "neighboring_labels": set(),
+ "neighboring_topic_indices": set(),
+ "label": None
+ } for _ in range(len(amatrix))]
+
+ amatrix_copy = amatrix.copy()
+
+ # to avoid the problem of comparing the topic with itself
+ np.fill_diagonal(amatrix_copy, 1)
+
+ min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))]
+ min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda distance: distance[0])
+ ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted]
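+        # e.g. (hypothetical): if amatrix_copy.min(axis=1) were [0.3, 0.1, 0.2],
+        # ordered_min_similarity would be [1, 2, 0]: the topic closest to any
+        # other topic is scanned first.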
+
+ def scan_topic(topic_index, current_label=None, parent_neighbors=None):
+ """Extend the cluster in one direction.
+
+ Results are accumulated to ``self.results``.
+
+ Parameters
+ ----------
+            topic_index : int
+                The topic that might be added to the existing cluster, or which might create a new cluster
+                if necessary.
+            current_label : int
+                The label of the cluster that might be suitable for ``topic_index``.
+            parent_neighbors : list of int
+                The topic indices that neighbor the parent topic, used for the checkback step.
+
+ """
+ neighbors_sorted = sorted(
+ [
+ (distance, index)
+ for index, distance in enumerate(amatrix_copy[topic_index])
+ ],
+ key=lambda x: x[0],
+ )
+ neighboring_topic_indices = [index for distance, index in neighbors_sorted if distance < self.eps]
+
+ num_neighboring_topics = len(neighboring_topic_indices)
+
+            # If the number of neighboring topics is large enough, this topic is considered a core.
+            # Neighbors that have already been identified as cores are counted as well.
+ if num_neighboring_topics >= self.min_samples:
+ # This topic is a core!
+ topic_clustering_results[topic_index]["is_core"] = True
+
+                # if current_label is None, then this is the first core
+ # of a new cluster (hence next_label is used)
+ if current_label is None:
+ # next_label is initialized with 0 in fit() for the first cluster
+ current_label = self.next_label
+ self.next_label += 1
+
+ else:
+                # In case the core has a parent, check the distance to the parent's neighbors (since the matrix
+                # is asymmetric, the return distances are taken into account here)
+ # If less than 25% of the elements are close enough, then create a new cluster rather than further
+ # growing the current cluster in that direction.
+ close_parent_neighbors_mask = amatrix_copy[topic_index][parent_neighbors] < self.eps
+
+ if close_parent_neighbors_mask.mean() < 0.25:
+ # start new cluster by changing current_label
+ current_label = self.next_label
+ self.next_label += 1
+
+ topic_clustering_results[topic_index]["label"] = current_label
+
+ for neighboring_topic_index in neighboring_topic_indices:
+ if topic_clustering_results[neighboring_topic_index]["label"] is None:
+ ordered_min_similarity.remove(neighboring_topic_index)
+ # try to extend the cluster into the direction of the neighbor
+ scan_topic(neighboring_topic_index, current_label, neighboring_topic_indices + [topic_index])
+
+ topic_clustering_results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index)
+ topic_clustering_results[neighboring_topic_index]["neighboring_labels"].add(current_label)
+
+ else:
+ # this topic is not a core!
+ if current_label is None:
+ topic_clustering_results[topic_index]["label"] = -1
+ else:
+ topic_clustering_results[topic_index]["label"] = current_label
+
+        # elements are removed from this list inside scan_topic; loop until it is empty
+ while len(ordered_min_similarity) != 0:
+ next_topic_index = ordered_min_similarity.pop(0)
+ scan_topic(next_topic_index)
+
+ self.results = topic_clustering_results
diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
new file mode 100644
index 0000000000..127d00888c
Binary files /dev/null and b/gensim/test/test_data/ensemblelda differ
diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
new file mode 100644
index 0000000000..6794cb71a0
--- /dev/null
+++ b/gensim/test/test_ensemblelda.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Author: Tobias B
+
+"""
+Automated tests for checking the EnsembleLda class.
+"""
+
+import logging
+import unittest
+
+import numpy as np
+from copy import deepcopy
+
+from gensim.models import EnsembleLda, LdaMulticore, LdaModel
+from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary
+
+num_topics = 2
+num_models = 4
+passes = 50
+
+
+class TestModel(unittest.TestCase):
+ def setUp(self):
+ # same configuration for each model to make sure
+ # the topics are equal
+ random_state = 0
+
+ self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+ passes=passes, num_models=num_models, random_state=random_state,
+ topic_model_class=LdaModel)
+
+ self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+ passes=passes, num_models=num_models, random_state=random_state,
+ memory_friendly_ttda=False, topic_model_class=LdaModel)
+
+ def check_ttda(self, ensemble):
+ """tests the integrity of the ttda of any ensemble"""
+ self.assertGreater(len(ensemble.ttda), 0)
+ a = ensemble.ttda.sum(axis=1)
+ b = np.ones(len(ensemble.ttda)).astype(np.float32)
+ np.testing.assert_allclose(a, b, rtol=1e-04)
+
+ def test_eLDA(self):
+        # given that the random_state doesn't change, there should
+        # always be 2 detected topics in this setup.
+ self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary))
+ self.assertEqual(len(self.eLDA.ttda), num_models * num_topics)
+ self.check_ttda(self.eLDA)
+
+ # reclustering shouldn't change anything without
+ # added models or different parameters
+ self.eLDA.recluster()
+
+ self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary))
+ self.assertEqual(len(self.eLDA.ttda), num_models * num_topics)
+ self.check_ttda(self.eLDA)
+
+ # compare with a pre-trained reference model
+ reference = EnsembleLda.load(datapath('ensemblelda'))
+ np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
+        # small values in the distance matrix tend to vary quite a bit around 2%,
+        # so use an absolute tolerance to check that the matrix is at least
+        # close to the target.
+ atol = reference.asymmetric_distance_matrix.max() * 1e-05
+ np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix,
+ reference.asymmetric_distance_matrix, atol=atol)
+
+ def test_clustering(self):
+        # the following test is quite specific to the current implementation and not part of any api,
+        # but it makes improving those sections of the code easier as long as sorted_clusters and the
+        # cluster_model results are supposed to stay the same. This test may be deprecated eventually.
+
+ reference = EnsembleLda.load(datapath('ensemblelda'))
+ cluster_model_results = deepcopy(reference.cluster_model.results)
+ sorted_clusters = deepcopy(reference.sorted_clusters)
+ stable_topics = deepcopy(reference.get_topics())
+
+ # continue training with the distance matrix of the pretrained reference and see if
+ # the generated clusters match.
+ reference.asymmetric_distance_matrix_outdated = True
+ reference.recluster()
+
+ self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results)
+ self.assertEqual(reference.sorted_clusters, sorted_clusters)
+ np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05)
+
+ def test_not_trained(self):
+ # should not throw errors and no training should happen
+
+ # 0 passes
+ eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+ passes=0, num_models=num_models, random_state=0)
+ self.assertEqual(len(eLDA.ttda), 0)
+
+ # no corpus
+ eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics,
+ passes=passes, num_models=num_models, random_state=0)
+ self.assertEqual(len(eLDA.ttda), 0)
+
+ # 0 iterations
+ eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+ iterations=0, num_models=num_models, random_state=0)
+ self.assertEqual(len(eLDA.ttda), 0)
+
+ # 0 models
+ eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+ passes=passes, num_models=0, random_state=0)
+ self.assertEqual(len(eLDA.ttda), 0)
+
+ def test_memory_unfriendly(self):
+ # at this point, self.eLDA_mu and self.eLDA are already trained
+        # in the setUp function
+        # both should be 100% similar (but floats cannot
+        # be compared exactly, so check with a tolerance)
+ self.assertEqual(len(self.eLDA_mu.tms), num_models)
+ np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05)
+ np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05)
+ self.check_ttda(self.eLDA_mu)
+
+ def test_generate_gensim_rep(self):
+ gensimModel = self.eLDA.generate_gensim_representation()
+ topics = gensimModel.get_topics()
+ np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05)
+
+ def assert_cluster_results_equal(self, a, b):
+ """compares important attributes of the cluster results"""
+ np.testing.assert_array_equal([row["label"] for row in a],
+ [row["label"] for row in b])
+ np.testing.assert_array_equal([row["is_core"] for row in a],
+ [row["is_core"] for row in b])
+
+ def test_persisting(self):
+ fname = get_tmpfile('gensim_models_ensemblelda')
+ self.eLDA.save(fname)
+ loaded_eLDA = EnsembleLda.load(fname)
+        # storing the ensemble without memory_friendly_ttda
+ self.eLDA_mu.save(fname)
+ loaded_eLDA_mu = EnsembleLda.load(fname)
+
+        # topic_model_class will be lazily loaded and should be None first
+ assert loaded_eLDA.topic_model_class is None
+
+ # was it stored and loaded correctly?
+ # memory friendly.
+ loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation()
+
+ # generating the representation also lazily loads the topic_model_class
+ assert loaded_eLDA.topic_model_class == LdaModel
+
+ topics = loaded_eLDA_representation.get_topics()
+ ttda = loaded_eLDA.ttda
+ amatrix = loaded_eLDA.asymmetric_distance_matrix
+ np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05)
+ np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05)
+ np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05)
+
+ a = self.eLDA.cluster_model.results
+ b = loaded_eLDA.cluster_model.results
+
+ self.assert_cluster_results_equal(a, b)
+
+ # memory unfriendly
+ loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation()
+ topics = loaded_eLDA_mu_representation.get_topics()
+ np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05)
+
+ def test_multiprocessing(self):
+ # same configuration
+ random_state = 0
+
+ # use 3 processes for the ensemble and the distance,
+ # so that the 4 models and 8 topics cannot be distributed
+ # to each worker evenly
+ workers = 3
+
+        # memory friendly; contains a list of topic-word distributions
+ eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel,
+ num_topics=num_topics, passes=passes, num_models=num_models,
+ random_state=random_state, ensemble_workers=workers, distance_workers=workers)
+
+        # memory unfriendly; contains a list of models
+ eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel,
+ num_topics=num_topics, passes=passes, num_models=num_models,
+ random_state=random_state, ensemble_workers=workers, distance_workers=workers,
+ memory_friendly_ttda=False)
+
+ np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05)
+ np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05)
+
+ def test_add_models(self):
+ # same configuration
+ num_models = self.eLDA.num_models
+
+        # make sure counts and sizes after adding are correct
+ # create new models and add other models to them.
+
+        # there are many possible configurations for the first parameter,
+        # try them all
+
+ # quickly train something that can be used for counting results
+ num_new_models = 3
+ num_new_topics = 3
+
+ # 1. memory friendly
+ eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
+ num_topics=num_new_topics, passes=1, num_models=num_new_models,
+ iterations=1, random_state=0, topic_model_class=LdaMulticore,
+ workers=3, ensemble_workers=2)
+
+ # 1.1 ttda
+ a = len(eLDA_base.ttda)
+ b = eLDA_base.num_models
+ eLDA_base.add_model(self.eLDA.ttda)
+ self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda))
+ self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix
+
+ # 1.2 an ensemble
+ a = len(eLDA_base.ttda)
+ b = eLDA_base.num_models
+ eLDA_base.add_model(self.eLDA, 5)
+ self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda))
+ self.assertEqual(eLDA_base.num_models, b + 5)
+
+ # 1.3 a list of ensembles
+ a = len(eLDA_base.ttda)
+ b = eLDA_base.num_models
+ # it should be totally legit to add a memory unfriendly object to a memory friendly one
+ eLDA_base.add_model([self.eLDA, self.eLDA_mu])
+ self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda))
+ self.assertEqual(eLDA_base.num_models, b + 2 * num_models)
+
+ # 1.4 a single gensim model
+ model = self.eLDA.classic_model_representation
+
+ a = len(eLDA_base.ttda)
+ b = eLDA_base.num_models
+ eLDA_base.add_model(model)
+ self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics()))
+ self.assertEqual(eLDA_base.num_models, b + 1)
+
+        # 1.5 a list of gensim models
+ a = len(eLDA_base.ttda)
+ b = eLDA_base.num_models
+ eLDA_base.add_model([model, model])
+ self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics()))
+ self.assertEqual(eLDA_base.num_models, b + 2)
+
+ self.check_ttda(eLDA_base)
+
+ # 2. memory unfriendly
+ eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
+ num_topics=num_new_topics, passes=1, num_models=num_new_models,
+ iterations=1, random_state=0, topic_model_class=LdaMulticore,
+ workers=3, ensemble_workers=2, memory_friendly_ttda=False)
+
+ # 2.1 a single ensemble
+ a = len(eLDA_base_mu.tms)
+ b = eLDA_base_mu.num_models
+ eLDA_base_mu.add_model(self.eLDA_mu)
+ self.assertEqual(len(eLDA_base_mu.tms), a + num_models)
+ self.assertEqual(eLDA_base_mu.num_models, b + num_models)
+
+ # 2.2 a list of ensembles
+ a = len(eLDA_base_mu.tms)
+ b = eLDA_base_mu.num_models
+ eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu])
+ self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models)
+ self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models)
+
+ # 2.3 a single gensim model
+ a = len(eLDA_base_mu.tms)
+ b = eLDA_base_mu.num_models
+ eLDA_base_mu.add_model(self.eLDA_mu.tms[0])
+ self.assertEqual(len(eLDA_base_mu.tms), a + 1)
+ self.assertEqual(eLDA_base_mu.num_models, b + 1)
+
+ # 2.4 a list of gensim models
+ a = len(eLDA_base_mu.tms)
+ b = eLDA_base_mu.num_models
+ eLDA_base_mu.add_model(self.eLDA_mu.tms)
+ self.assertEqual(len(eLDA_base_mu.tms), a + num_models)
+ self.assertEqual(eLDA_base_mu.num_models, b + num_models)
+
+        # 2.5 topic term distributions should raise an error, because the
+        # actual models are needed for the memory unfriendly ensemble
+ a = len(eLDA_base_mu.tms)
+ b = eLDA_base_mu.num_models
+ self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics()))
+ # remains unchanged
+ self.assertEqual(len(eLDA_base_mu.tms), a)
+ self.assertEqual(eLDA_base_mu.num_models, b)
+
+ self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms))
+ self.check_ttda(eLDA_base_mu)
+
+ def test_add_and_recluster(self):
+        # see if the model still makes sense after adding another model
+ num_new_models = 3
+ num_new_topics = 3
+
+ # train
+ new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
+ num_topics=num_new_topics, passes=10, num_models=num_new_models,
+ iterations=30, random_state=1, topic_model_class='ldamulticore',
+ distance_workers=4)
+ new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
+ num_topics=num_new_topics, passes=10, num_models=num_new_models,
+ iterations=30, random_state=1, topic_model_class='ldamulticore',
+ distance_workers=4, memory_friendly_ttda=False)
+ # both should be similar
+ np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05)
+ np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05)
+ # and every next step applied to both should result in similar results
+
+ # 1. adding to ttda and tms
+ new_eLDA.add_model(self.eLDA)
+ new_eLDA_mu.add_model(self.eLDA_mu)
+ np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05)
+ self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics)
+ self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics)
+ self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models)
+ self.check_ttda(new_eLDA)
+ self.check_ttda(new_eLDA_mu)
+
+ # 2. distance matrix
+ new_eLDA._generate_asymmetric_distance_matrix()
+ new_eLDA_mu._generate_asymmetric_distance_matrix()
+ np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix,
+ new_eLDA_mu.asymmetric_distance_matrix)
+
+ # 3. CBDBSCAN results
+ new_eLDA._generate_topic_clusters()
+ new_eLDA_mu._generate_topic_clusters()
+ a = new_eLDA.cluster_model.results
+ b = new_eLDA_mu.cluster_model.results
+ self.assert_cluster_results_equal(a, b)
+
+ # 4. finally, the stable topics
+ new_eLDA._generate_stable_topics()
+ new_eLDA_mu._generate_stable_topics()
+ np.testing.assert_allclose(new_eLDA.get_topics(),
+ new_eLDA_mu.get_topics())
+
+ new_eLDA.generate_gensim_representation()
+ new_eLDA_mu.generate_gensim_representation()
+
+        # same random state, hence topics should still be similar
+ np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05)
+
+ def test_inference(self):
+ # get the most likely token id from topic 0
+ max_id = np.argmax(self.eLDA.get_topics()[0, :])
+ self.assertGreater(self.eLDA.classic_model_representation.iterations, 0)
+        # topic 0 should be dominant in the inference;
+        # the difference between the probabilities should be significant, i.e. larger than 0.3
+        inferred = self.eLDA[[(max_id, 1)]]
+        self.assertGreater(inferred[0][1] - 0.3, inferred[1][1])
+
+
+if __name__ == '__main__':
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
+ unittest.main()